# Find matching fragment pairs

Conditions
- dummy atoms have complementary subpockets
- dummy atoms have identical BRICS environment
- dummy atoms have same bond type

## 1. Load Fragment library

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
from pathlib import Path

import pandas as pd
#import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, Lipinski, PandasTools

from kinfraglib import utils
from kinfraglib import filters



In [3]:
# Needed to display ROMol images in DataFrames
PandasTools.RenderImagesInAllDataFrames(images=True)

In [4]:
# Path to data
HERE = Path(_dh[-1])
PATH_DATA = HERE / '../../data'

In [5]:
fragment_library = utils.read_fragment_library(PATH_DATA / 'fragment_library')

In [6]:
pd.concat(fragment_library).reset_index(drop=True).shape

(7486, 15)

## 2. Apply Filters to fragment library

### 2.1. Prefilters

In [7]:
fragment_library_pre_filtered = filters.prefilters.pre_filters(
    fragment_library)

In [8]:
fragment_library_original = fragment_library.copy()
fragment_library = fragment_library_pre_filtered.copy()

### 2.2. unwanted substructures

In [9]:
pains_dict = filters.pains.get_pains(fragment_library)

In [10]:
fragment_library_filtered = pains_dict["fragment_library"]
pains_df = pains_dict["pains"]

In [11]:
HERE = Path(_dh[-1])
DATA =  HERE / '../../../Brenk'

In [12]:
brenk_dict = filters.unwanted_substructures.get_brenk(fragment_library_filtered, DATA)

Number of unwanted substructures in Brenk et al. collection: 104


In [13]:
fragment_library_filtered =  brenk_dict['fragment_library']
brenk_structs = brenk_dict['brenk']

### 2.3. Drug likeness

In [14]:
druglikeness_dict = filters.ruleofthree.get_ro3_frags(fragment_library_filtered)
fragment_library_filtered = druglikeness_dict["fragment_library"]
ro3_vals = druglikeness_dict["ro3"]

In [15]:
res_qed = filters.qed.get_qed(fragment_library_filtered)
fragment_library_filtered = res_qed["fragment_library"]
qed_vals = res_qed["qed"]

### 2.4. Synthesizability

In [16]:
synthesizability_dict = filters.building_blocks.check_building_blocks(
    fragment_library_filtered,
    str(str(PATH_DATA)+'/filters/DataWarrior/Enamine_Building_Blocks.sdf'))

In [17]:
fragment_library_filtered = synthesizability_dict['fragment_library']

In [18]:
d_sybas = filters.syba.calc_syba(
    fragment_library_filtered,
    cutoff=0)

In [19]:
fragment_library_filtered = d_sybas['fragment_library']
sybascore = d_sybas['sybas']

### 2.5. save only fragments passing all filters

In [20]:
fragment_library_filtered = filters.analysis.number_of_accepted(
    fragment_library_filtered, columns=['bool_pains', 'bool_brenk', 'ro3', 'qed', 'bool_bb', 'bool_syba'],
    min_accepted=6)

In [21]:
complete_fragment_library_filtered = fragment_library_filtered

In [22]:
for subpocket in fragment_library_filtered.keys():
    fragment_library_filtered[subpocket].drop(fragment_library_filtered[subpocket].loc[fragment_library_filtered[subpocket]['bool']==0].index, inplace=True)
    fragment_library_filtered[subpocket] = fragment_library_filtered[subpocket].reset_index(drop=True)

In [23]:
fragment_library_filtered['AP']

Unnamed: 0,subpocket,smiles,ROMol,ROMol_dummy,ROMol_original,kinase,family,group,complex_pdb,ligand_pdb,...,fragment_count,connections,connections_name,bool_pains,bool_brenk,ro3,qed,bool_bb,bool_syba,bool
0,AP,Cc1cc(N)[nH]n1,,,,ABL1,Abl,TK,2f4j,VX6,...,9,[SE],[AP=SE],1,1,1,1,1,1,1
1,AP,c1cnc2ccnn2c1,,,,ACTR2,STKR,TKL,3q4t,TAK,...,10,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1
2,AP,c1cn[nH]c1,,,,AKT2,Akt,AGC,2uw9,GVP,...,13,[FP],[AP=FP],1,1,1,1,1,1,1
3,AP,Nc1ncccc1O,,,,ALK,ALK,TK,2xp2,VGH,...,16,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1
4,AP,Nc1ncccn1,,,,ALK,ALK,TK,4dce,0JF,...,103,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,AP,c1ccc2[nH]nnc2c1,,,,ULK1,ULK,Other,6mnh,JVD,...,1,[FP],[AP=FP],1,1,1,1,1,1,1
124,AP,CNC(=O)c1ccccc1,,,,p38a,MAPK,CMGC,1zyj,BI5,...,3,"[GA, SE]","[AP=GA, AP=SE]",1,1,1,1,1,1,1
125,AP,Nc1nc2ccccc2s1,,,,p38a,MAPK,CMGC,3c5u,P41,...,1,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1
126,AP,Nc1ccnn1-c1ccccc1,,,,p38a,MAPK,CMGC,3ocg,OCG,...,1,[GA],[AP=GA],1,1,1,1,1,1,1


## 3. Get matching fragment pairs and check conditions

using https://github.com/volkamerlab/KinaseFocusedFragmentLibrary/blob/b7e684c26f75efffc2a9ba2383c9027cdd4c29a3/kinase_focused_fragment_library/recombination/cli.py#L205

In [24]:
from kinase_focused_fragment_library.recombination.classes_meta import \
    Combination, Compound, Port, Fragment

from kinase_focused_fragment_library.recombination.brics_rules import is_brics_bond
from rdkit.Chem.PropertyMol import PropertyMol
from functools import reduce
from kinase_focused_fragment_library.recombination.get_tuple import get_tuple

In [25]:
def get_valid_pairs(fragment_library):
    """
        *copied and adapted from kinase_focused_fragment_library*
    """
    data = {}  # (Fragments)
    frag_set = set()  # only used in initialization for avoiding duplicates in fragment data set (smiles & dummy atoms)

    #iterate through subpockets and fragments in subpockets
    #save subpocket_fragmentindex and dummy atoms, bonds etc
    for subpocket in fragment_library.keys():
        fragments = []
        for i, row in fragment_library[subpocket].iterrows():
            #get fragment and connecting subpockets
            fragment = row['ROMol_original']
            fragment = Chem.RemoveHs(fragment)
            frag_id = f'{subpocket}_{i}'
            
            # store unique atom identifiers
            for a, atom in enumerate(fragment.GetAtoms()):
                frag_atom_id = f'{subpocket}_{a}'
                atom.SetProp('frag_atom_id', frag_atom_id)
                
            # get all dummy atoms of this fragment except the ones corresponding to the X pool
            dummy_atoms = [a for a in fragment.GetAtoms() if a.GetSymbol() == '*' and not a.GetProp('subpocket').startswith('X')]
            if not dummy_atoms:
                continue
            
            frag_smiles, dummy_set = get_tuple(fragment, dummy_atoms)
            # check if this exact fragment has already been found
            if (frag_smiles, dummy_set) in frag_set:
                continue
            # if not, add this fragment to set of fragments
            frag_set.add((frag_smiles, dummy_set))

            # create dummy atom objects
            ports = [Port(atom_id=dummy.GetProp('frag_atom_id'), subpocket=subpocket, neighboring_subpocket=dummy.GetProp('subpocket'),
                          bond_type=fragment.GetBondBetweenAtoms(dummy.GetIdx(), dummy.GetNeighbors()[0].GetIdx()).GetBondType(),
                          environment=dummy.GetNeighbors()[0].GetProp('environment'))
                     for dummy in dummy_atoms]   
            # add all dummy atoms of this fragment to the queue
            #compound = Compound(frag_ids=[frag_id], subpockets=[subpocket], ports=ports, bonds=[])
            #combo = Combination(frag_ids=frozenset([frag_id]))
            
            # store fragment in constant data set
            fragment = Fragment(frag_id=frag_id, subpocket=subpocket, ports=ports)
            fragments.append(fragment)
        data[subpocket] = fragments
        
    n_frags = len(frag_set)

    print('Number of fragments: ', n_frags)
    
    return data

In [26]:
def checkvalid(data, fragment_library):
    matches = [] #save matching fragment pairs
    #iterate through subpockets
    for subpocket in fragment_library.keys():
        #iterate through fragments in subpockets
        for fragment in data[subpocket]:
            fragment_id1 = fragment.frag_id #store fragment ID of first fragment in pair
            #go through atom connnections and check neighbors, bond type and environment
            for i in range(0,len(fragment.ports)):
                neighbor = fragment.ports[i].neighboring_subpocket
                bond_type = fragment.ports[i].bond_type
                environment = fragment.ports[i].environment
                match = [] #store current matching fragment pair
                for frag2 in data[neighbor]:
                    fragment_id2 = frag2.frag_id  #store fragment ID of second fragment            
                    for i in range(0,len(frag2.ports)): 
                        #check environment type, subpocket, bond type
                        environment_match = is_brics_bond(environment, frag2.ports[i].environment) #check if BRICS environments are able to form connection
                        #if subpocket is adjacent, bond type is eqal and environments are matching, add as valid matching pair
                        if frag2.ports[i].neighboring_subpocket == subpocket and neighbor ==  frag2.ports[i].subpocket and frag2.ports[i].bond_type == bond_type and environment_match:
                            match.append([fragment_id1, fragment_id2])        
                matches.append(match) #add valid matching pair to list of matching pairs
    return matches  

In [27]:
res = get_valid_pairs(fragment_library_filtered)

Number of fragments:  487


In [28]:
valids = checkvalid(res, fragment_library_filtered)

In [29]:
valids[0]

[['AP_0', 'SE_0'],
 ['AP_0', 'SE_2'],
 ['AP_0', 'SE_3'],
 ['AP_0', 'SE_4'],
 ['AP_0', 'SE_5'],
 ['AP_0', 'SE_6'],
 ['AP_0', 'SE_7'],
 ['AP_0', 'SE_8'],
 ['AP_0', 'SE_9'],
 ['AP_0', 'SE_10'],
 ['AP_0', 'SE_11'],
 ['AP_0', 'SE_12'],
 ['AP_0', 'SE_13'],
 ['AP_0', 'SE_15'],
 ['AP_0', 'SE_16'],
 ['AP_0', 'SE_17'],
 ['AP_0', 'SE_18'],
 ['AP_0', 'SE_19'],
 ['AP_0', 'SE_21'],
 ['AP_0', 'SE_23'],
 ['AP_0', 'SE_24'],
 ['AP_0', 'SE_25'],
 ['AP_0', 'SE_27'],
 ['AP_0', 'SE_28'],
 ['AP_0', 'SE_29'],
 ['AP_0', 'SE_30'],
 ['AP_0', 'SE_31'],
 ['AP_0', 'SE_33'],
 ['AP_0', 'SE_37'],
 ['AP_0', 'SE_38'],
 ['AP_0', 'SE_41'],
 ['AP_0', 'SE_45'],
 ['AP_0', 'SE_46'],
 ['AP_0', 'SE_48'],
 ['AP_0', 'SE_49'],
 ['AP_0', 'SE_50'],
 ['AP_0', 'SE_52'],
 ['AP_0', 'SE_53'],
 ['AP_0', 'SE_54'],
 ['AP_0', 'SE_55'],
 ['AP_0', 'SE_56'],
 ['AP_0', 'SE_57'],
 ['AP_0', 'SE_58'],
 ['AP_0', 'SE_59'],
 ['AP_0', 'SE_64'],
 ['AP_0', 'SE_65'],
 ['AP_0', 'SE_66'],
 ['AP_0', 'SE_67'],
 ['AP_0', 'SE_68'],
 ['AP_0', 'SE_70'],
 ['AP_0',

### combine fragment pairs

In [30]:
def get_bonds(valids, data, fragment_library):
    bonds = [] #store bonds of valid matching pairs as atom IDs where connection is formed
    #go through all valid pairs
    for valid in valids:
        bond = []
        for val in valid:
            #load fragments that should get connected
            subpocket1 = val[0].split("_")[0]
            fragment1_index = int(val[0].split("_")[1])
            fragment1 = fragment_library[subpocket1]["ROMol_original"][fragment1_index] 
            #remove Hs before finding bonds otherwise bond ids not correct because for combining molecules without Hs are used
            fragment1 = Chem.RemoveHs(fragment1)
            
            subpocket2 = val[1].split("_")[0]
            fragment2_index = int(val[1].split("_")[1])
            fragment2 = fragment_library[subpocket2]["ROMol_original"][fragment2_index] 
            #remove Hs before finding bonds
            fragment2 = Chem.RemoveHs(fragment2)
            
            #i = 0
            bond1_id = None
            bond2_id = None
        
            
            data1 = data[subpocket1][fragment1_index] #get corresponding connection to load environment, bond type and neighboring subpocket
            for i in range(0,len(data1.ports)):
                environment1 = data1.ports[i].environment
                bond_type1 = data1.ports[i].bond_type
                neighbor1 = data1.ports[i].neighboring_subpocket
                
                data2 = data[subpocket2][fragment2_index] #for matching fragment also get the connection data
                for j in range(0,len(data2.ports)):
                    environment2 = data2.ports[j].environment
                    bond_type2 = data2.ports[j].bond_type
                    neighbor2 = data2.ports[j].neighboring_subpocket
                    
                    #check again if BRICS bond, bond types and subpockets are matching for a connection
                    if  is_brics_bond(environment1, environment2) and bond_type1 == bond_type2 and subpocket2 == neighbor1 and subpocket1 == neighbor2:
                        #get atom indices where connection is build
                        for atom in fragment1.GetAtoms():
                            atom_symbol = atom.GetSymbol()
                            if atom_symbol == "*":
                                bond1_id = subpocket1 + "_" + str(atom.GetIdx())
                        
                        for atom2 in fragment2.GetAtoms():
                            atom_symbol2 = atom2.GetSymbol()
                            if atom_symbol2 == "*":
                                bond2_id = subpocket2 + "_" + str(atom2.GetIdx())
                        
            bond.append([bond1_id, bond2_id, bond_type1]) #save atom indices and bond type for building the connection
        bonds.append(bond)
    return bonds

In [31]:
bonds = get_bonds(valids, res, fragment_library_filtered)

In [32]:
#bonds for pair
bonds[0][1]

['AP_7', 'SE_10', rdkit.Chem.rdchem.BondType.SINGLE]

In [33]:
def construct_ligand(fragment_ids, bond_ids, fragment_library):
    """
    *copied and adapted from kinase_focused_fragment_library*
    Construct a ligand by connecting multiple fragments based on a Combination object
    Parameters
    ----------
    fragment_ids: list of str
        Fragment IDs of recombined ligand, e.g. `["SE_2", "AP_0", "FP_2"]` (`<subpocket>_<fragment index in subpocket pool>`).
    bond_ids : list of list of str
        Bond IDs of recombined ligand, e.g. `[["FP_6", "AP_10"], ["AP_11", "SE_13"]]`: Atom (`<subpocket>_<atom ID>`) pairs per fragment bond.
    fragment_library : dict of pandas.DataFrame
        SMILES and RDKit molecules for fragments (values) per subpocket (key).
    Returns
    -------
    ligand: rdkit.Chem.rdchem.Mol or None
        Recombined ligand (or None if the ligand could not be constructed)
    """

    fragments = []
    for fragment_id in fragment_ids:

        # Get subpocket and fragment index in subpocket
        subpocket = fragment_id.split("_")[0]
        fragment_index = int(fragment_id.split("_")[1])
        fragment = fragment_library[subpocket].ROMol_original[fragment_index]

        # Store unique atom identifiers in original molecule (important for recombined ligand construction based on atom IDs)
        fragment = Chem.RemoveHs(fragment)
        for i, atom in enumerate(fragment.GetAtoms()):
            fragment_atom_id = f"{subpocket}_{i}"
            atom.SetProp("fragment_atom_id", fragment_atom_id)
            atom.SetProp("fragment_id", fragment.GetProp("complex_pdb"))
        fragment = PropertyMol(fragment)

        # Append fragment to list of fragments
        fragments.append(fragment)

    # Combine fragments using map-reduce model
    combo = reduce(Chem.CombineMols, fragments)

    bonds_matching = True
    ed_combo = Chem.EditableMol(combo)
    replaced_dummies = []

    #atoms = combo.GetAtoms()
    
    
    #for bond in bond_ids:

    dummy_1 = next(
            atom for atom in combo.GetAtoms() if atom.GetProp("fragment_atom_id") == bond_ids[0]
    )
    dummy_2 = next(
            atom for atom in combo.GetAtoms() if atom.GetProp("fragment_atom_id") == bond_ids[1]
    )
    atom_1 = dummy_1.GetNeighbors()[0]
    atom_2 = dummy_2.GetNeighbors()[0]

    # check bond types
    bond_type_1 = combo.GetBondBetweenAtoms(dummy_1.GetIdx(), atom_1.GetIdx()).GetBondType()
    bond_type_2 = combo.GetBondBetweenAtoms(dummy_2.GetIdx(), atom_2.GetIdx()).GetBondType()
    if bond_type_1 != bond_type_2:
        bonds_matching = False
        print("Bonds not matching")

    ed_combo.AddBond(atom_1.GetIdx(), atom_2.GetIdx(), order=bond_type_1)

    replaced_dummies.extend([dummy_1.GetIdx(), dummy_2.GetIdx()])

    # Do not construct this ligand if bond types are not matching
    if not bonds_matching:
        return

    # Remove replaced dummy atoms
    replaced_dummies.sort(reverse=True)
    for dummy in replaced_dummies:
        ed_combo.RemoveAtom(dummy)

    ligand = ed_combo.GetMol()

    # Replace remaining dummy atoms with hydrogens
    du = Chem.MolFromSmiles("*")
    h = Chem.MolFromSmiles("[H]", sanitize=False)
    ligand = AllChem.ReplaceSubstructs(ligand, du, h, replaceAll=True)[0]
    try:
        ligand = Chem.RemoveHs(ligand)
    except ValueError:
        print(Chem.MolToSmiles(ligand))
        return

    # Clear properties
    for prop in ligand.GetPropNames():
        ligand.ClearProp(prop)
    for atom in ligand.GetAtoms():
        atom.ClearProp("fragment_atom_id")

    # Generate 2D coordinates
    AllChem.Compute2DCoords(ligand)

    return ligand

In [34]:
def get_pairs(valids, bonds, fragment_library_filtered):
    pairs = []
    frags1 = []
    frags2 = []
    ids = []
    for i in range(0, len(valids)):
        for j in range(0, len(valids[i])):
            frag1 = fragment_library_filtered[valids[i][j][0].split("_")[0]]['ROMol_dummy'][int(valids[i][j][0].split("_")[1])]
            frag2 = fragment_library_filtered[valids[i][j][1].split("_")[0]]['ROMol_dummy'][int(valids[i][j][1].split("_")[1])]
            
            frags1.append(frag1)
            frags2.append(frag2)
            
            pair = construct_ligand(valids[i][j], bonds[i][j], fragment_library_filtered)
            pairs.append(pair)
            ids.append(valids[i][j])
            
    return pd.DataFrame({'fragment ids': ids, 'fragment1':frags1, 'fragment2':frags2, 'pair':pairs})

In [35]:
pair_df = get_pairs(valids, bonds, fragment_library_filtered)

In [36]:
pair_df

Unnamed: 0,fragment ids,fragment1,fragment2,pair
0,"[AP_0, SE_0]",,,
1,"[AP_0, SE_2]",,,
2,"[AP_0, SE_3]",,,
3,"[AP_0, SE_4]",,,
4,"[AP_0, SE_5]",,,
...,...,...,...,...
31913,"[B2_15, GA_5]",,,
31914,"[B2_15, GA_6]",,,
31915,"[B2_15, GA_52]",,,
31916,"[B2_15, GA_56]",,,


## ASKCOS retrosynthesis

In [37]:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
#ToDo: make request parallel?
from time import time

def get_retro(pairs, fragment_library):
    children1 = []
    children2 = []
    plausibilities = []
    for i, row in pairs.iterrows():
        start = time()
        # get fragments from fragment library without dummy atoms (otherwise comparison not possible)
        subpocket1 = pairs['fragment ids'][i][0].split('_')[0]
        subpocket2 = pairs['fragment ids'][i][1].split('_')[0]
        id1 = int(pairs['fragment ids'][i][0].split('_')[1])
        id2 = int(pairs['fragment ids'][i][1].split('_')[1])
        cur_frag1 = fragment_library[subpocket1]['ROMol'][id1]
        cur_frag2 = fragment_library[subpocket2]['ROMol'][id2]
        
        pair_smiles = Chem.MolToSmiles(row['pair'])
        retro = call_retro(pair_smiles)
        
        best_child1 = []
        best_child2 = []
        best_plausibility = []
        if(len(retro['trees'])) > 0:
            #best_child1 = []
            #best_child2 = []
            #best_plausibility = []
            start2 = time ()
            for l in range(0, len(retro['trees'])):
                if len(retro['trees'][l]['children'][0]['children']) == 2:
                    plausibility = retro['trees'][0]['children'][0]['plausibility']
                    child1 = retro['trees'][l]['children'][0]['children'][0]['smiles']
                    price1 = retro['trees'][l]['children'][0]['children'][0]['ppg']
                    child2 = retro['trees'][l]['children'][0]['children'][1]['smiles']
                    price2 = retro['trees'][l]['children'][0]['children'][1]['ppg']
                    child1_mol = Chem.MolFromSmiles(child1)
                    child2_mol = Chem.MolFromSmiles(child2)
                    if child1_mol.HasSubstructMatch(cur_frag1) and child2_mol.HasSubstructMatch(cur_frag2):
                        if best_plausibility == []:
                            best_child1 = child1_mol 
                            best_child2 = child2_mol
                            best_plausibility = plausibility
                        elif best_plausibility < plausibility:
                            best_child1 = child1_mol 
                            best_child2 = child2_mol
                            best_plausibility = plausibility
                    elif child2_mol.HasSubstructMatch(cur_frag1) and child1_mol.HasSubstructMatch(cur_frag2):
                        if best_plausibility == []:
                            best_child1 = child2_mol 
                            best_child2 = child1_mol
                            best_plausibility = plausibility
                        elif best_plausibility < plausibility:
                            best_child1 = child2_mol 
                            best_child2 = child1_mol
                            best_plausibility = plausibility
        if best_plausibility != []:
            children1.append(best_child1)
            children2.append(best_child2)
            plausibilities.append(best_plausibility)            
        else:
            children1.append(None)
            children2.append(None)
            plausibilities.append(0)
        #ToDo: chek if where dummy atom is some functional group????
    df = pairs.copy()
    df.loc[:, 'children_1']=children1
    df.loc[:, 'children_2']=children2
    df.loc[:, 'pausibility']=plausibilities
    
    return df
        
import requests
from pprint import pprint       
def call_retro(pair_smiles):

    HOST = 'https://askcos.mit.edu/'
    params = {
        'smiles': pair_smiles, # required

        # optional with defaults shown
        'max_depth': 1, # maximum number of reaction steps
        'max_branching': 25, # ?max number of branches are looked at to find "best"?
        'expansion_time': 20, # how long the expansion can run
        'max_ppg': 100, # maximum price per gram
        'template_count': 100, 
        'max_cum_prob': 0.995, # which common probability reached until no more templates are used
        'chemical_property_logic': 'none', # molecules are buyable or not, can be 'none' (only price relevant), 'and' (price and heavy atoms constraint) or 'or' (one of both constraints is relevant) 
        'max_chemprop_c': 0, # max heavy atom contraints if 'and' or 'or' is used in 'chemical_property_logic'
        'max_chemprop_n': 0,
        'max_chemprop_o': 0,
        'max_chemprop_h': 0,
        'chemical_popularity_logic': 'none', # want to use popular chemicals as reasonable stopping points?
        'min_chempop_reactants': 5, # min frequence as popular reactant
        'min_chempop_products': 5, #min frequence as popular prouct
        'filter_threshold': 0.75, 

        'return_first': 'true' # default is false
    }
    resp = requests.get(HOST+'/api/treebuilder/', params=params, verify=False)
    return resp.json()

In [38]:
#pairs_test = pair_df[0:1000]

In [39]:
# from time import time
# start = time()
# retro_df = get_retro(pairs_test, fragment_library_filtered)
# print(time() - start)

In [40]:
#1000 need 3,5h

In [41]:
# retro_df

In [42]:
# accepted_retro = []
# for i, row in retro_df.iterrows():
#     if row.pausibility==0:
#         accepted_retro.append(0)
#     else:
#         accepted_retro.append(1)
        
# retro_df['bool_retro'] = accepted_retro
# retro_accepted = retro_df[retro_df['bool_retro']==1]
# retro_rejected = retro_df[retro_df['bool_retro']==0]

In [43]:
# retro_accepted

In [44]:
# retro_rejected

In [45]:
pairs_10 = pair_df[0:10]

In [46]:
%%time
retro_df = get_retro(pairs_10, fragment_library_filtered)

CPU times: user 62.5 ms, sys: 31.2 ms, total: 93.8 ms
Wall time: 1min 52s


In [47]:
retro_df

Unnamed: 0,fragment ids,fragment1,fragment2,pair,children_1,children_2,pausibility
0,"[AP_0, SE_0]",,,,,,0.786562
1,"[AP_0, SE_2]",,,,,,0.911037
2,"[AP_0, SE_3]",,,,,,0.999894
3,"[AP_0, SE_4]",,,,,,0.999138
4,"[AP_0, SE_5]",,,,,,0.996351
5,"[AP_0, SE_6]",,,,,,0.0
6,"[AP_0, SE_7]",,,,,,0.9788
7,"[AP_0, SE_8]",,,,,,0.0
8,"[AP_0, SE_9]",,,,,,0.999171
9,"[AP_0, SE_10]",,,,,,0.0


ToDo: count for every fragment how often it is included in retrosynthetic step

In [48]:
import copy #needed if you do not want to change original fragment_library dict

def retro_fragments(retro_df, fragment_library):
    fraglib = copy.deepcopy(fragment_library)
    #get list of fragment ids
    all_frags = []
    frag_ids = []
    for i, row in retro_df.iterrows():
        if row['pausibility'] != 0:
            frag_ids.append(retro_df['fragment ids'][i][0])
            frag_ids.append(retro_df['fragment ids'][i][1])
    all_frags = pd.DataFrame(frag_ids, columns = ['ids'])
    #count number of frags
    counts = all_frags.groupby('ids').size()
    #return counts
    #go through all subpockets and fragments and add number of contributions to retrosynth. pathways
    for subpocket in fraglib.keys():
        count_frags = []
        for i in range(0, len(fraglib[subpocket])):
            if hasattr(counts, str(subpocket + "_" + str(i))):
                attribute = str(subpocket + "_" + str(i))
                num_counts = getattr(counts,attribute)
                count_frags.append(num_counts)
                              
            else:
                count_frags.append(0)
        fraglib[subpocket]['retro_count'] = count_frags
                       
    return counts, fraglib

In [49]:
frags, fraglib = retro_fragments(retro_df, fragment_library_filtered)

In [50]:
fraglib['AP']

Unnamed: 0,subpocket,smiles,ROMol,ROMol_dummy,ROMol_original,kinase,family,group,complex_pdb,ligand_pdb,...,connections,connections_name,bool_pains,bool_brenk,ro3,qed,bool_bb,bool_syba,bool,retro_count
0,AP,Cc1cc(N)[nH]n1,,,,ABL1,Abl,TK,2f4j,VX6,...,[SE],[AP=SE],1,1,1,1,1,1,1,7
1,AP,c1cnc2ccnn2c1,,,,ACTR2,STKR,TKL,3q4t,TAK,...,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1,0
2,AP,c1cn[nH]c1,,,,AKT2,Akt,AGC,2uw9,GVP,...,[FP],[AP=FP],1,1,1,1,1,1,1,0
3,AP,Nc1ncccc1O,,,,ALK,ALK,TK,2xp2,VGH,...,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1,0
4,AP,Nc1ncccn1,,,,ALK,ALK,TK,4dce,0JF,...,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,AP,c1ccc2[nH]nnc2c1,,,,ULK1,ULK,Other,6mnh,JVD,...,[FP],[AP=FP],1,1,1,1,1,1,1,0
124,AP,CNC(=O)c1ccccc1,,,,p38a,MAPK,CMGC,1zyj,BI5,...,"[GA, SE]","[AP=GA, AP=SE]",1,1,1,1,1,1,1,0
125,AP,Nc1nc2ccccc2s1,,,,p38a,MAPK,CMGC,3c5u,P41,...,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1,0
126,AP,Nc1ccnn1-c1ccccc1,,,,p38a,MAPK,CMGC,3ocg,OCG,...,[GA],[AP=GA],1,1,1,1,1,1,1,0


In [51]:
frags

ids
AP_0    7
SE_0    1
SE_2    1
SE_3    1
SE_4    1
SE_5    1
SE_7    1
SE_9    1
dtype: int64

In [52]:
getattr(frags, 'AP_0')

7

## Parallel api request?

In [53]:
pairs_10 = pair_df[0:100]

In [54]:
import multiprocessing as mp
num_cpu = mp.cpu_count()
num_cpu

8

In [55]:
#create bins depending on number of cores
import numpy as np
df_split = np.array_split(pairs_10, num_cpu)

#### For Parallel try
1. write extra function for api request (done: call_retro(pair_smiles))
2. call api request parallel (done with call_retro_parallel)

In [56]:
#create list of smiles because parallel cannot take dfs with molecules
pairs_10_smiles = []
for mol in pairs_10['pair']:
    pairs_10_smiles.append(Chem.MolToSmiles(mol))

In [57]:
import multiprocessing as mp
num_cpu = mp.cpu_count()
num_cpu

8

In [58]:
#create bins depending on number of cores
import numpy as np
df_split = np.array_split(pairs_10_smiles, num_cpu)

In [59]:
def call_retro_parallel(pair_smiles):
    pairs = []
    children1 = []
    children2 = []
    plausibilities = []
    for smile in pair_smiles:
        pairs.append(smile)
        cur_children1 = []
        cur_children2 = []
        cur_plausibilities = []
        HOST = 'https://askcos.mit.edu/'
        params = {
            'smiles': smile, # required

            # optional with defaults shown
            'max_depth': 1, # maximum number of reaction steps
            'max_branching': 25, # ?max number of branches are looked at to find "best"?
            'expansion_time': 20, # how long the expansion can run
            'max_ppg': 100, # maximum price per gram
            'template_count': 100, 
            'max_cum_prob': 0.995, # which common probability reached until no more templates are used
            'chemical_property_logic': 'none', # molecules are buyable or not, can be 'none' (only price relevant), 'and' (price and heavy atoms constraint) or 'or' (one of both constraints is relevant) 
            'max_chemprop_c': 0, # max heavy atom contraints if 'and' or 'or' is used in 'chemical_property_logic'
            'max_chemprop_n': 0,
            'max_chemprop_o': 0,
            'max_chemprop_h': 0,
            'chemical_popularity_logic': 'none', # want to use popular chemicals as reasonable stopping points?
            'min_chempop_reactants': 5, # min frequence as popular reactant
            'min_chempop_products': 5, #min frequence as popular prouct
            'filter_threshold': 0.75, 

            'return_first': 'true' # default is false
        }
        resp = requests.get(HOST+'/api/treebuilder/', params=params, verify=False)
        #res.append(resp.json())
        retro = resp.json()

        if(len(retro['trees'])) > 0:
            for l in range(0, len(retro['trees'])):
                if len(retro['trees'][l]['children'][0]['children']) == 2:
                    plausibility = retro['trees'][0]['children'][0]['plausibility']
                    child1 = retro['trees'][l]['children'][0]['children'][0]['smiles']
                    child2 = retro['trees'][l]['children'][0]['children'][1]['smiles']
                    cur_children1.append(child1)
                    cur_children2.append(child2)
                    cur_plausibilities.append(plausibility)
                    
        else:
            cur_children1.append(None)
            cur_children2.append(None)
            cur_plausibilities.append(0)
        children1.append(cur_children1)
        children2.append(cur_children2)
        plausibilities.append(cur_plausibilities)
    res = pd.DataFrame(list(zip(pairs, children1, children2, plausibilities)),
               columns =['pair', 'child 1', 'child 2', 'plausibility'])
    return res

In [60]:
%%time
from joblib import Parallel, delayed
para_res = Parallel(n_jobs=num_cpu)(delayed(call_retro_parallel)(split) for split in df_split)

CPU times: user 46.9 ms, sys: 1.05 s, total: 1.09 s
Wall time: 6min 30s


In [61]:
para_result = pd.concat(para_res)
para_result

Unnamed: 0,pair,child 1,child 2,plausibility
0,Cc1cc(Nc2ccc(C#N)cc2)[nH]n1,[Cc1cc(N)[nH]n1],[N#Cc1ccc(F)cc1],[0.786561847]
1,CCOc1ccc(Nc2cc(C)n[nH]2)cc1,"[CCOc1ccc(B(O)O)cc1, CCOc1ccc(Br)cc1]","[Cc1cc(N)[nH]n1, Cc1cc(N)[nH]n1]","[0.999550641, 0.999550641]"
2,Cc1cc(Nc2c[nH]nc2C)[nH]n1,[Cc1cc(N)[nH]n1],[Cc1n[nH]cc1Br],[0.999893665]
3,COc1ncccc1Nc1cc(C)n[nH]1,[COc1ncccc1Br],[Cc1cc(N)[nH]n1],[0.99913764]
4,Cc1cc(Nc2cccc(S(C)(=O)=O)c2)[nH]n1,[CS(=O)(=O)c1cccc(Br)c1],[Cc1cc(N)[nH]n1],[0.985232]
...,...,...,...,...
7,O=C(Nc1cnn2cccnc12)C1CC1,[None],[None],[0]
8,Clc1ccc(-c2cnn3cccnc23)cc1,[Brc1cnn2cccnc12],[OB(O)c1ccc(Cl)cc1],[0.999995828]
9,OCCOc1ccc(-c2cnn3cccnc23)cc1,[Brc1cnn2cccnc12],[OCCOc1ccc(Br)cc1],[0.999502897]
10,c1cc(-c2cnn3cccnc23)cc(C2CCNCC2)c1,[None],[None],[0]


In [62]:
children_list = []
for i, row in para_result.iterrows():
    for l in range(len(row['child 1'])):
        children_list.append(row['child 1'][l])
        children_list.append(row['child 2'][l])
children_list = set(children_list)

In [63]:
children_list

{'Brc1ccc2c(c1)CCO2',
 'Brc1ccc2cn[nH]c2c1',
 'Brc1cn[nH]c1',
 'Brc1cnc2ccccc2c1',
 'Brc1cncnc1',
 'Brc1cnn2cccnc12',
 'CC(C)(C)c1ccc(B(O)O)cc1',
 'CC(C)(C)c1ccc(N)cc1',
 'CCOc1ccc(B(O)O)cc1',
 'CCOc1ccc(B2OC(C)(C)C(C)(C)O2)cc1',
 'CCOc1ccc(Br)cc1',
 'CCc1ccc(B(O)O)cc1',
 'CCc1ccc(Br)cc1',
 'CN(C)c1ccc(B(O)O)cc1',
 'COc1cc(B(O)O)cc(OC)c1OC',
 'COc1cc(Br)c(OC)cc1Br',
 'COc1cc(Br)cc(OC)c1',
 'COc1cc(Br)cc(OC)c1OC',
 'COc1cc(Br)ccc1Cl',
 'COc1cc(I)ccc1O',
 'COc1cc(N)cc(OC)c1',
 'COc1cc(N)cc(OC)c1OC',
 'COc1cc(OC)cc(B(O)O)c1',
 'COc1ccc(B(O)O)cc1',
 'COc1ccc(B(O)O)cc1OC',
 'COc1ccc(Cl)cc1B(O)O',
 'COc1ccc(F)cc1',
 'COc1ncccc1B(O)O',
 'COc1ncccc1Br',
 'CS(=O)(=O)c1ccc(Cl)c(Br)c1',
 'CS(=O)(=O)c1cccc(B(O)O)c1',
 'CS(=O)(=O)c1cccc(Br)c1',
 'CSc1cccc(B(O)O)c1',
 'Cc1cc(C)cc(B(O)O)c1',
 'Cc1cc(C)cc(F)c1',
 'Cc1cc(Cl)[nH]n1',
 'Cc1cc(Cl)nc(N)n1',
 'Cc1cc(N)[nH]n1',
 'Cc1cc(N)sn1',
 'Cc1n[nH]cc1Br',
 'Cc1nccc2ccccc12',
 'Cc1nn(C)cc1B1OC(C)(C)C(C)(C)O1',
 'Cc1nn(C)cc1Br',
 'Clc1cc2ccccc2cn1',
 'Cl

In [64]:
children_mols = []
children_smiles = []
for smile in children_list:
    if smile != None:
        children_mols.append(Chem.MolFromSmiles(smile))
        children_smiles.append(smile)
    else:
        children_mols.append(None)
        children_smiles.append(None)
children_df = pd.DataFrame(list(zip(children_mols, children_smiles)), columns = ('child_mol', 'child_smiles'))

Next step: compare if fragment is substructure of children  

Note: when calling parallel can only give lists with smiles --> need to calc mols in every parallel process

In [65]:
pairs_frags_10_smiles = []
frag1_10 = []
frag2_10 = []
pair = []
for fragids in pairs_10['fragment ids']:
    frag1_10.append(Chem.MolToSmiles(fragment_library_filtered[fragids[0].split("_")[0]]['ROMol'][int(fragids[0].split("_")[1])]))
    frag2_10.append(Chem.MolToSmiles(fragment_library_filtered[fragids[1].split("_")[0]]['ROMol'][int(fragids[1].split("_")[1])]))
for pairmol in pairs_10['pair']:
    pair.append(Chem.MolToSmiles(pairmol))
pairs_frags_10_smiles = pd.DataFrame(list(zip(pairs_10['fragment ids'], frag1_10, frag2_10, pair)), columns=('fragment ids', 'fragment 1', 'fragment 2', 'pair'))
#pairs_frags_10_smiles.set_index('pair', inplace=True)
pairs_frags_10_smiles

Unnamed: 0,fragment ids,fragment 1,fragment 2,pair
0,"[AP_0, SE_0]",Cc1cc(N)[nH]n1,N#Cc1ccccc1,Cc1cc(Nc2ccc(C#N)cc2)[nH]n1
1,"[AP_0, SE_2]",Cc1cc(N)[nH]n1,CCOc1ccccc1,CCOc1ccc(Nc2cc(C)n[nH]2)cc1
2,"[AP_0, SE_3]",Cc1cc(N)[nH]n1,Cc1cc[nH]n1,Cc1cc(Nc2c[nH]nc2C)[nH]n1
3,"[AP_0, SE_4]",Cc1cc(N)[nH]n1,COc1ccccn1,COc1ncccc1Nc1cc(C)n[nH]1
4,"[AP_0, SE_5]",Cc1cc(N)[nH]n1,CS(=O)(=O)c1ccccc1,Cc1cc(Nc2cccc(S(C)(=O)=O)c2)[nH]n1
...,...,...,...,...
95,"[AP_1, SE_20]",c1cnc2ccnn2c1,NC(=O)C1CC1,O=C(Nc1cnn2cccnc12)C1CC1
96,"[AP_1, SE_21]",c1cnc2ccnn2c1,Clc1ccccc1,Clc1ccc(-c2cnn3cccnc23)cc1
97,"[AP_1, SE_24]",c1cnc2ccnn2c1,OCCOc1ccccc1,OCCOc1ccc(-c2cnn3cccnc23)cc1
98,"[AP_1, SE_25]",c1cnc2ccnn2c1,c1ccc(C2CCNCC2)cc1,c1cc(-c2cnn3cccnc23)cc(C2CCNCC2)c1


In [66]:
def get_mol(smiles_list):
    mols = []
    smiles = []
    for smile in smiles_list:
        if smile != None:
            mols.append(Chem.MolFromSmiles(smile))
            smiles.append(smile)
        else:
            mols.append(None)
            smiles.append(None)
    df = pd.DataFrame(list(zip(mols, smiles)), columns = ('mol', 'smiles'))
    return df

## ToDo: change comparison, go through fragment pair list and get corresponding retro pair etc....

In [67]:
def compare_mols2(para_result, pairs_frags_10_smiles):
    para_res = para_result.copy(deep = True)
    para_res.set_index('pair', inplace=True)
    smiles_list = list(pairs_frags_10_smiles['fragment 1'])
    smiles_list.extend(list(pairs_frags_10_smiles['fragment 2']))
    children_list = []
    for i, row in para_res.iterrows():
        for l in range(len(row['child 1'])):
            children_list.append(row['child 1'][l])
            children_list.append(row['child 2'][l])
    smiles_list.extend(list(children_list))
    smiles_list = set(smiles_list) #unique list of all smiles (fragments and children)
    mols = get_mol(smiles_list)
    mols.set_index('smiles', inplace=True) #get mol from specific smiles mols.loc['Cc1cc(N)[nH]n1']['mol']
    #dataframe for result which frags are matching
    column_names = ['fragment ids', 'fragment 1', 'fragment 2', 'pair', 'child 1', 'child 2', 'plausibility']
    result_df = pd.DataFrame(columns = column_names)
    
    for i, row in pairs_frags_10_smiles.iterrows():
        cur_pair_smiles = row['pair']
        cur_frag1_smiles = row['fragment 1']
        frag1_mol = mols.loc[cur_frag1_smiles]['mol'] 
        cur_frag2_smiles = row['fragment 2']
        frag2_mol = mols.loc[cur_frag2_smiles]['mol'] 
        frag_ids = row['fragment ids']
        cur_children1_smiles = para_res.loc[cur_pair_smiles]['child 1']
        cur_children2_smiles = para_res.loc[cur_pair_smiles]['child 2']
        cur_probs = para_res.loc[cur_pair_smiles]['plausibility']
        #go through children lists and compare
        for l in range(len(cur_children1_smiles)):
            child1_smiles = cur_children1_smiles[l]
            child2_smiles = cur_children2_smiles[l]
            child1_mol = mols.loc[child1_smiles]['mol']
            child2_mol = mols.loc[child2_smiles]['mol']
            if not child1_mol is None and not child2_mol is None:
                if child1_mol.HasSubstructMatch(frag1_mol) and child2_mol.HasSubstructMatch(frag2_mol):
                    result_df = result_df.append({'fragment ids': frag_ids, 'fragment 1': cur_frag1_smiles, 'fragment 2': cur_frag2_smiles, 'pair': cur_pair_smiles, 'child 1': child1_smiles, 'child 2': child2_smiles, 'plausibility': cur_probs[l]}, ignore_index=True)
                elif child1_mol.HasSubstructMatch(frag2_mol) and child2_mol.HasSubstructMatch(frag1_mol):
                    result_df = result_df.append({'fragment ids': frag_ids, 'fragment 1': cur_frag1_smiles, 'fragment 2': cur_frag2_smiles, 'pair': cur_pair_smiles, 'child 1': child2_smiles, 'child 2': child1_smiles, 'plausibility': cur_probs[l]}, ignore_index=True)
            else:
                result_df = result_df.append({'fragment ids': frag_ids, 'fragment 1': cur_frag1_smiles, 'fragment 2': cur_frag2_smiles, 'pair': cur_pair_smiles, 'child 1': None, 'child 2': None, 'plausibility': 0}, ignore_index=True)
                
    return result_df  

In [68]:
%%time
res_df = compare_mols2(para_result, pairs_frags_10_smiles)

CPU times: user 203 ms, sys: 0 ns, total: 203 ms
Wall time: 194 ms


In [69]:
df_split_2 = np.array_split(pairs_frags_10_smiles, num_cpu)

In [74]:
df_split_2

[     fragment ids      fragment 1          fragment 2  \
 0    [AP_0, SE_0]  Cc1cc(N)[nH]n1         N#Cc1ccccc1   
 1    [AP_0, SE_2]  Cc1cc(N)[nH]n1         CCOc1ccccc1   
 2    [AP_0, SE_3]  Cc1cc(N)[nH]n1         Cc1cc[nH]n1   
 3    [AP_0, SE_4]  Cc1cc(N)[nH]n1          COc1ccccn1   
 4    [AP_0, SE_5]  Cc1cc(N)[nH]n1  CS(=O)(=O)c1ccccc1   
 5    [AP_0, SE_6]  Cc1cc(N)[nH]n1            Cn1ccnn1   
 6    [AP_0, SE_7]  Cc1cc(N)[nH]n1    COc1cccc(OC)c1OC   
 7    [AP_0, SE_8]  Cc1cc(N)[nH]n1            Cn1cccn1   
 8    [AP_0, SE_9]  Cc1cc(N)[nH]n1         Cc1ccn(C)n1   
 9   [AP_0, SE_10]  Cc1cc(N)[nH]n1       c1cnn2ccnc2c1   
 10  [AP_0, SE_11]  Cc1cc(N)[nH]n1           Oc1ccccc1   
 11  [AP_0, SE_12]  Cc1cc(N)[nH]n1          COc1ccccc1   
 12  [AP_0, SE_13]  Cc1cc(N)[nH]n1    c1ccc2[nH]cnc2c1   
 
                                   pair  
 0          Cc1cc(Nc2ccc(C#N)cc2)[nH]n1  
 1          CCOc1ccc(Nc2cc(C)n[nH]2)cc1  
 2            Cc1cc(Nc2c[nH]nc2C)[nH]n1  
 3             COc

In [70]:
%%time
from joblib import Parallel, delayed
para_res2 = Parallel(n_jobs=num_cpu)(delayed(compare_mols2)(para_result, split) for split in df_split_2)

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 164 ms


In [69]:
res_df

Unnamed: 0,fragment ids,fragment 1,fragment 2,pair,child 1,child 2,plausibility
0,"[AP_0, SE_0]",Cc1cc(N)[nH]n1,N#Cc1ccccc1,Cc1cc(Nc2ccc(C#N)cc2)[nH]n1,Cc1cc(N)[nH]n1,N#Cc1ccc(Br)cc1,0.936692
1,"[AP_0, SE_2]",Cc1cc(N)[nH]n1,CCOc1ccccc1,CCOc1ccc(Nc2cc(C)n[nH]2)cc1,Cc1cc(N)[nH]n1,CCOc1ccc(Br)cc1,0.911037
2,"[AP_0, SE_3]",Cc1cc(N)[nH]n1,Cc1cc[nH]n1,Cc1cc(Nc2c[nH]nc2C)[nH]n1,,,0.000000
3,"[AP_0, SE_4]",Cc1cc(N)[nH]n1,COc1ccccn1,COc1ncccc1Nc1cc(C)n[nH]1,Cc1cc(N)[nH]n1,COc1ncccc1Br,0.999138
4,"[AP_0, SE_5]",Cc1cc(N)[nH]n1,CS(=O)(=O)c1ccccc1,Cc1cc(Nc2cccc(S(C)(=O)=O)c2)[nH]n1,Cc1cc(N)[nH]n1,CS(=O)(=O)c1cccc(B(O)O)c1,0.996351
...,...,...,...,...,...,...,...
97,"[AP_1, SE_20]",c1cnc2ccnn2c1,NC(=O)C1CC1,O=C(Nc1cnn2cccnc12)C1CC1,,,0.000000
98,"[AP_1, SE_21]",c1cnc2ccnn2c1,Clc1ccccc1,Clc1ccc(-c2cnn3cccnc23)cc1,Brc1cnn2cccnc12,OB(O)c1ccc(Cl)cc1,0.999996
99,"[AP_1, SE_24]",c1cnc2ccnn2c1,OCCOc1ccccc1,OCCOc1ccc(-c2cnn3cccnc23)cc1,Brc1cnn2cccnc12,OCCOc1ccc(Br)cc1,0.999503
100,"[AP_1, SE_25]",c1cnc2ccnn2c1,c1ccc(C2CCNCC2)cc1,c1cc(-c2cnn3cccnc23)cc(C2CCNCC2)c1,,,0.000000


### function to count number of contributions to retro pathway for each fragment

In [70]:
import copy #needed if you do not want to change original fragment_library dict

def retro_fragments(retro_df, fragment_library):
    fraglib = copy.deepcopy(fragment_library)
    #get list of fragment ids
    all_frags = []
    frag_ids = []
    for i, row in retro_df.iterrows():
        if row['plausibility'] != 0:
            frag_ids.append(retro_df['fragment ids'][i][0])
            frag_ids.append(retro_df['fragment ids'][i][1])
    all_frags = pd.DataFrame(frag_ids, columns = ['ids'])
    #count number of frags
    counts = all_frags.groupby('ids').size()
    #return counts
    #go through all subpockets and fragments and add number of contributions to retrosynth. pathways
    for subpocket in fraglib.keys():
        count_frags = []
        for i in range(0, len(fraglib[subpocket])):
            if hasattr(counts, str(subpocket + "_" + str(i))):
                attribute = str(subpocket + "_" + str(i))
                num_counts = getattr(counts,attribute)
                count_frags.append(num_counts)
                              
            else:
                count_frags.append(0)
        fraglib[subpocket]['retro_count'] = count_frags
                       
    return counts, fraglib

In [71]:
countfrag, fraglib_filtered = retro_fragments(res_df, fragment_library_filtered)

In [72]:
countfrag

ids
AP_0      47
AP_1      20
SE_0       2
SE_10      1
SE_101     1
SE_106     1
SE_11      1
SE_12      2
SE_14      1
SE_15      2
SE_17      1
SE_18      1
SE_19      2
SE_2       2
SE_21      3
SE_24      2
SE_26      1
SE_30      1
SE_38      1
SE_4       2
SE_41      1
SE_46      1
SE_5       3
SE_50      1
SE_52      1
SE_54      1
SE_57      2
SE_59      1
SE_6       1
SE_65      1
SE_66      1
SE_7       2
SE_70      1
SE_72      1
SE_73      2
SE_74      3
SE_77      1
SE_78      1
SE_79      1
SE_8       2
SE_84      1
SE_88      1
SE_89      2
SE_9       2
SE_90      2
SE_97      3
SE_98      1
dtype: int64

In [73]:
fraglib_filtered['AP']

Unnamed: 0,subpocket,smiles,ROMol,ROMol_dummy,ROMol_original,kinase,family,group,complex_pdb,ligand_pdb,...,connections,connections_name,bool_pains,bool_brenk,ro3,qed,bool_bb,bool_syba,bool,retro_count
0,AP,Cc1cc(N)[nH]n1,,,,ABL1,Abl,TK,2f4j,VX6,...,[SE],[AP=SE],1,1,1,1,1,1,1,47
1,AP,c1cnc2ccnn2c1,,,,ACTR2,STKR,TKL,3q4t,TAK,...,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1,20
2,AP,c1cn[nH]c1,,,,AKT2,Akt,AGC,2uw9,GVP,...,[FP],[AP=FP],1,1,1,1,1,1,1,0
3,AP,Nc1ncccc1O,,,,ALK,ALK,TK,2xp2,VGH,...,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1,0
4,AP,Nc1ncccn1,,,,ALK,ALK,TK,4dce,0JF,...,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,AP,c1ccc2[nH]nnc2c1,,,,ULK1,ULK,Other,6mnh,JVD,...,[FP],[AP=FP],1,1,1,1,1,1,1,0
124,AP,CNC(=O)c1ccccc1,,,,p38a,MAPK,CMGC,1zyj,BI5,...,"[GA, SE]","[AP=GA, AP=SE]",1,1,1,1,1,1,1,0
125,AP,Nc1nc2ccccc2s1,,,,p38a,MAPK,CMGC,3c5u,P41,...,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1,0
126,AP,Nc1ccnn1-c1ccccc1,,,,p38a,MAPK,CMGC,3ocg,OCG,...,[GA],[AP=GA],1,1,1,1,1,1,1,0


# one function for retrosynthesis call

In [None]:
import multiprocessing as mp

def pairwise_retrosynthesis(fragment_library_filtered):
    res = get_pairs(fragment_library_filtered)
    valids = checkvalid(res, fragment_library_filtered)
    bonds = get_bonds(valids, res, fragment_library_filtered)
    pair_df = get_pairs(valids, bonds, fragment_library_filtered)
    num_cpu = mp.cpu_count()
    df_split = np.array_split(pair_df, num_cpu)
    #create list of smiles as parallel computing cannot handle molecules
    pairs_smiles = []
    for mol in pairs['pair']:
        pairs_smiles.append(Chem.MolToSmiles(mol))
    para_res = Parallel(n_jobs=num_cpu)(delayed(call_retro_parallel)(split) for split in df_split)
    para_result = pd.concat(para_res)
    children_list = []
    for i, row in para_result.iterrows():
        for l in range(len(row['child 1'])):
            children_list.append(row['child 1'][l])
            children_list.append(row['child 2'][l])
    children_list = set(children_list)
    
    children_mols = []
    children_smiles = []
    for smile in children_list:
        if smile != None:
            children_mols.append(Chem.MolFromSmiles(smile))
            children_smiles.append(smile)
        else:
            children_mols.append(None)
            children_smiles.append(None)
    children_df = pd.DataFrame(list(zip(children_mols, children_smiles)), columns = ('child_mol', 'child_smiles'))
    
    pairs_frags_smiles = []
    frag1 = []
    frag2 = []
    pair = []
    for fragids in pairs['fragment ids']:
        frag1.append(Chem.MolToSmiles(fragment_library_filtered[fragids[0].split("_")[0]]['ROMol'][int(fragids[0].split("_")[1])]))
        frag2.append(Chem.MolToSmiles(fragment_library_filtered[fragids[1].split("_")[0]]['ROMol'][int(fragids[1].split("_")[1])]))
    for pairmol in pairs['pair']:
        pair.append(Chem.MolToSmiles(pairmol))
    pairs_frags_smiles = pd.DataFrame(list(zip(pairs['fragment ids'], frag1, frag2, pair)), columns=('fragment ids', 'fragment 1', 'fragment 2', 'pair'))
    pairs_frags_smiles
    
    res_df = compare_mols(para_result, pairs_frags_smiles)
    countfrag, fraglib_filtered = retro_fragments(res_df, fragment_library_filtered)
    
    #Todo: write function to make res_df a pandas df with molecules
    mol_df = get_mol_df(res_df)
    return fraglib_filtered, res_df

In [None]:
def get_mol_df(res_df):
    frag1_mol = []
    frag2_mol = []
    pair_mol = []
    child1_mol = []
    child2_mol = []
    for i, row in res_df.iterrows():
        frag1_mol.append(Chem.MolFromSmiles(row['fragment 1']))
        frag2_mol.append(Chem.MolFromSmiles(row['fragment 2']))
        pair_mol.append(Chem.MolFromSmiles(row['pair']))
        if(row['child 1'] is not None):
            child1_mol.append(Chem.MolFromSmiles(row['child 1']))
            child2_mol.append(Chem.MolFromSmiles(row['child 2']))
        else:
            child1_mol.append(None)
            child2_mol.append(None)
        
    mol_df =  pd.DataFrame(list(zip(res_df['fragment ids'], frag1_mol, frag2_mol, pair_mol, child1_mol, child2_mol, res_df['plausibility'])), columns=("fragment ids", "fragment 1", "fragment 2", "pair", "child 1", "child 2"))
    
    return mol_df

# Calling retrosynthesizability as function
ToDo: check at weekend for complete filtered fragment library!

In [24]:
%%time
fragment_library_filtered, retro_df, countfrag = filters.retro.pairwise_retrosynthesis(fragment_library_filtered)
# 100 fragments/pairs
# CPU times: user 46.6 s, sys: 15.6 ms, total: 46.7 s
# Wall time: 7min 56s

# 10 fragments/pairs
# CPU times: user 46.4 s, sys: 2.47 s, total: 48.9 s
# Wall time: 1min 40s

Number of fragments:  487
CPU times: user 49.8 s, sys: 2.42 s, total: 52.2 s
Wall time: 1min 40s


In [62]:
retro_df

Unnamed: 0,fragment ids,fragment 1,fragment 2,pair,child 1,child 2,plausibility
0,"[AP_0, SE_0]",,,,,,0.999053
1,"[AP_0, SE_0]",,,,,,0.999053
2,"[AP_0, SE_2]",,,,,,0.911037
3,"[AP_0, SE_3]",,,,,,0.999894
4,"[AP_0, SE_4]",,,,,,0.999138
...,...,...,...,...,...,...,...
97,"[AP_1, SE_20]",,,,,,0.000000
98,"[AP_1, SE_21]",,,,,,0.999996
99,"[AP_1, SE_24]",,,,,,0.999503
100,"[AP_1, SE_25]",,,,,,0.000000


In [63]:
fragment_library_filtered['AP']

Unnamed: 0,subpocket,smiles,ROMol,ROMol_dummy,ROMol_original,kinase,family,group,complex_pdb,ligand_pdb,...,connections,connections_name,bool_pains,bool_brenk,ro3,qed,bool_bb,bool_syba,bool,retro_count
0,AP,Cc1cc(N)[nH]n1,,,,ABL1,Abl,TK,2f4j,VX6,...,[SE],[AP=SE],1,1,1,1,1,1,1,47
1,AP,c1cnc2ccnn2c1,,,,ACTR2,STKR,TKL,3q4t,TAK,...,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1,20
2,AP,c1cn[nH]c1,,,,AKT2,Akt,AGC,2uw9,GVP,...,[FP],[AP=FP],1,1,1,1,1,1,1,0
3,AP,Nc1ncccc1O,,,,ALK,ALK,TK,2xp2,VGH,...,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1,0
4,AP,Nc1ncccn1,,,,ALK,ALK,TK,4dce,0JF,...,"[FP, SE]","[AP=FP, AP=SE]",1,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,AP,c1ccc2[nH]nnc2c1,,,,ULK1,ULK,Other,6mnh,JVD,...,[FP],[AP=FP],1,1,1,1,1,1,1,0
124,AP,CNC(=O)c1ccccc1,,,,p38a,MAPK,CMGC,1zyj,BI5,...,"[GA, SE]","[AP=GA, AP=SE]",1,1,1,1,1,1,1,0
125,AP,Nc1nc2ccccc2s1,,,,p38a,MAPK,CMGC,3c5u,P41,...,"[SE, GA]","[AP=SE, AP=GA]",1,1,1,1,1,1,1,0
126,AP,Nc1ccnn1-c1ccccc1,,,,p38a,MAPK,CMGC,3ocg,OCG,...,[GA],[AP=GA],1,1,1,1,1,1,1,0


Test running for complete filtered fragment library
Start: 

In [47]:
test = []
if not test == []:
    print("")
    
else:
    print("test == []")

test == []


In [46]:
%%time
#0-4999
fragment_library_filtered, retro_df, countfrag, diff_mol_df = filters.retro.pairwise_retrosynthesis(fragment_library_filtered)

Number of fragments:  487
Number of pairs: 31918
CPU times: user 55.5 s, sys: 6.3 s, total: 1min 1s
Wall time: 6h 24min 1s


In [None]:
%%time
#running for 5000-9999
fragment_library_filtered2, retro_df2, countfrag2, diff_mol_df2 = filters.retro.pairwise_retrosynthesis(fragment_library_filtered)

Number of fragments:  487
Number of pairs: 31918


In [None]:
%%time
#running for 10000-10999
fragment_library_filtered3, retro_df3, countfrag3, diff_mol_df3 = filters.retro.pairwise_retrosynthesis(fragment_library_filtered)

In [36]:
retro_df

Unnamed: 0,fragment ids,fragment 1,fragment 2,pair,child 1,child 2,plausibility
0,"[AP_0, SE_0]",,,,,,0.936692
1,"[AP_0, SE_2]",,,,,,0.911037
2,"[AP_0, SE_3]",,,,,,0.999894
3,"[AP_0, SE_4]",,,,,,0.999138
4,"[AP_0, SE_5]",,,,,,0.996351
...,...,...,...,...,...,...,...
572,"[AP_3, SE_64]",,,,,,[0.993607044]
573,"[AP_3, SE_64]",,,,,,[0.993607044]
574,"[AP_3, SE_65]",,,,,,0
575,"[AP_3, SE_66]",,,,,,0.999692


In [37]:
diff_mol_df

Unnamed: 0,fragment ids,fragment 1,fragment 2,pair,child 1,child 2,plausibility
0,"[AP_0, SE_0]",,,,,,0.936692
1,"[AP_0, SE_4]",,,,,,0.999138
2,"[AP_0, SE_7]",,,,,,0.836471
3,"[AP_0, SE_12]",,,,,,0.840037
4,"[AP_0, SE_15]",,,,,,0.865217
5,"[AP_0, SE_23]",,,,,,0.950496
6,"[AP_0, SE_31]",,,,,,0.943756
7,"[AP_0, SE_37]",,,,,,0.98227
8,"[AP_0, SE_54]",,,,,,0.879014
9,"[AP_0, SE_57]",,,,,,0.998734
