In [1]:
import multiprocessing as mp

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import math
from pathlib import Path

import pandas as pd
#import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, Lipinski, PandasTools

from kinfraglib import utils
from kinfraglib import filters



In [4]:
# Needed to display ROMol images in DataFrames
PandasTools.RenderImagesInAllDataFrames(images=True)

In [5]:
# Path to data
HERE = Path(_dh[-1])
PATH_DATA = HERE / '../../data'

In [6]:
fragment_library = utils.read_fragment_library(PATH_DATA / 'fragment_library')

In [7]:
fragment_library = filters.prefilters.pre_filters(
    fragment_library)

In [8]:
pains_dict = filters.pains.get_pains(fragment_library)

In [9]:
fragment_library = pains_dict["fragment_library"]

In [10]:
HERE = Path(_dh[-1])
DATA =  HERE / '../../../Brenk'

In [11]:
brenk_dict = filters.unwanted_substructures.get_brenk(fragment_library, DATA)

Number of unwanted substructures in Brenk et al. collection: 104


In [12]:
fragment_library =  brenk_dict['fragment_library']

In [13]:
druglikeness_dict = filters.ruleofthree.get_ro3_frags(fragment_library)
fragment_library = druglikeness_dict["fragment_library"]

In [14]:
res_qed = filters.qed.get_qed(fragment_library)
fragment_library = res_qed["fragment_library"]

In [15]:
synthesizability_dict = filters.building_blocks.check_building_blocks(
    fragment_library,
    str(str(PATH_DATA)+'/filters/DataWarrior/Enamine_Building_Blocks.sdf'))

In [16]:
fragment_library = synthesizability_dict['fragment_library']

In [17]:
d_sybas = filters.syba.calc_syba(
    fragment_library,
    cutoff=0)

In [18]:
fragment_library = d_sybas['fragment_library']

In [19]:
fragment_library = filters.analysis.number_of_accepted(
    fragment_library, columns=['bool_pains', 'bool_brenk', 'ro3', 'qed', 'bool_bb', 'bool_syba'],
    min_accepted=6)

In [20]:
for subpocket in fragment_library.keys():
    fragment_library[subpocket].drop(fragment_library[subpocket].loc[fragment_library[subpocket]['bool']==0].index, inplace=True)
    fragment_library[subpocket] = fragment_library[subpocket].reset_index(drop=True)

In [21]:
def get_tuple(fragment, dummy_atoms):

    """
    **copied from https://github.com/volkamerlab/KinaseFocusedFragmentLibrary/blob/b7e684c26f75efffc2a9ba2383c9027cdd4c29a3/kinase_focused_fragment_library/recombination/classes_meta.py**  # noqa: E501
    For a given fragment, returns:
    - smiles string with generic dummy atoms (dummy labels removed)
    - dummy atoms as tuples of frag_atom_id and subpocket (of the dummy = neighboring subpocket of
    the fragment)
    Parameters
    ----------
    fragment: RDKit Mol object
    dummy_atoms: list(RDKit Atom objects)
        list of all dummy atoms of the fragment
    Returns
    -------
    String
        SMILES string of the fragment
    frozenset(tuple)
        frozenset of tuples for each dummy atom containing the frag_atom_id and the subpocket of
        the dummy
    """

    frag_smiles = fragment
    # replace dummys with generic dummys (without atom number)
    # dummy tuple: (frag_atom_id, neighboring_subpocket), e.g. (AP_4, FP)
    dummy_set = []
    for dummy in dummy_atoms:
        frag_smiles = Chem.ReplaceSubstructs(
            frag_smiles, Chem.MolFromSmiles(dummy.GetSmarts()), Chem.MolFromSmiles("*")
        )[0]
        dummy_tuple = dummy.GetProp("frag_atom_id"), dummy.GetProp("subpocket")
        dummy_set.append(dummy_tuple)
    frag_smiles = Chem.MolToSmiles(frag_smiles)

    dummy_set = frozenset(dummy_set)

    return frag_smiles, dummy_set


class Compound:

    """
    **copied from https://github.com/volkamerlab/KinaseFocusedFragmentLibrary/blob/b7e684c26f75efffc2a9ba2383c9027cdd4c29a3/kinase_focused_fragment_library/recombination/classes_meta.py**  # noqa: E501
    Represents a combination of fragments including its dummy atoms
    Attributes
    ----------
    frag_ids: list(str)
        Strings representing the fragments that the molecule consists of
    subpockets: list(str)
        Subpockets that the molecule is targeting
    ports: list(Port)
        Port objects representing the dummy atoms of the molecule
    bonds: list(tuple(str))
        Bonds through which the fragments are connected.
        The bonds are stored as tuples of atom IDs.
    """

    def __init__(self, frag_ids, subpockets, ports, bonds):

        self.frag_ids = frag_ids
        self.subpockets = subpockets
        self.ports = ports
        self.bonds = bonds


class Fragment:

    """
    **copied from https://github.com/volkamerlab/KinaseFocusedFragmentLibrary/blob/b7e684c26f75efffc2a9ba2383c9027cdd4c29a3/kinase_focused_fragment_library/recombination/classes_meta.py**  # noqa: E501
    Represents a single fragment from the fragment library
    Attributes
    ----------
    frag_id: str
        ID of the fragment: subpocket_ID, e.g. AP_5
    subpocket: str
        Subpocket that the fragment is targeting
    ports: list(Port)
        Port objects representing the dummy atoms of the fragment
    """

    def __init__(self, frag_id, subpocket, ports):

        self.frag_id = frag_id
        self.subpocket = subpocket  # list of targeted subpockets
        self.ports = ports  # list of Port objects


class Port:

    """
    **copied from https://github.com/volkamerlab/KinaseFocusedFragmentLibrary/blob/b7e684c26f75efffc2a9ba2383c9027cdd4c29a3/kinase_focused_fragment_library/recombination/classes_meta.py**  # noqa: E501
    Represents a single dummy atom
    Attributes
    ----------
    atom_id: str
        frag_atom_id of the dummy atom
    subpocket: str
        Subpocket of the atom adjacent to the dummy atom (subpocket of the fragment containing
        the dummy)
    neighboring_subpocket: str
        Subpocket of the dummy atom
    bond_type: str
        Type of the bond connecting the dummy to its adjacent atom
    environment: str
        Type of the environment of the current fragment (of the adjacent atom)
    """

    def __init__(
        self, atom_id, subpocket, neighboring_subpocket, bond_type, environment
    ):

        self.atom_id = atom_id
        self.subpocket = subpocket
        self.neighboring_subpocket = neighboring_subpocket
        self.bond_type = bond_type
        self.environment = environment


class Combination:

    """
    **copied from https://github.com/volkamerlab/KinaseFocusedFragmentLibrary/blob/b7e684c26f75efffc2a9ba2383c9027cdd4c29a3/kinase_focused_fragment_library/recombination/classes_meta.py**  # noqa: E501
    Comparable representation of a combination of fragments
    Attributes
    ----------
    frag_ids: frozenset(str)
        Strings representing the fragments that the molecule consists of
    bonds: frozenset(tuple(str))
        Bonds through which the fragments are connected.
        The bonds are stored as tuples of atom IDs.
    Methods
    ----------
    __eq__()
        Two Combination objects are equal if they consist of the same fragments which are
        connected through the same bonds.
    """

    def __init__(self, frag_ids, bonds=None):
        self.frag_ids = frag_ids
        self.bonds = bonds

    def __eq__(self, other):
        return self.frag_ids == other.frag_ids and self.bonds == other.bonds

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash((self.frag_ids, self.bonds))


In [22]:
def get_valid_pairs(fragment_library):
    """
        *copied and adapted from kinase_focused_fragment_library*
    """
    data = {}  # (Fragments)
    frag_set = set()  # only used in initialization for avoiding duplicates in fragment data set (smiles & dummy atoms)

    #iterate through subpockets and fragments in subpockets
    #save subpocket_fragmentindex and dummy atoms, bonds etc
    for subpocket in fragment_library.keys():
        fragments = []
        for i, row in fragment_library[subpocket].iterrows():
            #get fragment and connecting subpockets
            fragment = row['ROMol_original']
            fragment = Chem.RemoveHs(fragment)
            frag_id = f'{subpocket}_{i}'
            
            # store unique atom identifiers
            for a, atom in enumerate(fragment.GetAtoms()):
                frag_atom_id = f'{subpocket}_{a}'
                atom.SetProp('frag_atom_id', frag_atom_id)
                
            # get all dummy atoms of this fragment except the ones corresponding to the X pool
            dummy_atoms = [a for a in fragment.GetAtoms() if a.GetSymbol() == '*' and not a.GetProp('subpocket').startswith('X')]
            if not dummy_atoms:
                continue
            
            frag_smiles, dummy_set = get_tuple(fragment, dummy_atoms)
            # check if this exact fragment has already been found
            if (frag_smiles, dummy_set) in frag_set:
                continue
            # if not, add this fragment to set of fragments
            frag_set.add((frag_smiles, dummy_set))

            # create dummy atom objects
            ports = [Port(atom_id=dummy.GetProp('frag_atom_id'), subpocket=subpocket, neighboring_subpocket=dummy.GetProp('subpocket'),
                          bond_type=fragment.GetBondBetweenAtoms(dummy.GetIdx(), dummy.GetNeighbors()[0].GetIdx()).GetBondType(),
                          environment=dummy.GetNeighbors()[0].GetProp('environment'))
                     for dummy in dummy_atoms]   
            # add all dummy atoms of this fragment to the queue
            #compound = Compound(frag_ids=[frag_id], subpockets=[subpocket], ports=ports, bonds=[])
            #combo = Combination(frag_ids=frozenset([frag_id]))
            
            # store fragment in constant data set
            fragment = Fragment(frag_id=frag_id, subpocket=subpocket, ports=ports)
            fragments.append(fragment)
        data[subpocket] = fragments
        
    n_frags = len(frag_set)

    print('Number of fragments: ', n_frags)
    
    return data

In [23]:
def checkvalid(data, fragment_library):
    matches = [] #save matching fragment pairs
    #iterate through subpockets
    for subpocket in fragment_library.keys():
        #iterate through fragments in subpockets
        for fragment in data[subpocket]:
            fragment_id1 = fragment.frag_id #store fragment ID of first fragment in pair
            #go through atom connnections and check neighbors, bond type and environment
            for i in range(0,len(fragment.ports)):
                neighbor = fragment.ports[i].neighboring_subpocket
                bond_type = fragment.ports[i].bond_type
                environment = fragment.ports[i].environment
                match = [] #store current matching fragment pair
                for frag2 in data[neighbor]:
                    fragment_id2 = frag2.frag_id  #store fragment ID of second fragment            
                    for i in range(0,len(frag2.ports)): 
                        #check environment type, subpocket, bond type
                        environment_match = filters.brics_rules.is_brics_bond(environment, frag2.ports[i].environment) #check if BRICS environments are able to form connection
                        #if subpocket is adjacent, bond type is eqal and environments are matching, add as valid matching pair
                        if frag2.ports[i].neighboring_subpocket == subpocket and neighbor ==  frag2.ports[i].subpocket and frag2.ports[i].bond_type == bond_type and environment_match:
                            match.append([fragment_id1, fragment_id2])        
                matches.append(match) #add valid matching pair to list of matching pairs
    return matches  

In [24]:
res = get_valid_pairs(fragment_library)

Number of fragments:  487


In [25]:
valids = checkvalid(res, fragment_library)

In [26]:
def get_bonds(valids, data, fragment_library):
    bonds = [] #store bonds of valid matching pairs as atom IDs where connection is formed
    #go through all valid pairs
    for valid in valids:
        bond = []
        for val in valid:
            #load fragments that should get connected
            subpocket1 = val[0].split("_")[0]
            fragment1_index = int(val[0].split("_")[1])
            fragment1 = fragment_library[subpocket1]["ROMol_original"][fragment1_index] 
            #remove Hs before finding bonds otherwise bond ids not correct because for combining molecules without Hs are used
            fragment1 = Chem.RemoveHs(fragment1)
            
            subpocket2 = val[1].split("_")[0]
            fragment2_index = int(val[1].split("_")[1])
            fragment2 = fragment_library[subpocket2]["ROMol_original"][fragment2_index] 
            #remove Hs before finding bonds
            fragment2 = Chem.RemoveHs(fragment2)
            
            #i = 0
            bond1_id = None
            bond2_id = None
        
            
            data1 = data[subpocket1][fragment1_index] #get corresponding connection to load environment, bond type and neighboring subpocket
            for i in range(0,len(data1.ports)):
                environment1 = data1.ports[i].environment
                bond_type1 = data1.ports[i].bond_type
                neighbor1 = data1.ports[i].neighboring_subpocket
                
                data2 = data[subpocket2][fragment2_index] #for matching fragment also get the connection data
                for j in range(0,len(data2.ports)):
                    environment2 = data2.ports[j].environment
                    bond_type2 = data2.ports[j].bond_type
                    neighbor2 = data2.ports[j].neighboring_subpocket
                    
                    #check again if BRICS bond, bond types and subpockets are matching for a connection
                    if  filters.brics_rules.is_brics_bond(environment1, environment2) and bond_type1 == bond_type2 and subpocket2 == neighbor1 and subpocket1 == neighbor2:
                        #get atom indices where connection is build
                        for atom in fragment1.GetAtoms():
                            atom_symbol = atom.GetSymbol()
                            if atom_symbol == "*":
                                bond1_id = subpocket1 + "_" + str(atom.GetIdx())
                        
                        for atom2 in fragment2.GetAtoms():
                            atom_symbol2 = atom2.GetSymbol()
                            if atom_symbol2 == "*":
                                bond2_id = subpocket2 + "_" + str(atom2.GetIdx())
                        
            bond.append([bond1_id, bond2_id, bond_type1]) #save atom indices and bond type for building the connection
        bonds.append(bond)
    return bonds

In [27]:
bonds = get_bonds(valids, res, fragment_library)

In [28]:
def construct_ligand(fragment_ids, bond_ids, fragment_library):
    """
    *copied and adapted from kinase_focused_fragment_library*
    Construct a ligand by connecting multiple fragments based on a Combination object
    Parameters
    ----------
    fragment_ids: list of str
        Fragment IDs of recombined ligand, e.g. `["SE_2", "AP_0", "FP_2"]` (`<subpocket>_<fragment index in subpocket pool>`).
    bond_ids : list of list of str
        Bond IDs of recombined ligand, e.g. `[["FP_6", "AP_10"], ["AP_11", "SE_13"]]`: Atom (`<subpocket>_<atom ID>`) pairs per fragment bond.
    fragment_library : dict of pandas.DataFrame
        SMILES and RDKit molecules for fragments (values) per subpocket (key).
    Returns
    -------
    ligand: rdkit.Chem.rdchem.Mol or None
        Recombined ligand (or None if the ligand could not be constructed)
    """

    fragments = []
    for fragment_id in fragment_ids:

        # Get subpocket and fragment index in subpocket
        subpocket = fragment_id.split("_")[0]
        fragment_index = int(fragment_id.split("_")[1])
        fragment = fragment_library[subpocket].ROMol_original[fragment_index]

        # Store unique atom identifiers in original molecule (important for recombined ligand construction based on atom IDs)
        fragment = Chem.RemoveHs(fragment)
        for i, atom in enumerate(fragment.GetAtoms()):
            fragment_atom_id = f"{subpocket}_{i}"
            atom.SetProp("fragment_atom_id", fragment_atom_id)
            atom.SetProp("fragment_id", fragment.GetProp("complex_pdb"))
        fragment = PropertyMol(fragment)

        # Append fragment to list of fragments
        fragments.append(fragment)

    # Combine fragments using map-reduce model
    combo = reduce(Chem.CombineMols, fragments)

    bonds_matching = True
    ed_combo = Chem.EditableMol(combo)
    replaced_dummies = []

    #atoms = combo.GetAtoms()
    
    
    #for bond in bond_ids:

    dummy_1 = next(
            atom for atom in combo.GetAtoms() if atom.GetProp("fragment_atom_id") == bond_ids[0]
    )
    dummy_2 = next(
            atom for atom in combo.GetAtoms() if atom.GetProp("fragment_atom_id") == bond_ids[1]
    )
    atom_1 = dummy_1.GetNeighbors()[0]
    atom_2 = dummy_2.GetNeighbors()[0]

    # check bond types
    bond_type_1 = combo.GetBondBetweenAtoms(dummy_1.GetIdx(), atom_1.GetIdx()).GetBondType()
    bond_type_2 = combo.GetBondBetweenAtoms(dummy_2.GetIdx(), atom_2.GetIdx()).GetBondType()
    if bond_type_1 != bond_type_2:
        bonds_matching = False
        print("Bonds not matching")

    ed_combo.AddBond(atom_1.GetIdx(), atom_2.GetIdx(), order=bond_type_1)

    replaced_dummies.extend([dummy_1.GetIdx(), dummy_2.GetIdx()])

    # Do not construct this ligand if bond types are not matching
    if not bonds_matching:
        return

    # Remove replaced dummy atoms
    replaced_dummies.sort(reverse=True)
    for dummy in replaced_dummies:
        ed_combo.RemoveAtom(dummy)

    ligand = ed_combo.GetMol()

    # Replace remaining dummy atoms with hydrogens
    du = Chem.MolFromSmiles("*")
    h = Chem.MolFromSmiles("[H]", sanitize=False)
    ligand = AllChem.ReplaceSubstructs(ligand, du, h, replaceAll=True)[0]
    try:
        ligand = Chem.RemoveHs(ligand)
    except ValueError:
        print(Chem.MolToSmiles(ligand))
        return

    # Clear properties
    for prop in ligand.GetPropNames():
        ligand.ClearProp(prop)
    for atom in ligand.GetAtoms():
        atom.ClearProp("fragment_atom_id")

    # Generate 2D coordinates
    AllChem.Compute2DCoords(ligand)

    return ligand

In [29]:
def get_pairs(valids, bonds, fragment_library_filtered):
    pairs = []
    frags1 = []
    frags2 = []
    ids = []
    for i in range(0, len(valids)):
        for j in range(0, len(valids[i])):
            frag1 = fragment_library_filtered[valids[i][j][0].split("_")[0]]['ROMol_dummy'][int(valids[i][j][0].split("_")[1])]
            frag2 = fragment_library_filtered[valids[i][j][1].split("_")[0]]['ROMol_dummy'][int(valids[i][j][1].split("_")[1])]
            
            frags1.append(frag1)
            frags2.append(frag2)
            
            pair = construct_ligand(valids[i][j], bonds[i][j], fragment_library_filtered)
            pairs.append(pair)
            ids.append(valids[i][j])
            
    return pd.DataFrame({'fragment ids': ids, 'fragment1':frags1, 'fragment2':frags2, 'pair':pairs})

In [30]:
from rdkit.Chem.PropertyMol import PropertyMol
from functools import reduce

In [31]:
pair_df = get_pairs(valids, bonds, fragment_library)

In [36]:
pair_smiles = []
for pair in pair_df['pair']:
    pair_smiles.append(Chem.MolToSmiles(pair))
    
unique_smiles = pd.DataFrame({'pair': pair_smiles})['pair'].unique()

# Multiprocessing

In [38]:
def call_retro_parallel(pair_smiles):
    """
    One step retrosynthesis using ASKCOS for all valid build pairs of fragments.
    Saving the plausibility and the children that can build this pair according to retrosynthetic
    analysis.

    Parameters
    ----------
    pair_smiles : numpy array
        containing SMILES strings of pairs build by fragments

    Returns
    -------
    pandas DataFrame
        containing the pair, the children building this pair and their plausibility

    """
    pairs = []
    children1 = []
    children2 = []
    plausibilities = []
    # n_attempts = 5
    for smile in pair_smiles:
        if smile in open('retro.txt').read():
            print('already requested')
        else:
            print(smile)
            pairs.append(smile)
            cur_children1 = []
            cur_children2 = []
            cur_plausibilities = []
            HOST = "https://askcos.mit.edu/"
            params = {
                "smiles": smile,  # required
                # optional with defaults shown
                "max_depth": 1,  # maximum number of reaction steps
                "max_branching": 25,  # ?max number of branches are looked at to find "best"?
                "expansion_time": 20,  # how long the expansion can run
                "max_ppg": 100,  # maximum price per gram
                "template_count": 100,
                # "max_cum_prob"
                # which common probability reached until no more templates are used
                "max_cum_prob": 0.995,
                # "chemical_property_logic"
                # molecules are buyable or not, can be 'none' (only price relevant),
                # 'and' (price and heavy atoms constraint) or
                # 'or' (one of both constraints is relevant)
                "chemical_property_logic": "none",
                # max heavy atom contraints if 'and' or 'or' is used in 'chemical_property_logic'
                "max_chemprop_c": 0,
                "max_chemprop_n": 0,
                "max_chemprop_o": 0,
                "max_chemprop_h": 0,
                # want to use popular chemicals as reasonable stopping points?
                "chemical_popularity_logic": "none",
                "min_chempop_reactants": 5,  # min frequence as popular reactant
                "min_chempop_products": 5,  # min frequence as popular prouct
                "filter_threshold": 0.75,
                "return_first": "true",  # default is false
            }
            # for attempt in range(n_attempts):
            try:
                resp = requests.get(HOST + "/api/treebuilder/", params=params,
                                    verify=False)
            except requests.exceptions.Timeout as err:
                print(err)
                sleep(50)
                continue
            retro = resp.json()

            if "trees" in retro:
                if (len(retro["trees"])) > 0:
                    for num_tree in range(0, len(retro["trees"])):
                        if len(retro["trees"][num_tree]["children"][0]["children"]) == 2:
                            plausibility = retro["trees"][0]["children"][0]["plausibility"]
                            child1 = retro["trees"][num_tree]["children"][0]["children"][0][
                                "smiles"
                            ]
                            child2 = retro["trees"][num_tree]["children"][0]["children"][1][
                                "smiles"
                            ]
                            cur_children1.append(child1)
                            cur_children2.append(child2)
                            cur_plausibilities.append(plausibility)

                else:
                    cur_children1.append(None)
                    cur_children2.append(None)
                    cur_plausibilities.append(0)
            else:
                cur_children1.append(None)
                cur_children2.append(None)
                cur_plausibilities.append(0)
            children1.append(cur_children1)
            children2.append(cur_children2)
            plausibilities.append(cur_plausibilities)

    res = pd.DataFrame(
        list(zip(pairs, children1, children2, plausibilities)),
        columns=["pair", "child 1", "child 2", "plausibility"],
    )

    return res

In [34]:
import numpy as np
import requests
import warnings

In [35]:
 num_cpu = mp.cpu_count()

In [36]:
df_split = np.array_split(unique_smiles[0:10], num_cpu)

In [39]:
# manual multiprocessing
# %%time
# warnings.filterwarnings("ignore")
# processes = []

# for i in range(num_cpu):
#     p = mp.Process(target=call_retro_parallel, args=(df_split[i],))
#     p.start()
#     processes.append(p)
    
# for process in processes:
#     process.join()

already requested
already requestedalready requested

already requestedalready requested

already requested
already requested
already requested
already requested
already requested
CPU times: user 0 ns, sys: 438 ms, total: 438 ms
Wall time: 491 ms


In [None]:
# df_split = np.array_split(unique_smiles[3000:3100], num_cpu)

In [38]:
# saves only after iterating through all smiles 
# %%time
# warnings.filterwarnings("ignore")

# df_split = np.array_split(unique_smiles[3000:3100], num_cpu)
# p = mp.Pool(num_cpu)
# with open("retro.txt", "a+") as f_object:
#     for result in p.map(call_retro_parallel, df_split):
#             #save current result in csv file
#         for i, row in result.iterrows():
#             f_object.write(str(row['pair']) + "; " + str(row['child 1']) + "; " + str(row['child 2']) + "; " + str(row['plausibility']) + "\n")
#     p.close()
#     p.join()
#                 # print(row['pair'])
#             # f_object.write(result.to_string())
# f_object.close()
# #result = p.map(call_retro_parallel, df_split)


CN(C)c1ccc(Nc2ccncn2)cc1c1ccc2[nH]c(Nc3ccncn3)nc2c1Cn1cc(Nc2ccncn2)c(C(F)(F)F)n1CC(C)(C)c1ccc(Nc2ccncn2)cc1FC(F)(F)c1cccc(Nc2ncc(Br)cn2)c1CCc1cccc(Nc2ccncn2)n1OCCn1cc(Nc2ncc(Br)cn2)cn1






Cc1ccc(Cl)c(Nc2ncc(Br)cn2)c1
Cc1ccc(Nc2ccncn2)nc1C
c1ccc2ncc(Nc3ccncn3)cc2c1
Clc1cccc(Nc2ncc(Br)cn2)c1Cl
COc1ccc2cccc(Nc3ccncn3)c2c1
Brc1cnc(Nc2cccnc2)nc1
Clc1ncc(Nc2ccncn2)cn1
NC(=O)c1ccc(Nc2ccncn2)cc1
O=C(Nc1cccc(Nc2ncc(Br)cn2)c1)c1ccccc1
Cc1cc(Nc2ccncn2)nc(N)n1
Brc1cnc(NNCc2cccnc2)nc1
c1cc(Nc2cncnc2)ncn1
Fc1ccccc1Nc1ncc(Br)cn1
Brc1cnc(Nc2ccc3c(c2)CCC3)nc1
O=C1CCCN1c1cccc(Nc2ccncn2)c1
CSc1cccc(Nc2ccncn2)c1
Brc1ccc(Nc2ccncn2)cc1
Clc1ccccc1Nc1ncc(Br)cn1
Oc1ccc2cccc(Nc3ccncn3)c2c1
OCCc1ccc(Nc2ccncn2)cc1
c1cnc(NNc2ccncn2)nc1
O[C@@H]1CC[C@@H](Nc2ccncn2)CC1
Brc1cnc(Nc2ccccc2Br)nc1
COc1cc(F)c(Nc2ccncn2)cc1OC
Cc1ncc(Nc2ncc(Br)cn2)[nH]1
Cc1ccc(Nc2ncc(Br)cn2)c(Br)c1
O=C(Nc1ccccc1)c1ccc(Nc2ccncn2)cc1
COc1ccc(Cl)cc1Nc1ccncn1
COc1cc(Nc2ccncn2)ccc1Cl
CC[C@@H](CO)Nc1ccncn1
COc1cc(N2CCNCC2)ccc1Nc1ccncn1
c1ccc2c(

In [None]:
# %%time
# warnings.filterwarnings("ignore")
# df_split1 = np.array_split(unique_smiles, 100)
# p = mp.Pool(num_cpu)
# with open("retro.txt", "a+") as f_object:
#     for split in df_split1:
#         df_split = np.array_split(split, num_cpu)
#         for result in p.map(call_retro_parallel, df_split):
#                 #save current result in csv file
#             for i, row in result.iterrows():
#                 f_object.write(str(row['pair']) + "; " + str(row['child 1']) + "; " + str(row['child 2']) + "; " + str(row['plausibility']) + "\n")
#     p.close()
#     p.join()
# f_object.close()

already requestedalready requestedalready requestedalready requestedalready requestedalready requestedalready requested

already requested


already requested

already requestedalready requested
already requested
already requested
already requestedalready requested
already requestedalready requested


already requested



already requestedalready requestedalready requestedalready requestedalready requestedalready requestedalready requested

already requested





already requestedalready requestedalready requestedalready requestedalready requestedalready requestedalready requestedalready requested







already requestedalready requestedalready requested
already requestedalready requestedalready requested
already requestedalready requested

already requested


already requested
already requestedalready requested
already requestedalready requestedalready requested
already requested

already requestedalready requested


already requested
already requested


already requestedalready requ

In [None]:
%%time
warnings.filterwarnings("ignore")
# filter out smiles that are already requested
filtered_smiles = []
for smiles in unique_smiles:
    if not smiles in open('retro.txt').read():
        filtered_smiles.append(smiles)
print(len(filtered_smiles))
df_split1 = np.array_split(filtered_smiles, 150)
p = mp.Pool(num_cpu)
for split in df_split1:
    with open("retro.txt", "a+") as f_object:
        df_split = np.array_split(split, num_cpu)
        for result in p.map(call_retro_parallel, df_split):
                #save current result in csv file
            # print(result)
            for i, row in result.iterrows():
                f_object.write(str(row['pair']) + "; " + str(row['child 1']) + "; " + str(row['child 2']) + "; " + str(row['plausibility']) + "\n")
    f_object.close()
p.close()
p.join()


13083
Nc1cc(-c2ccc(OC(F)(F)F)cc2)[nH]n1Nc1cc(-c2ccc3c(c2)CCO3)[nH]n1CSc1cccc(-c2cc(N)n[nH]2)c1Cc1cc(C)cc(-c2cc(N)n[nH]2)c1CNS(=O)(=O)c1ccc(-c2cc(N)n[nH]2)cc1Nc1cc(-c2ccc(Cl)c(O)c2)[nH]n1N#Cc1cnc(-c2cc(N)n[nH]2)cn1Nc1cc(-c2ccncn2)[nH]n1







Nc1cc(-c2n[nH]c3ccccc23)[nH]n1
COc1ccc2cccc(-c3cc(N)n[nH]3)c2c1
Nc1cc(-c2cnccn2)[nH]n1
Nc1cc(-c2cscn2)[nH]n1
Nc1cc(-c2cc3ccccc3cn2)[nH]n1
Nc1cc(-c2ccc(C(F)(F)F)cc2)[nH]n1
Nc1cc([C@@H]2CC[C@@H](O)CC2)[nH]n1
Nc1cc(-c2c(F)cccc2F)[nH]n1
Nc1cc(COc2ccccc2Cl)[nH]n1
Nc1cc(-c2cccc3ccc(O)cc23)[nH]n1
CCc1cccc(-c2cc(N)n[nH]2)n1
COC(=O)c1ccc(-c2cc(N)n[nH]2)o1
CC[C@@H](CO)c1cc(N)n[nH]1
CC(=O)Nc1cccc(-c2cc(N)n[nH]2)c1
CN(C)c1ccc(-c2cc(N)n[nH]2)cc1
COc1cc(F)c(-c2cc(N)n[nH]2)cc1OC
Nc1cc(-c2c(Cl)cccc2Cl)[nH]n1
Nc1cc(-c2ccc(F)c(Cl)c2)[nH]n1
Nc1cc(-c2ccc(F)cc2Cl)[nH]n1
Cc1ccc(-c2cc(N)n[nH]2)nc1C
COc1cc(-c2cc(N)n[nH]2)ccc1O
Nc1cc(-c2ccoc2)[nH]n1
COc1cc(-c2cc(N)n[nH]2)ccc1Cl
Nc1cc(Oc2ccc(F)cc2)[nH]n1
Cc1nc(C)c(-c2cc(N)n[nH]2)s1
Nc1cc([C@H]2CC[C@@H](O)C2)[nH]n1
Nc1cc(-c

In [43]:
# does not do anything

# from multiprocessing.pool import Pool, ThreadPool
# from multiprocessing import cpu_count
# import pandas as pd

# def worker(process_pool, index, split):
#     print("worker called")
#     # out = process_pool.map(myfunction, range(*split_range))
#     out = process_pool.map(call_parallel_retro, split)
#     pd.DataFrame(out).to_csv(f'filename_{index}.csv', header=False, index=False)



# N_SPLITS = 5
# n_processes = min(N_SPLITS, cpu_count())
# # split_ranges = split(RANGE_START, RANGE_STOP, N_SPLITS) # [(0, 100), (100, 200), ... (400, 500)]
# df_split = np.array_split(unique_smiles[4000:4020], N_SPLITS)
# process_pool = Pool(n_processes)
# thread_pool = ThreadPool(N_SPLITS)
# for index in range(len(df_split)):
#     split = df_split[index]
#     thread_pool.apply_async(worker, args=(process_pool, index, split))
# # wait for all threading tasks to complete:
# thread_pool.close()
# thread_pool.join()

worker calledworker called
worker called
worker calledworker called




In [32]:
import numpy as np
import requests
import warnings

In [44]:
def call_retro_parallel(smile):
    """
    One step retrosynthesis using ASKCOS for all valid build pairs of fragments.
    Saving the plausibility and the children that can build this pair according to retrosynthetic
    analysis.

    Parameters
    ----------
    pair_smiles : numpy array
        containing SMILES strings of pairs build by fragments

    Returns
    -------
    pandas DataFrame
        containing the pair, the children building this pair and their plausibility

    """
    pairs = []
    children1 = []
    children2 = []
    plausibilities = []
    # n_attempts = 5
    #print(smile)
    pairs.append(smile)
    cur_children1 = []
    cur_children2 = []
    cur_plausibilities = []
    HOST = "https://askcos.mit.edu/"
    params = {
                "smiles": smile,  # required
                # optional with defaults shown
                "max_depth": 1,  # maximum number of reaction steps
                "max_branching": 25,  # ?max number of branches are looked at to find "best"?
                "expansion_time": 20,  # how long the expansion can run
                "max_ppg": 100,  # maximum price per gram
                "template_count": 100,
                # "max_cum_prob"
                # which common probability reached until no more templates are used
                "max_cum_prob": 0.995,
                # "chemical_property_logic"
                # molecules are buyable or not, can be 'none' (only price relevant),
                # 'and' (price and heavy atoms constraint) or
                # 'or' (one of both constraints is relevant)
                "chemical_property_logic": "none",
                # max heavy atom contraints if 'and' or 'or' is used in 'chemical_property_logic'
                "max_chemprop_c": 0,
                "max_chemprop_n": 0,
                "max_chemprop_o": 0,
                "max_chemprop_h": 0,
                # want to use popular chemicals as reasonable stopping points?
                "chemical_popularity_logic": "none",
                "min_chempop_reactants": 5,  # min frequence as popular reactant
                "min_chempop_products": 5,  # min frequence as popular prouct
                "filter_threshold": 0.75,
                "return_first": "true",  # default is false
    }
            # for attempt in range(n_attempts):
    # try:
    resp = requests.get(HOST + "/api/treebuilder/", params=params,
                                    verify=False)
    # except requests.exceptions.Timeout as err:
    #     print(err)
    #     sleep(50)
    #     continue
    retro = resp.json()

    if "trees" in retro:
        if (len(retro["trees"])) > 0:
            for num_tree in range(0, len(retro["trees"])):
                if len(retro["trees"][num_tree]["children"][0]["children"]) == 2:
                    plausibility = retro["trees"][0]["children"][0]["plausibility"]
                    child1 = retro["trees"][num_tree]["children"][0]["children"][0][
                                "smiles"
                    ]
                    child2 = retro["trees"][num_tree]["children"][0]["children"][1][
                                "smiles"
                    ]
                    cur_children1.append(child1)
                    cur_children2.append(child2)
                    cur_plausibilities.append(plausibility)

        else:
            cur_children1.append(None)
            cur_children2.append(None)
            cur_plausibilities.append(0)
    else:
        cur_children1.append(None)
        cur_children2.append(None)
        cur_plausibilities.append(0)
    children1.append(cur_children1)
    children2.append(cur_children2)
    plausibilities.append(cur_plausibilities)

    res = pd.DataFrame(
        list(zip(pairs, children1, children2, plausibilities)),
        columns=["pair", "child 1", "child 2", "plausibility"],
    )

    return res

In [None]:
%%time
warnings.filterwarnings("ignore")
def worker(working_queue, output_queue):
    while True:
        if working_queue.empty() == True:
            break #this is the so-called 'poison pill'    
        else:
            smiles = working_queue.get() #get smiles from working queue
            res = call_retro_parallel(smiles)
            #call askcos for one smiles
            #then save result string to output_queue
            for i, row in res.iterrows():
                output_queue.put(str(row['pair']) + "; " + str(row['child 1']) + "; " + str(row['child 2']) + "; " + str(row['plausibility']) + "\n")
        if output_queue.qsize() > 100:
            print("Size of output_queue > 100.")
            print("Writing output_queue to file..")
            with open("retro.txt", "a+") as f_object:
                while True:
                    if output_q.empty() == True:
                        break
                    else:
                        f_object.write(output_q.get_nowait())
            f_object.close()
    return

filtered_smiles = []
for smiles in unique_smiles:
    if not smiles in open('retro.txt').read():
        filtered_smiles.append(smiles)
print(len(filtered_smiles))
working_q = mp.Queue()
output_q = mp.Queue()
for f_smiles in filtered_smiles:
    working_q.put(f_smiles)
processes = [mp.Process(target=worker,args=(working_q, output_q)) for i in range(mp.cpu_count())]
for proc in processes:
    proc.start()
for proc in processes:
    proc.join()
with open("retro.txt", "a+") as f_object:
    while True:
        if output_q.empty() == True:
            break
        f_object.write(output_q.get_nowait())
f_object.close()

9277
Size of output_queue > 100.
Writing output_queue to file..
Size of output_queue > 100.
Writing output_queue to file..
Size of output_queue > 100.
Writing output_queue to file..
Size of output_queue > 100.
Writing output_queue to file..


In [46]:
retro_data = pd.read_csv('retro.txt', sep="; ", header=None)
retro_data

Unnamed: 0,0,1,2,3
0,Cc1cc(Nc2ccc(C#N)cc2)[nH]n1,"['Cc1cc(N)[nH]n1', 'Cc1cc(Cl)[nH]n1', 'Cc1cc(N...","['N#Cc1ccc(B(O)O)cc1', 'N#Cc1ccc(N)cc1', 'N#Cc...","[0.999052584, 0.999052584, 0.999052584]"
1,CCOc1ccc(Nc2cc(C)n[nH]2)cc1,['CCOc1ccc(Br)cc1'],['Cc1cc(N)[nH]n1'],[0.911036551]
2,Cc1cc(Nc2c[nH]nc2C)[nH]n1,[None],[None],[0]
3,COc1ncccc1Nc1cc(C)n[nH]1,[None],[None],[0]
4,Cc1cc(Nc2cccc(S(C)(=O)=O)c2)[nH]n1,"['CS(=O)(=O)c1cccc(B(O)O)c1', 'CS(=O)(=O)c1ccc...","['Cc1cc(N)[nH]n1', 'Cc1cc(N)[nH]n1']","[0.996350527, 0.996350527]"
...,...,...,...,...
5626,COc1ccc(Cc2ccc(N)nc2)cc1,['COc1ccc(CCl)cc1'],['Nc1ccc(Br)cn1'],[0.976026356]
5627,Nc1ccc(-c2cncnc2)cn1,[None],[None],[0]
5628,Nc1ccc(C[C@H](O)CO)cn1,[None],[None],[0]
5629,COc1ccc(Cl)cc1-c1ccc(N)nc1,"['CC1(C)OB(c2ccc(N)nc2)OC1(C)C', 'COc1ccc(Cl)c...","['COc1ccc(Cl)cc1Br', 'Nc1ccc(Cl)cn1']","[0.99998, 0.99998]"
