In [3]:
from molli.external import rdkit as mrd
from molli.external.rdkit import atom_filter as maf
import pickle
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdqueries as chemq
from rdkit.Chem.PropertyMol import PropertyMol
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from pprint import pprint
import pandas as pd

def sp3_type(rdmol:PropertyMol):
    '''
    Return a boolean mask marking which atoms of ``rdmol`` match the SP3 hybridization query.

    Parameters
    ----------
    rdmol : PropertyMol
        Molecule whose atoms are queried.

    Returns
    -------
    np.ndarray
        Boolean array with one entry per atom, in the order produced by
        ``rdmol.GetAtoms()``. An entry is True when that atom's index is among the
        atoms returned by the SP3 ``HybridizationEqualsQueryAtom`` query.

    Will most likely be added to later iterations of molli but is not currently available in Molli 1.2.1
    '''
    sp3_query = chemq.HybridizationEqualsQueryAtom(Chem.HybridizationType.SP3)
    # dtype=int keeps the arrays integer-typed even when no atom matches the query
    sp3_ids = np.array(
        [atom.GetIdx() for atom in rdmol.GetAtomsMatchingQuery(sp3_query)], dtype=int
    )
    all_ids = np.array([atom.GetIdx() for atom in rdmol.GetAtoms()], dtype=int)
    # np.isin is the supported replacement for the deprecated np.in1d;
    # for these 1-D inputs both produce the same mask.
    return np.isin(all_ids, sp3_ids)

def has_chiraltag(rdmol:PropertyMol):
    '''
    Return a boolean mask marking which atoms of ``rdmol`` match the "has chiral tag" query.

    Parameters
    ----------
    rdmol : PropertyMol
        Molecule whose atoms are queried.

    Returns
    -------
    np.ndarray
        Boolean array with one entry per atom, in the order produced by
        ``rdmol.GetAtoms()``. An entry is True when that atom's index is among the
        atoms returned by the ``HasChiralTagQueryAtom`` query.

    Will most likely be added to later iterations of molli but is not currently available in Molli 1.2.1
    '''
    chiral_query = chemq.HasChiralTagQueryAtom()
    # dtype=int keeps the arrays integer-typed even when no atom matches the query
    chiral_ids = np.array(
        [atom.GetIdx() for atom in rdmol.GetAtomsMatchingQuery(chiral_query)], dtype=int
    )
    all_ids = np.array([atom.GetIdx() for atom in rdmol.GetAtoms()], dtype=int)
    # np.isin is the supported replacement for the deprecated np.in1d;
    # for these 1-D inputs both produce the same mask.
    return np.isin(all_ids, chiral_ids)

def sort_ids(s: str):
    '''
    Return the integer suffix of an ID shaped like ``<prefix>_<number>``, for use as a sort key.

    The ID is split at its LAST underscore, so a prefix that itself contains
    underscores (e.g. ``"prod_A_7"``) is handled as well as the simple
    ``"react_12"`` form.

    Raises
    ------
    ValueError
        If ``s`` contains no underscore, or if the text after the last
        underscore is not a valid integer.
    '''
    _, number = s.rsplit('_', 1)
    return int(number)

def connect_check(rdmol, alk_bool: np.ndarray):
    """
    Report whether the first two atoms flagged in ``alk_bool`` are direct neighbors.

    ``alk_bool`` is a per-atom mask; the atoms at its True positions are looked up
    on ``rdmol`` in ascending index order and only the first two are compared.
    Returns True when the second atom is among the neighbors of the first.
    Otherwise prints a message containing the molecule's ``_Name`` property and
    returns False. Fewer than two flagged atoms raises ``IndexError``.
    """
    # Atom objects (a list, in ascending index order) for every True position of the mask
    flagged_atoms = [
        rdmol.GetAtomWithIdx(int(position)) for position in np.nonzero(alk_bool)[0]
    ]
    first_atom, second_atom = flagged_atoms[0], flagged_atoms[1]

    # Indices of everything directly attached to the first flagged atom
    attached_ids = {nbr.GetIdx() for nbr in first_atom.GetNeighbors()}

    if second_atom.GetIdx() not in attached_ids:
        print(f'{rdmol.GetProp("_Name")} do not have carbons connecting')
        return False
    return True

def full_check(
        rdmol: PropertyMol,
        af_filter: np.ndarray,
        prop_value: str = "_Diol",
):
    """
    Store an atom mask on ``rdmol`` as a property and validate it.

    The mask is written to the property named ``prop_value`` as a string of
    '1'/'0' characters, then read back and compared with ``af_filter``.

    Returns 0 when the round trip matches and ``connect_check`` accepts the
    flagged atoms; returns 1 otherwise. A round-trip mismatch also prints a
    message containing the molecule's ``_Name`` property.
    """
    # Encode the mask as one character per atom and attach it to the molecule
    bit_string = "".join("1" if flag else "0" for flag in af_filter)
    rdmol.SetProp(prop_value, bit_string)

    # Decode what was actually stored so it can be compared with the input
    round_trip = np.array([char == "1" for char in rdmol.GetProp(prop_value)])

    # Guard: the stored property must decode back to the mask that was passed in
    if not all(round_trip == af_filter):
        print(
            f'{rdmol.GetProp("_Name")} did not correctly return alkene boolean, appended to problematic mol object list'
        )
        return 1

    # 0 signals "no problem"; 1 signals the flagged atoms are not connected
    return 0 if connect_check(rdmol, af_filter) else 1

def update_visualize_mols(
    name: str,
    rdmol_list: list,
    subImgSize=(700, 700),
    legendFontSize=30,
    molsPerRow=5,
    label: str = "_Name",
    highlight_bonds=True,
    highlight_prop="_Alkene",
):
    """
    Draw the molecules in ``rdmol_list`` as a grid and write it to ``{name}.svg``.

    Parameters
    ----------
    name : str
        Output file stem; the drawing is written to ``f"{name}.svg"``.
    rdmol_list : list
        Molecules to draw. Each must carry the property named by ``label`` and,
        when highlighting is enabled, the property named by ``highlight_prop``.
    subImgSize : tuple
        (width, height) of one grid panel, passed to ``MolDraw2DSVG`` as the panel size.
    legendFontSize : int
        Value assigned to the drawer's ``legendFontSize`` draw option.
    molsPerRow : int
        Number of panels per grid row.
    label : str
        Name of the molecule property used as each panel's legend.
    highlight_bonds : bool
        When truthy, atoms flagged with '1' in ``highlight_prop`` are highlighted,
        together with every bond whose two end atoms are both flagged.
        When falsy, nothing is highlighted.
    highlight_prop : str
        Name of the molecule property holding a string of '1'/'0' characters,
        one per atom.

    Returns
    -------
    None
        If ``rdmol_list`` is empty, nothing is drawn and no file is written.
    """
    # Nothing to draw: skip building a zero-height canvas and write no file
    if len(rdmol_list) == 0:
        return

    legends = [mol.GetProp(label) for mol in rdmol_list]

    # A partially filled last row still needs a full row of panels
    nRows = len(rdmol_list) // molsPerRow
    if len(rdmol_list) % molsPerRow:
        nRows += 1
    fullSize = (molsPerRow * subImgSize[0], nRows * subImgSize[1])

    d2d = rdMolDraw2D.MolDraw2DSVG(
        fullSize[0], fullSize[1], subImgSize[0], subImgSize[1]
    )
    d2d.drawOptions().legendFontSize = legendFontSize

    if highlight_bonds:
        highlight_atoms_vals = []
        highlight_bonds_vals = []
        for rdmol in rdmol_list:
            # Positions of '1' characters in the stored mask are the flagged atom indices
            flagged_atoms = [
                idx for idx, char in enumerate(rdmol.GetProp(highlight_prop)) if char == "1"
            ]
            flagged_lookup = set(flagged_atoms)
            highlight_atoms_vals.append(flagged_atoms)
            # A bond is highlighted only when both of its end atoms are flagged
            highlight_bonds_vals.append(
                [
                    bond.GetIdx()
                    for bond in rdmol.GetBonds()
                    if bond.GetBeginAtomIdx() in flagged_lookup
                    and bond.GetEndAtomIdx() in flagged_lookup
                ]
            )
    else:
        # Fix: reset the bond highlight LIST (not the highlight_bonds flag) so that
        # no highlight arguments are passed when highlighting is disabled.
        highlight_atoms_vals = None
        highlight_bonds_vals = None

    d2d.DrawMolecules(
        rdmol_list,
        highlightAtoms=highlight_atoms_vals,
        highlightBonds=highlight_bonds_vals,
        legends=legends,
    )
    d2d.FinishDrawing()

    with open(f"{name}.svg", "w") as f:
        f.write(d2d.GetDrawingText())

def create_rdlist(
        df: pd.DataFrame,
        label_col :str,
        smi_col: str,
        pkl_name = None):
    """
    Build a list of named ``PropertyMol`` objects from a table of labels and SMILES.

    For every index entry of ``df`` the value in ``label_col`` is mapped to the
    value in ``smi_col``. Because the mapping is a dict, a repeated label keeps
    its first position but ends up with the last SMILES seen. Each SMILES is
    parsed with ``Chem.MolFromSmiles``, wrapped in ``PropertyMol`` and given the
    label (formatted as a string) as its ``_Name`` property.

    When ``pkl_name`` is truthy the resulting list is also pickled to that path.
    The list is returned in either case.
    """
    # Label -> SMILES lookup, filled in the order of df.index
    smiles_by_label = {}
    for row in df.index:
        smiles_by_label[df[label_col][row]] = df[smi_col][row]

    molecules = []
    for label, smiles in smiles_by_label.items():
        named_mol = PropertyMol(Chem.MolFromSmiles(smiles))
        named_mol.SetProp("_Name", f'{label}')
        molecules.append(named_mol)

    # Optional on-disk copy of the list
    if pkl_name:
        with open(f'{pkl_name}', 'wb') as handle:
            pickle.dump(molecules, handle)

    return molecules

In [4]:
# Load the database table from the working directory
DB_df = pd.read_csv('SAD_Database.csv')

# Row count; only referenced by the commented-out pickle name further down
num = len(DB_df)

# Order the rows by the integer suffix of 'Reactant ID' (see sort_ids), then renumber
# the index from zero so the sorted frame can be walked in order when building the molecule list
reactant_numbers = np.vectorize(sort_ids)(DB_df['Reactant ID'])
prod_argsort = reactant_numbers.argsort()
sort_react_df = DB_df.iloc[prod_argsort].reset_index(drop=True)

# One named PropertyMol per product, in the sorted order
prd = create_rdlist(
    sort_react_df,
    label_col='Product ID',
    smi_col='Product SMILES',
    # pkl_name = f'SAD_{num}_Entries_Products.pkl'
)

final_mols = []

print(f'There are {len(prd)} diols')

# Accumulators shared with the filtering cells that follow
rem_mols = []
filter1_diols = []
final_diol_mols = []

There are 987 diols


In [5]:
#Check 1
# NOTE: rem_mols, filter1_diols and final_diol_mols are created in the previous cell,
# so re-running only this cell appends the same molecules a second time.
for rdmol in prd:
    maf_mol = maf(rdmol)
    # An atom stays flagged only if all three masks agree: sp3_type, carbon_type()
    # and the 'CO' SMARTS query.
    # presumably each maf_mol method returns a per-atom boolean array — confirm against molli's atom_filter
    mol_bool = sp3_type(rdmol) & maf_mol.carbon_type() & maf_mol.smarts_query('CO')
    # Accept only molecules with exactly two flagged atoms that also pass full_check
    # (which returns 0 on success). full_check is not called for other atom counts.
    if np.count_nonzero(mol_bool) == 2 and not full_check(rdmol, mol_bool, prop_value="_Diol"):
        final_diol_mols.append(rdmol)
        filter1_diols.append(rdmol)
    else:
        rem_mols.append(rdmol)

# update_visualize_mols('Filter1', filter1_diols, highlight_bonds=True, highlight_prop="_Diol")
# The summary reports diols: the molecules collected here are the diol products
print(f"Check 1 has found {len(filter1_diols)} diols")
print(f"There are {len(rem_mols)} remaining")

  sp3_bool = np.in1d(atoms_array, sp3)


Check 1 has found 363 alkenes
There are 624 remaining


In [6]:
rem_mols_2 = list()
filter2_diols = list()
#Check 2
# NOTE: final_diol_mols is extended in place, so re-running only this cell
# appends the same molecules a second time.
for rdmol in rem_mols:
    maf_mol = maf(rdmol)
    # An atom stays flagged only if every positive mask is True for it and none of
    # the negated (~) SMARTS masks is.
    # presumably each maf_mol method returns a per-atom boolean array — confirm against molli's atom_filter
    mol_bool = (
        sp3_type(rdmol)
        & maf_mol.carbon_type() 
        & maf_mol.in_1_ring()
        & maf_mol.het_neighbors_1()
        & ~maf_mol.smarts_query("[OR1X2][CR1X4]*[NR1X3][CR1X3](=O)")
        & ~maf_mol.smarts_query("COC")
        & ~maf_mol.smarts_query("cOC")
        & ~maf_mol.smarts_query("OCC*N")
        & ~maf_mol.smarts_query("OCCN")
    )
    # Accept only molecules with exactly two flagged atoms that also pass full_check
    # (which returns 0 on success). full_check is not called for other atom counts.
    if np.count_nonzero(mol_bool) == 2 and not full_check(rdmol, mol_bool, prop_value="_Diol"):
        final_diol_mols.append(rdmol)
        filter2_diols.append(rdmol)
    else:
        rem_mols_2.append(rdmol)

# update_visualize_mols('Filter2', filter2_diols, highlight_bonds=True, highlight_prop="_Diol")
# The summary reports diols: the molecules collected here are the diol products
print(f"Check 2 has found {len(filter2_diols)} diols")
print(f"There are {len(rem_mols_2)} remaining")


  sp3_bool = np.in1d(atoms_array, sp3)


Check 2 has found 26 alkenes
There are 598 remaining


In [7]:
rem_mols_3 = list()
filter3_diols = list()
#Check 3
# NOTE: final_diol_mols is extended in place, so re-running only this cell
# appends the same molecules a second time.
for rdmol in rem_mols_2:
    maf_mol = maf(rdmol)
    # An atom stays flagged only if every positive mask is True for it and none of
    # the negated (~) SMARTS masks is.
    # presumably each maf_mol method returns a per-atom boolean array — confirm against molli's atom_filter
    mol_bool = (
        sp3_type(rdmol)
        & maf_mol.carbon_type()
        & maf_mol.smarts_query("[OHX2][CH2][CX4][OHX2]")
        & ~maf_mol.smarts_query("[OHX2][CH2][CX4](C[OH])[OHX2]")
        & ~maf_mol.smarts_query("[OR1X2][CR1X4]*[NR1X3][CR1X3](=O)")
        & ~maf_mol.smarts_query("COC")
        & ~maf_mol.smarts_query("cOC")
        & ~maf_mol.smarts_query("OCC*N")
        & ~maf_mol.smarts_query("OCCN")
    )
    # Accept only molecules with exactly two flagged atoms that also pass full_check
    # (which returns 0 on success). full_check is not called for other atom counts.
    if np.count_nonzero(mol_bool) == 2 and not full_check(rdmol, mol_bool, prop_value="_Diol"):
        final_diol_mols.append(rdmol)
        filter3_diols.append(rdmol)
    else:
        rem_mols_3.append(rdmol)

# update_visualize_mols('Filter3', filter3_diols, highlight_bonds=True, highlight_prop="_Diol")
# The summary reports diols: the molecules collected here are the diol products
print(f"Check 3 has found {len(filter3_diols)} diols")
print(f"There are {len(rem_mols_3)} remaining")

  sp3_bool = np.in1d(atoms_array, sp3)


Check 3 has found 158 alkenes
There are 440 remaining


In [8]:
rem_mols_4 = list()
filter4_diols = list()
#Check 4
# NOTE: final_diol_mols is extended in place, so re-running only this cell
# appends the same molecules a second time.
for rdmol in rem_mols_3:
    maf_mol = maf(rdmol)
    # An atom stays flagged only if every positive mask is True for it and none of
    # the negated (~) SMARTS masks is.
    # presumably each maf_mol method returns a per-atom boolean array — confirm against molli's atom_filter
    mol_bool = (
        sp3_type(rdmol)
        & maf_mol.carbon_type()
        & maf_mol.smarts_query("[OHX2][CX4][CX4][OHX2]")
        & ~maf_mol.smarts_query("[OHX2][CX4][CX4](C[OH])[OHX2]")
        & ~maf_mol.smarts_query("[OHX2][CX4](C[OH])[CX4](C[OH])[OHX2]")
        & ~maf_mol.smarts_query("[OHX2][CX4](C[OH])[CX4][OHX2]")
        & ~maf_mol.smarts_query("[OR1X2][CR1X4]*[NR1X3][CR1X3](=O)")
        & ~maf_mol.smarts_query("COC")
        & ~maf_mol.smarts_query("cOC")
        & ~maf_mol.smarts_query("OCC*N")
        & ~maf_mol.smarts_query("OCCN")
    )
    # Accept only molecules with exactly two flagged atoms that also pass full_check
    # (which returns 0 on success). full_check is not called for other atom counts.
    if np.count_nonzero(mol_bool) == 2 and not full_check(rdmol, mol_bool, prop_value="_Diol"):
        final_diol_mols.append(rdmol)
        filter4_diols.append(rdmol)
    else:
        rem_mols_4.append(rdmol)

# update_visualize_mols('Filter4', filter4_diols, highlight_bonds=True, highlight_prop="_Diol")
# The summary reports diols: the molecules collected here are the diol products
print(f"Check 4 has found {len(filter4_diols)} diols")
print(f"There are {len(rem_mols_4)} remaining")

  sp3_bool = np.in1d(atoms_array, sp3)


Check 4 has found 289 alkenes
There are 151 remaining


In [9]:
rem_mols_5 = list()
filter5_diols = list()
#Check 5
# NOTE: final_diol_mols is extended in place, so re-running only this cell
# appends the same molecules a second time.
for rdmol in rem_mols_4:
    maf_mol = maf(rdmol)
    # An atom stays flagged only if all three masks agree: sp3_type, carbon_type()
    # and has_chiraltag.
    # presumably carbon_type() returns a per-atom boolean array — confirm against molli's atom_filter
    mol_bool = (
        sp3_type(rdmol)
        & maf_mol.carbon_type()
        & has_chiraltag(rdmol)
    )
    # Accept only molecules with exactly two flagged atoms that also pass full_check
    # (which returns 0 on success). full_check is not called for other atom counts.
    if np.count_nonzero(mol_bool) == 2 and not full_check(rdmol, mol_bool, prop_value="_Diol"):
        final_diol_mols.append(rdmol)
        filter5_diols.append(rdmol)
    else:
        rem_mols_5.append(rdmol)

# update_visualize_mols('Filter5', filter5_diols, highlight_bonds=True, highlight_prop="_Diol")
# The summary reports diols: the molecules collected here are the diol products
print(f"Check 5 has found {len(filter5_diols)} diols")
print(f"There are {len(rem_mols_5)} remaining")

Check 5 has found 127 alkenes
There are 24 remaining


  sp3_bool = np.in1d(atoms_array, sp3)
  has_chiraltag_bool = np.in1d(atoms_array, has_chiraltag)


In [10]:
rem_mols_6 = list()
filter6_diols = list()
#Check 6
# NOTE: final_diol_mols is extended in place, so re-running only this cell
# appends the same molecules a second time.
for rdmol in rem_mols_5:
    maf_mol = maf(rdmol)
    # An atom stays flagged if it passes sp3_type and carbon_type() and is flagged by
    # EITHER the SMARTS query OR has_chiraltag.
    # presumably each maf_mol method returns a per-atom boolean array — confirm against molli's atom_filter
    mol_bool = (
        sp3_type(rdmol)
        & maf_mol.carbon_type()
        & (maf_mol.smarts_query("[OHX2][CX4][CX4][OHX2]") | has_chiraltag(rdmol))
    )
    # Accept only molecules with exactly two flagged atoms that also pass full_check
    # (which returns 0 on success). full_check is not called for other atom counts.
    if np.count_nonzero(mol_bool) == 2 and not full_check(rdmol, mol_bool, prop_value="_Diol"):
        final_diol_mols.append(rdmol)
        filter6_diols.append(rdmol)
    else:
        rem_mols_6.append(rdmol)

# update_visualize_mols('Filter6', filter6_diols, highlight_bonds=True, highlight_prop="_Diol")
# The summary reports diols: the molecules collected here are the diol products
print(f"Check 6 has found {len(filter6_diols)} diols")
print(f"There are {len(rem_mols_6)} remaining")

Check 6 has found 24 alkenes
There are 0 remaining


  sp3_bool = np.in1d(atoms_array, sp3)
  has_chiraltag_bool = np.in1d(atoms_array, has_chiraltag)


In [None]:
# update_visualize_mols('Remaining_Mols', rem_mols_6, highlight_bonds=False)

# Report how many molecules were accepted across all checks
print(len(final_diol_mols))

# Persist the accepted molecules for later use
with open("1_Database_Diol_Identify.pkl", 'wb') as handle:
    pickle.dump(final_diol_mols, handle)

987
