In [61]:
from molli.external import rdkit as mrd
from molli.external.rdkit import atom_filter as maf
import pickle
import numpy as np
from rdkit import Chem
from rdkit.Chem.PropertyMol import PropertyMol
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from pprint import pprint
import pandas as pd

def sort_ids(s: str):
    '''
    Return the integer sort key stored after the last underscore of an ID.

    Parameters
    ----------
    s : str
        Identifier shaped like "<prefix>_<number>". The prefix may itself
        contain underscores; only the text after the final one is parsed.

    Returns
    -------
    int
        The numeric suffix, so IDs sort numerically rather than lexically.

    Raises
    ------
    ValueError
        If `s` has no underscore or the suffix is not an integer literal.
    '''
    # rsplit(..., 1) always yields at most two pieces, so a prefix with
    # extra underscores no longer breaks the two-way unpacking.
    _, suffix = s.rsplit('_', 1)
    return int(suffix)

def connect_check(rdmol, alk_bool: np.ndarray):
    """
    Report whether the first two atoms flagged in `alk_bool` are bonded.

    Parameters
    ----------
    rdmol
        Molecule object exposing `GetAtomWithIdx` and `GetProp`.
    alk_bool : np.ndarray
        Boolean mask over atom indices; only the first two True positions
        are examined.

    Returns
    -------
    bool
        True when the second flagged atom is a neighbor of the first.
        Otherwise a message naming the molecule is printed and False is
        returned.
    """
    # List of atom objects at the positions where the mask is True
    flagged_atoms = [
        rdmol.GetAtomWithIdx(int(position)) for position in np.nonzero(alk_bool)[0]
    ]
    first_carbon, second_carbon = flagged_atoms[0], flagged_atoms[1]

    # Indices of every atom directly bonded to the first flagged atom
    first_neighbor_ids = [nbr.GetIdx() for nbr in first_carbon.GetNeighbors()]

    if second_carbon.GetIdx() not in first_neighbor_ids:
        print(f'{rdmol.GetProp("_Name")} do not have carbons connecting')
        return False
    return True

def update_visualize_mols(
    name: str,
    rdmol_list: list,
    subImgSize=(700, 700),
    legendFontSize=30,
    molsPerRow=5,
    label: str = "_Name",
    highlight_bonds=True,
    highlight_prop="_Alkene",
):
    '''
    Draw a grid of molecules to `{name}.svg`, optionally highlighting the
    atoms flagged in a "0"/"1" string property and the bonds between them.

    Parameters
    ----------
    name : str
        Output path without extension; ".svg" is appended.
    rdmol_list : list
        Molecules to draw. Each must carry the `label` property and, when
        highlighting is on, the `highlight_prop` property.
    subImgSize : tuple
        (width, height) of one grid panel.
    legendFontSize : int
        Font size assigned to the drawer's legend option.
    molsPerRow : int
        Number of panels per grid row.
    label : str
        Property whose value is used as each panel's legend.
    highlight_bonds : bool
        When true, highlight flagged atoms and every bond whose two ends are
        both flagged. When false, nothing is highlighted.
    highlight_prop : str
        Property holding one "0"/"1" character per atom index.

    Returns
    -------
    None
        Nothing is drawn or written when `rdmol_list` is empty.
    '''
    # An empty grid would need a zero-height canvas; there is nothing to
    # write, so stop before any drawing object is created.
    if len(rdmol_list) == 0:
        return

    # Formats the legend
    legends = [mol.GetProp(label) for mol in rdmol_list]

    # Formats the rows and drawing (ceiling division for the row count)
    nRows, leftover = divmod(len(rdmol_list), molsPerRow)
    if leftover:
        nRows += 1
    fullSize = (molsPerRow * subImgSize[0], nRows * subImgSize[1])

    d2d = rdMolDraw2D.MolDraw2DSVG(
        fullSize[0], fullSize[1], subImgSize[0], subImgSize[1]
    )
    d2d.drawOptions().legendFontSize = legendFontSize

    if highlight_bonds:
        highlight_atoms_vals = list()
        highlight_bonds_vals = list()

        for rdmol in rdmol_list:
            # Atom indices whose character in the property string is "1"
            flags = rdmol.GetProp(highlight_prop)
            sub_atoms = [idx for idx, flag in enumerate(flags) if flag == "1"]
            highlight_atoms_vals.append(sub_atoms)

            # A bond is highlighted only when both of its ends are flagged
            flagged = set(sub_atoms)
            sub_bonds = [
                bond.GetIdx()
                for bond in rdmol.GetBonds()
                if bond.GetBeginAtomIdx() in flagged
                and bond.GetEndAtomIdx() in flagged
            ]
            highlight_bonds_vals.append(sub_bonds)
    else:
        # Both highlight arguments must be cleared; previously the flag
        # itself was overwritten and an empty bond list was still passed on.
        highlight_atoms_vals = None
        highlight_bonds_vals = None

    d2d.DrawMolecules(
        rdmol_list,
        highlightAtoms=highlight_atoms_vals,
        highlightBonds=highlight_bonds_vals,
        legends=legends,
    )
    d2d.FinishDrawing()

    with open(f"{name}.svg", "w") as f:
        f.write(d2d.GetDrawingText())

def alkene_filter(
    rdmol_list: list,
    methods: list[str] = list(),
    prop_value: str = "_Alkene",
    visualize: bool = True,
    vis_name="Filter",
):
    '''
    Split molecules into those where a set of atom filters isolates exactly
    two bonded atoms, and everything else.

    For each molecule an atom-filter object is built with `maf`, every entry
    of `methods` is evaluated as an attribute call on it, and the results are
    combined with a logical AND. (Presumably each call returns a per-atom
    boolean array -- confirm against the molli atom_filter interface.)

    Parameters
    ----------
    rdmol_list : list
        Molecules to screen. Each needs `SetProp`/`GetProp` and a "_Name"
        property.
    methods : list[str]
        Call expressions such as "carbon_type()" or "smarts_query('C=C')".
        A leading "~" negates that filter. Only a "~" in the first position
        counts, so SMARTS patterns may contain "~" themselves. The default
        list is never mutated.
    prop_value : str
        Property under which the combined mask is stored as a "0"/"1" string.
    visualize : bool
        When true, non-empty result groups are drawn with
        `update_visualize_mols`.
    vis_name : str
        Base name for the drawings and the printed summary.

    Returns
    -------
    tuple[list, list]
        `(remaining_mols, alkene_mols)`. Molecules whose two flagged atoms
        are not bonded, or whose stored mask does not read back identically,
        are part of `remaining_mols`.

    Raises
    ------
    AssertionError
        If the two returned lists do not account for every input molecule.
    '''
    remaining_mols = list()
    alkene_mols = list()
    problematic_rdmol = list()
    not_connected_alkenes = list()

    for rdmol in rdmol_list:
        maf_mol = maf(rdmol)

        masks = list()
        for method in methods:
            # Only a tilde in the first position means "negate"; "~" is also
            # the SMARTS any-bond symbol and may appear inside a query.
            negate = method.startswith("~")
            call = method[1:] if negate else method
            # NOTE(security): eval runs the method string as Python code.
            # Only pass trusted, hand-written filter expressions.
            mask = eval(f"maf_mol.{call}", {}, {"maf_mol": maf_mol})
            masks.append(~mask if negate else mask)
        af_filter = np.logical_and.reduce(masks)

        # Anything other than exactly two flagged atoms is left for a later filter
        if np.count_nonzero(af_filter) != 2:
            remaining_mols.append(rdmol)
            continue

        rdmol.SetProp(prop_value, "".join("1" if v else "0" for v in af_filter))
        recall_af_filter = np.array(
            [v == "1" for v in rdmol.GetProp(prop_value)]
        )

        # Tests to make sure the array property is set and returned properly
        if not np.array_equal(recall_af_filter, af_filter):
            print(
                f'{rdmol.GetProp("_Name")} did not correctly return alkene boolean, appended to problematic mol object list'
            )
            remaining_mols.append(rdmol)
            problematic_rdmol.append(rdmol)
            continue

        if connect_check(rdmol, af_filter):
            alkene_mols.append(rdmol)
        else:
            remaining_mols.append(rdmol)
            not_connected_alkenes.append(rdmol)

    # This can be used to visualize the molecules isolated. Each group is
    # guarded by its own length (the problematic group used to be guarded by
    # the not-connected list).
    if visualize:
        groups = (
            ("", alkene_mols),
            ("_not_connected", not_connected_alkenes),
            ("_problematic", problematic_rdmol),
        )
        for suffix, group in groups:
            if len(group) != 0:
                update_visualize_mols(
                    f"{vis_name}{suffix}",
                    group,
                    highlight_bonds=True,
                    highlight_prop=prop_value,
                )

    # This confirms no alkenes are lost. Problematic and not-connected
    # molecules already sit in remaining_mols, so they must not be added a
    # second time.
    all_len = len(remaining_mols) + len(alkene_mols)
    assert all_len == len(
        rdmol_list
    ), f"Not correct! Current length = {all_len}, Original length = {len(rdmol_list)}"

    # These alkenes need to be rerun
    print(f"{vis_name} has found {len(alkene_mols)} alkenes")
    print(f"There are {len(remaining_mols)} remaining")
    return remaining_mols, alkene_mols

def create_rdlist(
        df: pd.DataFrame,
        label_col :str,
        smi_col: str,
        pkl_name: str|None):
    
    rdlist = list()

    #This creates a dictionary associated with a label and smiles string
    label_map = {df[label_col][i] : df[smi_col][i] for i in df.index}
    
    for label_id in label_map:
        rdmol = PropertyMol(Chem.MolFromSmiles(label_map[label_id]))
        rdmol.SetProp("_Name", f'{label_id}')
        rdlist.append(rdmol)
    
    if pkl_name:
        with open(f'{pkl_name}', 'wb') as f:
            pickle.dump(rdlist, f)

    return rdlist


In [62]:
DB_df = pd.read_csv('SAD_Database.csv')

num = DB_df.shape[0]

# This re-orders the dataframe by the integer suffix of the reactant ID and
# then resets the index.
# NOTE(review): sort_react_df is never passed to create_rdlist below, so
# `react` and `prd` follow the CSV row order, not this sorted order --
# confirm which order is intended before relying on it.
react_argsort = np.vectorize(sort_ids)(DB_df['Reactant ID']).argsort()
sort_react_df = DB_df.iloc[react_argsort]
sort_react_df = sort_react_df.reset_index(drop=True)

react = create_rdlist(
    DB_df,
    label_col = 'Reactant ID',
    smi_col = 'Reactant SMILES',
    pkl_name = 'SAD_Database_Reactants.pkl'
)

prd = create_rdlist(
    DB_df,
    label_col = 'Product ID',
    smi_col = 'Product SMILES',
    pkl_name = 'SAD_Database_Products.pkl'
)

# Accumulates the molecules accepted by the filter cells that follow
final_mols = list()

# This identifies the names of reactants with multiple alkenes: rows whose
# 'Additional Notes' mention 'multiple alkenes' (missing notes count as empty)
adn = DB_df[['Reactant ID','Additional Notes']]
mult_alk_names = adn[adn['Additional Notes'].fillna('').str.contains('multiple alkenes')]['Reactant ID'].unique()

# Partition on the molecule name with a set lookup. The previous second pass
# scanned the multiple_alkenes list once per reactant, which is quadratic and
# selects the same molecules.
mult_alk_name_set = set(mult_alk_names)
multiple_alkenes = [x for x in react if x.GetProp("_Name") in mult_alk_name_set]
react = [x for x in react if x.GetProp("_Name") not in mult_alk_name_set]

print(f'There are {len(multiple_alkenes)} alkenes with multiple possible alkenes')
print(f'There are {len(react)} alkenes')
print(f'There are {len(prd)} diols')

There are 73 alkenes with multiple possible alkenes
There are 711 alkenes
There are 987 diols


In [63]:
# Check 1: first pass, run on the reactants that were not set aside as
# multi-alkene; accepted molecules are collected in final_mols.
rem_mol, alk_mol = alkene_filter(
    react,
    [
        "sp2_type()",
        "carbon_type()",
    ],
    visualize=False,
    vis_name="Filter1",
)
final_mols += alk_mol

Filter1 has found 124 alkenes
There are 587 remaining


In [64]:
# Check 2: re-screen the molecules left over from check 1; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "in_1_ring()",
        "het_neighbors_0()",
        "~aromatic_type()",
    ],
    visualize=False,
    vis_name="Filter2",
)
final_mols += alk_mol

Filter2 has found 36 alkenes
There are 551 remaining


In [65]:
# Check 3: re-screen the molecules left over from check 2; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "het_neighbors_0()",
        "~aromatic_type()",
    ],
    visualize=False,
    vis_name="Filter3",
)
final_mols += alk_mol


Filter3 has found 468 alkenes
There are 83 remaining


In [66]:
# Check 4: re-screen the molecules left over from check 3; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "het_neighbors_1()",
        "~in_1_ring()",
        "~aromatic_type()",
        "~smarts_query('[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')",
    ],
    visualize=False,
    vis_name="Filter4",
)
final_mols += alk_mol


Filter4 has found 0 alkenes
There are 83 remaining


In [67]:
# Check 5: re-screen the molecules left over from check 4; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "het_neighbors_1()",
        "in_1_ring()",
        "~aromatic_type()",
        "~smarts_query('[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')",
    ],
    visualize=False,
    vis_name="Filter5",
)
final_mols += alk_mol

Filter5 has found 0 alkenes
There are 83 remaining


In [68]:
# Check 6: re-screen the molecules left over from check 5; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "~het_neighbors_3()",
        "in_1_ring()",
        "~aromatic_type()",
        "~smarts_query('[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')",
    ],
    visualize=False,
    vis_name="Filter6",
)
final_mols += alk_mol

Filter6 has found 35 alkenes
There are 48 remaining


In [69]:
# Check 7: re-screen the molecules left over from check 6; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "~het_neighbors_3()",
        "~in_2_rings()",
        "~aromatic_type()",
        "~smarts_query('[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')",
    ],
    visualize=False,
    vis_name="Filter7",
)
final_mols += alk_mol

Filter7 has found 46 alkenes
There are 2 remaining


In [70]:
# Check 8: re-screen the molecules left over from check 7; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "~in_2_rings()",
        "~het_neighbors_3()",
        "aromatic_type()",
        "ring_size5()",
        "~smarts_query('[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')",
        "~smarts_query('c1cscn1')",
    ],
    visualize=False,
    vis_name="Filter8",
)
final_mols += alk_mol

Filter8 has found 0 alkenes
There are 2 remaining


In [71]:
# Check 9: re-screen the molecules left over from check 8; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "~in_2_rings()",
        "~het_neighbors_3()",
        "smarts_query('[NX3][CX3]=[CX3]')",
        "~smarts_query('[$([CX3]=[OX1]),$([CX3+]-[OX1-])]')",
    ],
    visualize=False,
    vis_name="Filter9",
)
final_mols += alk_mol

Filter9 has found 0 alkenes
There are 2 remaining


In [72]:
# Check 10: re-screen the molecules left over from check 9; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "smarts_query('[nX3H][cX3]([CX4])[cX3]([CX4])')",
    ],
    visualize=False,
    vis_name="Filter10",
)
final_mols += alk_mol

Filter10 has found 0 alkenes
There are 2 remaining


In [73]:
# Check 11: re-screen the molecules left over from check 10; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "smarts_query('[CX3]=[CX3][CX4][OH]')",
        "~smarts_query('C=CC=C')",
    ],
    visualize=False,
    vis_name="Filter11",
)
final_mols += alk_mol

Filter11 has found 0 alkenes
There are 2 remaining


In [74]:
# Check 12: re-screen the molecules left over from check 11; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "~het_neighbors_3()",
        "smarts_query('c1[nH]c(=O)oc1')",
    ],
    visualize=False,
    vis_name="Filter12",
)
final_mols += alk_mol

Filter12 has found 0 alkenes
There are 2 remaining


In [75]:
# Check 13: re-screen the molecules left over from check 12; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "~het_neighbors_2()",
        "smarts_query('O=c(o)cc')",
    ],
    visualize=False,
    vis_name="Filter13",
)
final_mols += alk_mol

Filter13 has found 0 alkenes
There are 2 remaining


In [76]:
# Check 14: re-screen the molecules left over from check 13; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "~aromatic_type()",
    ],
    visualize=False,
    vis_name="Filter14",
)
final_mols += alk_mol

Filter14 has found 0 alkenes
There are 2 remaining


In [77]:
# Check 15: re-screen the molecules left over from check 14; accepted
# molecules are added to final_mols.
rem_mol, alk_mol = alkene_filter(
    rem_mol,
    [
        "sp2_type()",
        "carbon_type()",
        "in_2_rings()",
        "smarts_query('cc[n,o]c=O')",
        "~smarts_query('c1ccccc1')",
    ],
    visualize=False,
    vis_name="Filter15",
)
final_mols += alk_mol


Filter15 has found 2 alkenes
There are 0 remaining


In [78]:
# Sweep whatever is still in rem_mol after the last check. A molecule with
# more than two atoms passing sp2_type() & carbon_type() & ~aromatic_type()
# is filed with multiple_alkenes; any other molecule just has its name printed.
for rdmol in rem_mol:
    acmol = maf(rdmol)

    acbool = (
        acmol.sp2_type()
        & acmol.carbon_type()
        & ~acmol.aromatic_type()
    )

    if np.count_nonzero(acbool) <= 2:
        print(rdmol.GetProp("_Name"))
        continue
    multiple_alkenes.append(rdmol)

In [79]:
# Persist both result sets: the accepted single-alkene molecules first, then
# the molecules set aside as having multiple alkenes.
for pkl_path, pkl_payload in (
    ("SAD_Database_alk_no_mult.pkl", final_mols),
    ("SAD_Database_mult.pkl", multiple_alkenes),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(pkl_payload, f)

#Can be used to visualize structures with multiple alkenes
# update_visualize_mols(
#     name='multiple',
#     rdmol_list=multiple_alkenes,
#     highlight_bonds=False,
# )