In [8]:
import molli as ml
from molli.external.rdkit import atom_filter as maf
import pickle
import numpy as np
from rdkit import Chem
from rdkit.Chem.PropertyMol import PropertyMol
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from pprint import pprint
import pandas as pd
import os

def sort_ids(s: str) -> int:
    '''Return the numeric suffix of a reaction ID, for use as a sort key.

    The ID is split on its LAST underscore, so both ``react_517`` and a
    multi-part prefix such as ``react_alt_517`` yield ``517``.

    Raises:
        ValueError: if the ID has no underscore or the suffix is not an integer.
    '''
    # rsplit with maxsplit=1 keeps any underscores in the prefix intact.
    _, suffix = s.rsplit('_', 1)
    return int(suffix)

def connect_check(rdmol, alk_bool: np.ndarray) -> bool:
    """
    Check whether the first two flagged atoms are bonded to each other.

    Args:
        rdmol: molecule object exposing ``GetAtomWithIdx`` and ``GetProp``
            (the ``"_Name"`` property is read for the diagnostic message).
        alk_bool: boolean array; the positions that are True are taken as
            atom indices in ``rdmol``.

    Returns:
        True if the second flagged atom is a neighbor of the first flagged
        atom, False otherwise (a message is printed in the False case).
        Only the first two flagged atoms are examined.
    """
    flagged_idx = [int(i) for i in np.where(alk_bool)[0]]

    # Without two flagged atoms there is no pair to test; report it instead of
    # failing with a bare IndexError.
    if len(flagged_idx) < 2:
        print(f'{rdmol.GetProp("_Name")} has fewer than two flagged atoms')
        return False

    carbon1 = rdmol.GetAtomWithIdx(flagged_idx[0])
    carbon2_idx = flagged_idx[1]

    carbon1_neighbor_atom_idx = {nbr.GetIdx() for nbr in carbon1.GetNeighbors()}
    if carbon2_idx in carbon1_neighbor_atom_idx:
        return True

    print(f'{rdmol.GetProp("_Name")} do not have carbons connecting')
    return False

def update_visualize_mols(
    name: str,
    rdmol_list: list,
    subImgSize=(700, 700),
    legendFontSize=30,
    molsPerRow=5,
    label: str = "_Name",
    highlight_bonds=True,
    highlight_prop="_Alkene",
):
    '''
    Draw a grid of RDKit molecules to ``{name}.svg``, optionally highlighting
    the atoms flagged in a per-molecule property.

    Args:
        name: output path without the ``.svg`` extension.
        rdmol_list: molecules to draw; each must carry the ``label`` property.
        subImgSize: (width, height) of one grid panel.
        legendFontSize: font size used for the legends.
        molsPerRow: number of panels per row.
        label: property whose value is used as each molecule's legend.
        highlight_bonds: when truthy, highlight the flagged atoms and every
            bond whose two ends are both flagged; when falsy, draw without
            any highlighting.
        highlight_prop: property holding a string of characters, one per atom
            position, where ``"1"`` marks a flagged atom. Only read when
            ``highlight_bonds`` is truthy.

    Returns:
        None. Nothing is drawn or written when ``rdmol_list`` is empty.
    '''
    # Nothing to draw: skip building a zero-height canvas.
    if len(rdmol_list) == 0:
        return

    # Formats the legend
    legends = [rdmol.GetProp(label) for rdmol in rdmol_list]

    # Formats the rows and drawing (ceiling division for the row count)
    nRows = len(rdmol_list) // molsPerRow
    if len(rdmol_list) % molsPerRow:
        nRows += 1
    fullSize = (molsPerRow * subImgSize[0], nRows * subImgSize[1])

    d2d = rdMolDraw2D.MolDraw2DSVG(
        fullSize[0], fullSize[1], subImgSize[0], subImgSize[1]
    )
    d2d.drawOptions().legendFontSize = legendFontSize

    if highlight_bonds:
        highlight_atoms_vals = list()
        highlight_bonds_vals = list()

        for rdmol in rdmol_list:
            # Character position in the property string is the atom index.
            sub_atoms = [
                idx
                for idx, flag in enumerate(rdmol.GetProp(highlight_prop))
                if flag == "1"
            ]
            highlight_atoms_vals.append(sub_atoms)

            # A bond is highlighted only when both of its ends are flagged.
            flagged = set(sub_atoms)
            sub_bonds = [
                bond.GetIdx()
                for bond in rdmol.GetBonds()
                if bond.GetBeginAtomIdx() in flagged
                and bond.GetEndAtomIdx() in flagged
            ]
            highlight_bonds_vals.append(sub_bonds)
    else:
        # Both highlight arguments must be disabled. The previous code reset
        # the `highlight_bonds` flag here and left the bond list as `[]`.
        highlight_atoms_vals = None
        highlight_bonds_vals = None

    d2d.DrawMolecules(
        rdmol_list,
        highlightAtoms=highlight_atoms_vals,
        highlightBonds=highlight_bonds_vals,
        legends=legends,
    )
    d2d.FinishDrawing()

    with open(f"{name}.svg", "w") as f:
        f.write(d2d.GetDrawingText())

def create_rdlist(
        df: pd.DataFrame,
        label_col :str,
        smi_col: str,
        pkl_name:str = None):
    '''
    Build a list of named RDKit ``PropertyMol`` objects from a DataFrame.

    Args:
        df: table holding one label and one SMILES string per row.
        label_col: column whose values become each molecule's ``"_Name"``.
        smi_col: column holding the SMILES strings.
        pkl_name: optional path; when given, the resulting list is pickled there.

    Returns:
        List of ``PropertyMol`` objects, one per distinct label. If a label
        occurs on several rows, the SMILES of the last such row is used.

    Raises:
        ValueError: if a SMILES string cannot be parsed by RDKit.
    '''
    # Pair the columns positionally; this avoids chained `df[col][i]` lookups
    # and does not depend on what the DataFrame index looks like.
    label_map = dict(zip(df[label_col], df[smi_col]))

    rdlist = list()
    for label_id, smiles in label_map.items():
        parsed = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; stop here
        # with the offending entry instead of failing obscurely later on.
        if parsed is None:
            raise ValueError(
                f'Could not parse SMILES {smiles!r} for label {label_id!r}'
            )
        rdmol = PropertyMol(parsed)
        rdmol.SetProp("_Name", f'{label_id}')
        rdlist.append(rdmol)

    if pkl_name:
        with open(f'{pkl_name}', 'wb') as f:
            pickle.dump(rdlist, f)

    return rdlist


In [9]:
# Load the reaction table; the columns read below are 'Reactant ID',
# 'Product ID' and 'Product SMILES'.
DB_df = pd.read_csv('SAD_Database.csv')

num = DB_df.shape[0]

# Re-order the dataframe by the numeric suffix of the reactant ID, then reset
# the index so positions follow that order.
react_argsort = np.vectorize(sort_ids)(DB_df['Reactant ID']).argsort()
sort_react_df = DB_df.iloc[react_argsort].reset_index(drop=True)

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project's own earlier steps.
with open('3_DB_Can_RDmols.pkl', 'rb') as f:
    react = pickle.load(f)

# Sanitize in place. A plain loop is used because only the side effect matters.
for _react_mol in react:
    Chem.SanitizeMol(_react_mol)

prd = create_rdlist(
    DB_df,
    label_col='Product ID',
    smi_col='Product SMILES',
)

prd_map = {x.GetProp("_Name"): x for x in prd}

with open('SAD_Database_mult_H.pkl', 'rb') as f:
    mult_alk = pickle.load(f)

# This matches reactant and product IDs (row-wise pairing of the two columns)
react_prod_id_map = dict(zip(DB_df['Reactant ID'], DB_df['Product ID']))

mult_alk_visualize = list()

### The next cell calls `update_visualize_mols` to draw pairs of reactants and products for identification

In [10]:
# update_visualize_mols(
#     name='Paired Multiple',
#     rdmol_list=mult_alk_visualize,
#     molsPerRow=2,
#     highlight_bonds=False,
# )

In [11]:
# Accumulators filled by the labeling loop in the next code cell.
alkene_mols = []
not_connected_alkenes = []
problematic_rdmol = []
vis_name = 'mult_alkenes_label'

# Molecules labeled in an earlier session, keyed by their "_Name" property.
# Stays empty when no cache file exists yet.
done_map = {}
if os.path.exists('SAD_Database_mult_alk_cache_H.pkl'):
    with open('SAD_Database_mult_alk_cache_H.pkl', 'rb') as f:
        done_rd = pickle.load(f)
    done_map = dict((cached.GetProp("_Name"), cached) for cached in done_rd)

### The loop below creates an all-`False` array for each molecule, then sets `True` at specific indices based on the "mult_alk_visualize" visualization

##### Note: The molecules with multiple alkenes were labeled previously, so the cache is used here. Normally this step would be run as a Python script, with the inputs filled in interactively

In [12]:
# For every molecule with multiple alkenes, build an all-False array and set
# True at the two atom positions typed in by the user (read off the
# "mult_alkenes" visualization). Molecules found in the cache are reused.
for rdmol in mult_alk:
    # NOTE(review): `mol` is not read anywhere in this cell; kept in case a
    # later cell relies on it - confirm and remove if unused.
    mol = maf(rdmol)
    n_atoms = len(rdmol.GetAtoms())
    mol_bool = np.full((n_atoms,), fill_value=False)
    name = rdmol.GetProp("_Name")

    # This prevents repeats
    if name in done_map:
        print(f'Reactant {name} done, skipping')
        alkene_mols.append(done_map[name])
        continue

    # The inputs are used here as they are labeled in the visualization
    print(f'For reactant: {name}')
    try:
        # Parsed with int() rather than eval(): the labels are plain integers
        # and eval() would execute whatever text is typed in.
        # Labels are 1-based; atom indices start at 0.
        c1 = int(input('What is the value of the first carbon? ')) - 1
        c2 = int(input('What is the value of the second carbon? ')) - 1

        # Reject out-of-range labels explicitly. An entry of 0 would otherwise
        # become index -1 and silently flag the last atom.
        for atom_idx in (c1, c2):
            if not 0 <= atom_idx < n_atoms:
                raise IndexError(
                    f'Atom label {atom_idx + 1} is outside the range 1-{n_atoms}'
                )
        if c1 == c2:
            raise ValueError('The two carbons must be different atoms')

        mol_bool[c1] = True
        mol_bool[c2] = True
    except Exception as e:
        print(e)
        # Save what has been labeled so far so that a restart can resume.
        with open('SAD_Database_mult_alk_cache_H.pkl', 'wb') as f:
            pickle.dump(alkene_mols, f)
        raise ValueError('Error detected, restart with existing cache') from e

    # Stores as property of alkene: one '1'/'0' character per atom position
    rdmol.SetProp("_Alkene_w_H", "".join('1' if v else '0' for v in mol_bool))
    original_array = np.array([True if v == '1' else False for v in rdmol.GetProp("_Alkene_w_H")])

    # Runs checks on the carbons found: first that the property round-trips to
    # the same boolean array, then that the two flagged atoms are bonded.
    if np.array_equal(original_array, mol_bool):
        if connect_check(rdmol, original_array):
            alkene_mols.append(rdmol)
        else:
            not_connected_alkenes.append(rdmol)
    else:
        print(
            f'{rdmol.GetProp("_Name")} did not correctly return alkene boolean, appended to problematic mol object list'
        )
        problematic_rdmol.append(rdmol)

Reactant react_517 done, skipping
Reactant react_564 done, skipping
Reactant react_560 done, skipping
Reactant react_621 done, skipping
Reactant react_575 done, skipping
Reactant react_558 done, skipping
Reactant react_556 done, skipping
Reactant react_561 done, skipping
Reactant react_443 done, skipping
Reactant react_563 done, skipping
Reactant react_608 done, skipping
Reactant react_607 done, skipping
Reactant react_611 done, skipping
Reactant react_437 done, skipping
Reactant react_610 done, skipping
Reactant react_601 done, skipping
Reactant react_477 done, skipping
Reactant react_346 done, skipping
Reactant react_505 done, skipping
Reactant react_516 done, skipping
Reactant react_609 done, skipping
Reactant react_612 done, skipping
Reactant react_565 done, skipping
Reactant react_569 done, skipping
Reactant react_627 done, skipping
Reactant react_512 done, skipping
Reactant react_555 done, skipping
Reactant react_568 done, skipping
Reactant react_604 done, skipping
Reactant react

In [13]:
# Load the molecules from 'SAD_Database_alk_no_mult_H.pkl', save the ones
# labeled above as an intermediate result, then merge both into one list.
with open('SAD_Database_alk_no_mult_H.pkl', 'rb') as f:
    no_mult = pickle.load(f)

with open('SAD_Database_alk_mult_h.pkl', 'wb') as f:
    pickle.dump(alkene_mols, f)

final_mols = no_mult + alkene_mols

# Lookup by the "_Name" property; a repeated name keeps the later entry.
final_mol_map = dict((entry.GetProp("_Name"), entry) for entry in final_mols)

with open('3_Database_Alkene_H_Identify.pkl', 'wb') as f:
    pickle.dump(final_mols, f)

In [14]:
# `mlib` is the library that is read; `mlib_prop` is the library that receives
# the copies (opened with overwrite=True, readonly=False).
mlib = ml.MoleculeLibrary('3_DB_Can_MLmols.mlib')
mlib_prop = ml.MoleculeLibrary('4_DB_MLmols_w_H.mlib', overwrite=True, readonly=False)

# Copy every entry of `mlib` into `mlib_prop`, adding two attributes taken from
# the RDKit molecule with the same key in `final_mol_map`.
# A key missing from `final_mol_map` raises KeyError.
with mlib.reading(), mlib_prop.writing():
    for key in mlib:
        m = mlib[key]
        rdmol_h = final_mol_map[key]
        # canonical=False is deliberate: in a few cases re-running canonicalization
        # on an already-canonicalized molecule changes the SMILES.
        smi_h = Chem.MolToSmiles(rdmol_h, canonical=False)
        # String of '1'/'0' characters, as written by the labeling loop above.
        alk_h = rdmol_h.GetProp("_Alkene_w_H")

        m.attrib['_Canonical_SMILES_H'] = smi_h
        m.attrib['_Alkene_w_H'] = alk_h

        mlib_prop[key] = m