In [1]:
import pandas as pd
from rdkit import Chem  
from rdkit.Chem import AllChem, Draw
from rdkit.Chem import Descriptors
import os
import sys
sys.path.append('../../../utils')
from preprocessing import is_mol_symmetric, group_symmetric_atoms

In [2]:
# Collect the preprocessed reaction tables: CSV files whose name contains 'df'.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# csvs[0] (and any later iteration over csvs) reproducible across machines.
csvs = sorted(
    x for x in os.listdir('../preprocessed_reactions')
    if x.endswith('.csv') and 'df' in x
)
if not csvs:
    # Fail with an explicit message instead of a bare IndexError on csvs[0].
    raise FileNotFoundError(
        "No CSV files containing 'df' found in ../preprocessed_reactions"
    )

# Unique reactant SMILES, taken from the first table only.
# NOTE(review): this assumes every table lists the same reactants - confirm.
df_ = pd.read_csv('../preprocessed_reactions/' + csvs[0], index_col=0)
smiles = df_.Reactant_SMILES.unique()

# Discard the SMILES that have two or more stereocenters of which at least one is unassigned.
# Racemic mixtures (a single unassigned stereocenter) are OK, but we discard
# non-specific diastereoisomers.
def discard(smi):
    """Decide whether a reactant SMILES is stereochemically ambiguous.

    A molecule is discarded only when it has two or more stereocenters and
    at least one of them is unassigned. Molecules with zero or one
    stereocenter are always kept, as are molecules whose stereocenters are
    all assigned.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule to inspect.

    Returns
    -------
    bool
        True if the molecule should be discarded, False if it is a keeper.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi``.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; report the
        # offending SMILES instead of failing later with an opaque error.
        raise ValueError("Could not parse SMILES: {!r}".format(smi))

    # Each entry is (atom_index, label); unassigned centers carry the label "?".
    centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)

    # Zero or one stereocenter: nothing ambiguous between diastereoisomers.
    if len(centers) < 2:
        return False

    # TODO: symmetric molecules with unassigned centers are discarded like any
    # other for now; whether they should be kept instead is still undecided.
    return any(label == "?" for _, label in centers)
            
# Partition the reactants in a single pass: discard() is evaluated once per
# SMILES and no list-membership scan is needed to build the discarded list.
keepers, discarded = [], []
for smi in smiles:
    (discarded if discard(smi) else keepers).append(smi)

print("Keepers: ", len(keepers))
print("Discarded: ", len(discarded))

#Draw.MolsToGridImage([Chem.MolFromSmiles(x) for x in keepers], molsPerRow=5, subImgSize=(200,200), legends=[x for x in keepers])

Keepers:  199
Discarded:  56


In [4]:
# Build one dot-separated SMILES string from the kept molecules that have
# fewer than 15 atoms (as counted by GetNumAtoms), in sorted order.
# Every entry, including the last one, is followed by a '.'.
smi_d = ''.join(
    kept + '.'
    for kept in sorted(keepers)
    if Chem.MolFromSmiles(kept).GetNumAtoms() < 15
)

print(smi_d)

BrC12CC3CC(CC(C3)C1)C2.Brc1ccc(C2CCC2)cc1.C/C(=C/CC(C)O)[N+](=O)[O-].C1C2C3[C@H]4C1[C@@H]([C@@H]23)[C@@H]1C2CC3[C@@H]1[C@H]3[C@H]24.C1C2CC3CC1CC(C2)C3.C1C2CC3CC1CC(C2)C31CC1.C1C2C[C@H]3CC(C[C@@H]1C3)[C@@]21CO1.C1CC2CCC1C2.C1CC2CCC1CC2.C1CCC2(CC1)CC2.C1CCC[C@H]2C[C@H]2CC1.C1CCOC1.C1CCOCC1.C1CC[C@@H]2CCC[C@H]2C1.C1CC[C@H]2CCCC[C@@H]2C1.C1CC[C@H]2CCCC[C@H]2C1.C1CC[C@H]2O[C@H]2C1.C1CN2CCC1CC2.C1C[C@H]2C[C@@H]12.CC(=O)C1CCCC1=O.CC(=O)C1CCOC1=O.CC(=O)NC1[C@H]2C[C@@H]3C[C@@H](C[C@H]1C3)C2.CC(=O)OC1[C@H]2C[C@@H]3C[C@@H](C[C@H]1C3)C2.CC(=O)OCCC(C)CCCC(C)C.CC(=O)O[C@@H]1C[C@H](C)CC[C@H]1C(C)C.CC(=O)O[C@H]1CCC[C@@H]2CCCC[C@@H]21.CC(C)(C)C1CCC(O)CC1.CC(C)C(C)C.CC(C)C1CC1.CC(C)CC(C)(C)O.CC(C)CCC(C)(C)C.CC(C)CCC(C)(C)O.CC(C)CCC(C)N.CC(C)CCC1CC1.CC(C)CCN.CC(C)CCOC(=O)c1ccccc1.CC(C)Cc1ccccc1.CC(C)O.CC(C)c1ccccc1.CC(O)CCCCO.CC(O)CCCO.CC(O)CCO.CC(O)c1cc2ccccc2o1.CC(O)c1ccc(Br)cc1.CC(O)c1ccc(C#N)cc1.CC(O)c1ccc(Cl)cc1.CC(O)c1ccc(F)cc1.CC(O)c1ccccc1.CC1(C)OC12C1CC3CC(C1)CC2C3.CC1(C)[C@@H]2CC[C@@]1(C)[C@@H]

In [5]:
# For every preprocessed table: keep only the rows whose reactant is a keeper,
# renumber the index from 0, and write the result under the same file name
# into the current working directory.
# NOTE(review): the saved output of this cell shows a count per file, but
# nothing here prints - the output looks stale; confirm on a fresh run.
for csv in csvs:
    df = (
        pd.read_csv('../preprocessed_reactions/' + csv, index_col=0)
        .loc[lambda frame: frame.Reactant_SMILES.isin(keepers)]
        .reset_index(drop=True)
    )
    df.to_csv(csv)

199
199
199
199
199
199
199
199
199
199
