In [8]:
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit.Chem import AllChem as Chem
from rdkit import RDLogger
# Disable the RDKit log named 'rdApp.info' for this kernel session
# (presumably to keep informational messages out of the cell output).
RDLogger.DisableLog('rdApp.info')

# CSV files that can be passed to make_report(); train/test splits of the
# two datasets, listed in a fixed order (the last entry is used below).
paths = [
    '../model/data/o3f.train.csv',
    '../model/data/o3f.test.csv',
    '../model/data/screening-data.train.csv',
    '../model/data/screening-data.test.csv',
]

# Names of the descriptor functions evaluated by calc_chemprops(). Each name
# is looked up on the `Chem` module and called with a single molecule; the
# name also becomes the key under which the value is stored.
properties = [
    # surface area / mass
    'CalcLabuteASA',
    'CalcTPSA',
    'CalcExactMolWt',
    # hydrogen-bond donors / acceptors
    'CalcNumLipinskiHBD',
    'CalcNumLipinskiHBA',
    'CalcNumHBD',
    'CalcNumHBA',
    # flexibility
    'CalcNumRotatableBonds',
    # ring counts
    'CalcNumRings',
    'CalcNumAromaticRings',
    'CalcNumSaturatedRings',
    'CalcNumHeterocycles',
    'CalcNumAromaticHeterocycles',
    'CalcNumAromaticCarbocycles',
    'CalcNumSaturatedHeterocycles',
    'CalcNumSaturatedCarbocycles',
    'CalcNumAliphaticRings',
    'CalcNumAliphaticHeterocycles',
    'CalcNumAliphaticCarbocycles',
    # composition
    'CalcNumHeteroatoms',
    'CalcNumAmideBonds',
    'CalcFractionCSP3',
]

def calc_chemprops(mol):
    """Evaluate every descriptor listed in ``properties`` on one molecule.

    Each name in the module-level ``properties`` list is looked up in the
    namespace of the ``Chem`` module and the resulting callable is invoked
    with ``mol`` as its only argument.

    Parameters
    ----------
    mol
        The molecule object handed unchanged to each descriptor function.

    Returns
    -------
    dict
        Maps descriptor name -> computed value, in ``properties`` order.
    """
    # vars(module) is the module's __dict__, i.e. the same lookup table the
    # descriptor names are resolved against.
    descriptor_fns = vars(Chem)
    return {name: descriptor_fns[name](mol) for name in properties}

def make_report(path):
    """Build a table of chemical descriptors for each unique SMILES in a CSV.

    Parameters
    ----------
    path : str or path-like
        CSV file readable by ``pandas.read_csv``. Only its ``smiles`` column
        is read.

    Returns
    -------
    pandas.DataFrame
        One row per unique SMILES string (used as the index) and one column
        per descriptor name in the module-level ``properties`` list.
    """
    df = pd.read_csv(path)
    # Descriptors depend only on the molecule, so compute each one once per
    # distinct SMILES string rather than once per CSV row.
    usmiles = df['smiles'].unique()
    mols = [Chem.MolFromSmiles(smiles) for smiles in usmiles]
    # NOTE(review): MolFromSmiles yields None for a SMILES it cannot parse;
    # such an entry is passed on unchanged and will make calc_chemprops fail.
    chemprops = {}
    for smiles, mol in tqdm(zip(usmiles, mols), total=len(mols)):
        chemprops[smiles] = calc_chemprops(mol)
    # The dict-of-dicts is keyed by SMILES, which pandas turns into columns;
    # transpose so that each SMILES becomes a row instead.
    # Returning the frame is the fix: previously it was built and discarded,
    # so callers received None.
    return pd.DataFrame(chemprops).T
    
    
# Run the report on the last entry of `paths`
# ('../model/data/screening-data.test.csv') and keep the result in `x`.
x = make_report(paths[-1])
# Bare trailing expression: the notebook renders whatever value `x` holds
# (i.e. whatever make_report returned) as this cell's output.
x

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 3164.78it/s]


Unnamed: 0,CalcLabuteASA,CalcTPSA,CalcExactMolWt,CalcMolFormula,CalcNumLipinskiHBD,CalcNumLipinskiHBA,CalcNumHBD,CalcNumHBA,CalcNumRotatableBonds,CalcNumRings,...,CalcNumAromaticHeterocycles,CalcNumAromaticCarbocycles,CalcNumSaturatedHeterocycles,CalcNumSaturatedCarbocycles,CalcNumAliphaticRings,CalcNumAliphaticHeterocycles,CalcNumAliphaticCarbocycles,CalcNumHeteroatoms,CalcNumAmideBonds,CalcFractionCSP3
"CC1=CC=C(C=C1)S([O-])(=O)=O.CCOCC(O)COC1=CC=C(NC(=O)CC[S+](C)C)C=C1 |c:3,5,31,t:1,19,21|",198.96119,124.99,499.169844,C23H33NO7S2,2,8,2,7,11,2,...,0,2,0,0,0,0,0,10,1,0.434783
"Cl.CN1CCCC(CC1)N1N=C(CC2=CC=C(Cl)C=C2)C2=CC=CC=C2C1=O |c:18,23,25,t:10,13,15,21|",176.079304,38.13,417.137468,C22H25Cl2N3O,0,4,0,4,3,4,...,1,2,1,0,1,1,0,6,0,0.363636
"CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]2O[C@H](C)[C@@H](O)[C@@H]([C@H]2O)N(C)C)[C@@H](CCN2C[C@@H](C)C[C@@H](C)C2)C[C@@H](C)C(=O)\C=C\C(C)=C\[C@@H]1CO[C@@H]1O[C@H](C)[C@@H](O)[C@@H](OC)[C@H]1OC |r,t:43,46|",364.573579,186.15,868.566041,C46H80N2O13,4,15,4,15,12,4,...,0,0,3,0,4,4,0,15,0,0.869565
"CN(CCOC1=CC=C(CC2SC(=O)NC2=O)C=C1)C1=CC=CC=N1 |c:18,23,25,t:5,7,21|",150.126304,71.53,357.114712,C18H19N3O3S,1,6,1,6,7,3,...,1,1,1,0,1,1,0,7,2,0.277778
"CCOC(=O)[C@H](CCC1=CC=CC=C1)N[C@@H](C)C(=O)N1[C@@H](CN(C)C1=O)C(O)=O |r,c:10,12,t:8|",169.158158,116.25,405.189986,C20H27N3O6,2,9,2,6,9,2,...,0,1,1,0,1,1,0,9,3,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Cl.CC1=CC=CC(C)=C1NC1=NCCCS1 |c:3,6,t:1,10|",107.336613,24.39,256.080097,C12H17ClN2S,1,2,1,3,1,2,...,0,1,0,0,1,1,0,4,0,0.416667
"CCCC1=NC(=C(N1CC1=CC=C(C=C1)C1=CC=CC=C1C1=NNN=N1)C(=O)OCC1=C(C)OC(=O)O1)C(C)(C)O |c:5,12,14,19,21,27,34,t:3,10,17,24|",234.551415,162.16,558.222683,C29H30N6O6,2,12,2,11,10,5,...,3,2,0,0,0,0,0,12,0,0.310345
"CC(O)=O.NC(=N)N\N=C\C1=C(Cl)C=CC=C1Cl |c:9,12,14|",113.899853,111.56,290.033731,C10H12Cl2N4O2,5,6,4,3,2,1,...,0,1,0,0,0,0,0,8,0,0.1
NCCCCCC(O)=O,54.859608,63.32,131.094629,C6H13NO2,3,3,2,2,5,0,...,0,0,0,0,0,0,0,3,0,0.833333
