In [11]:
import os
# Must be assigned before deepchem / tensorflow are imported below so the value is
# already in the environment when TensorFlow loads.
# NOTE(review): presumably silences TensorFlow's native (C++) log output — confirm level '3'.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import deepchem as dc
import pandas as pd
import pickle
import numpy as np
import warnings
import matplotlib.pyplot as plt
# NOTE(review): star import hides which names come from utils; presumably it supplies
# the rmsd() helper called in the evaluation cell — confirm and consider importing it explicitly.
from utils import *
import tensorflow as tf
# Restrict TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')
# Suppresses every Python warning for the rest of the session, so anything that must be
# seen by the reader has to be printed rather than raised via warnings.warn().
warnings.filterwarnings('ignore')
# List of column labels; not referenced by any of the cells visible in this part of the notebook.
cols = ['compound id (and file prefix)','SMILES','experimental value (kcal/mol)','experimental uncertainty (kcal/mol)','has_uncertainty','relative_uncertainty']

In [6]:
# Read the full dataset table from disk and leave the frame as the cell's last
# expression so the notebook renders it.
freesolv = pd.read_csv(filepath_or_buffer='dicts/full_dataset.csv')
freesolv

Unnamed: 0,id,smiles,iupac name,expt,tip3p,cha,gbnsr6,asc,igb5,null,train
0,mobley_5708811,COc1c(ccc(c1C(=O)O)Cl)Cl,dicamba,-9.86,-8.658,-7.32265,-9.63719,-10.151,-11.039,0,True
1,mobley_9460824,CCOP(=S)(OCC)SCSCC,diethoxy-(ethylsulfanylmethylsulfanyl)-thioxo-...,-4.37,-6.427,-9.14217,-6.61081,-9.874,-9.447,0,True
2,mobley_7200804,c1ccc(c(c1)O)F,2-fluorophenol,-5.29,-3.346,-2.76435,-3.36818,-4.508,-5.796,0,True
3,mobley_8208692,c1c(c(c(c(c1Cl)Cl)Cl)Cl)c2c(cc(c(c2Cl)Cl)Cl)Cl,"1,2,3,4-tetrachloro-5-(2,3,4,6-tetrachlorophen...",-4.61,-0.039,-2.46709,-1.18551,2.646,0.060,0,True
4,mobley_5157661,CC(C)C,isobutane,2.30,2.535,1.84093,1.75318,2.301,1.061,0,True
...,...,...,...,...,...,...,...,...,...,...,...
637,mobley_7794077,c1c(c(=O)[nH]c(=O)[nH]1)C(F)(F)F,5-trifluoromethyluracil,-15.46,-17.349,-16.84550,-14.47460,-22.520,-22.705,0,True
638,mobley_852937,c1cc(ccc1O)F,4-fluorophenol,-6.19,-4.955,-4.00628,-5.30964,-6.296,-7.347,0,True
639,mobley_7298388,Cc1cccc(c1)[N+](=O)[O-],1-methyl-3-nitro-benzene,-3.45,-3.278,-3.08463,-4.12391,-3.130,-3.885,0,True
640,mobley_7066554,c1cc(ccc1O)Cl,4-chlorophenol,-7.03,-5.373,-4.38936,-6.21981,-5.890,-7.147,0,True


In [7]:
# Compound names to pull out of the dataset; matching is an exact string comparison
# against the 'iupac name' column.
# NOTE(review): the variable name suggests these are amino-acid side-chain analogues — confirm.
amino_analogues = ['methane','propane','isobutane','n-butane','methanol','ethanol','methanethiol','methyl ethyl sulfide',
                   'acetamide','propionamide','toluene','p-cresol','1-methylimidazole','methylindole']
analogs = freesolv[freesolv['iupac name'].isin(amino_analogues)]

# isin() silently ignores requested names that have no exact match, so report them.
# print() is used on purpose: warnings are globally suppressed earlier in this notebook
# (warnings.filterwarnings('ignore')), so warnings.warn() would be invisible.
matched_names = set(analogs['iupac name'])
missing_analogues = [name for name in amino_analogues if name not in matched_names]
if missing_analogues:
    print('analogues not found in dataset:', missing_analogues)

analogs

Unnamed: 0,id,smiles,iupac name,expt,tip3p,cha,gbnsr6,asc,igb5,null,train
4,mobley_5157661,CC(C)C,isobutane,2.3,2.535,1.84093,1.75318,2.301,1.061,0,True
7,mobley_9055303,C,methane,2.0,2.446,1.47257,1.24229,1.411,0.676,0,True
46,mobley_1636752,CO,methanol,-5.1,-3.491,-4.07426,-4.24733,-4.789,-5.384,0,False
99,mobley_8427539,CCC(=O)N,propionamide,-9.4,-8.31,-8.8135,-7.67168,-10.116,-10.547,0,True
146,mobley_8048190,CC(=O)N,acetamide,-9.71,-8.82,-9.87065,-8.65282,-11.874,-11.643,0,True
241,mobley_525934,CS,methanethiol,-1.2,-0.273,-2.03832,-0.364978,-1.803,-2.368,0,True
247,mobley_2068538,CCC,propane,2.0,2.495,1.7034,1.58001,2.053,0.939,0,True
361,mobley_2925352,Cc1ccc(cc1)O,p-cresol,-6.13,-5.579,-4.56342,-5.73784,-5.67,-7.054,0,True
455,mobley_1923244,CCCC,n-butane,2.1,2.588,1.86422,1.77912,2.348,1.092,0,True
465,mobley_1873346,Cc1ccccc1,toluene,-0.9,-0.79,-1.62474,-1.10628,-0.815,-2.462,0,True


In [9]:
# Hyperparameters stored in one place; the model-restoring cell reads its
# GraphConvModel constructor arguments from this dict.
params = dict(
    epochs=500,
    dropout=0.4,
    batch_normalize=False,
    batch_size=100,
    dense_layer_size=27,
    graph_conv_layers=[32, 32],
)

# Turn the SMILES strings of the selected analogues into ConvMol features.
featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
amino_X = featurizer.featurize(
    analogs['smiles'].to_numpy()
)

In [15]:
# Maps each physics-model column of the dataset to the index of the saved checkpoint
# that is restored from tests/<column>/model_<index>.
# NOTE(review): how these indices were selected is not visible here — presumably the
# best run per physics model; confirm and record the selection criterion.
feats = {'tip3p':79,'cha':43,'gbnsr6':26,'asc':83,'igb5':1,'null':4}

# Work on a copy so the added columns never leak back into `analogs`.
amino_results = analogs.copy()

for feat, model_index in feats.items():
    # The architecture arguments must match the saved checkpoint, so they are all taken
    # from the shared `params` dict.
    model = dc.models.GraphConvModel(
        n_tasks=1,
        graph_conv_layers=params['graph_conv_layers'],
        mode='regression',
        dropout=params['dropout'],
        batch_normalize=params['batch_normalize'],
        batch_size=params['batch_size'],
        dense_layer_size=params['dense_layer_size'],
        model_dir=f'tests/{feat}/model_{model_index}',
    )
    model.restore()

    # Flatten to 1-D before storing: with a single task there is one prediction per
    # molecule, and a flat array is an unambiguous value for a DataFrame column.
    preds = np.asarray(model.predict_on_batch(amino_X)).reshape(-1)
    amino_results[feat+'_ml'] = preds
    # The model output is used as an additive correction on top of the physics value.
    amino_results[feat+'_with_ml'] = amino_results[feat] + amino_results[feat+'_ml']

    # rmsd() is not defined in the visible cells; presumably provided by `from utils import *`.
    print(feat)
    print('rmsd for physics model compared to expt:',np.round(rmsd(amino_results['expt'],amino_results[feat]),3))
    print('rmsd for physics + ML compared to expt:',np.round(rmsd(amino_results['expt'],amino_results[feat+'_with_ml']),3))

# Show experimental values, the train flag and the corrected 'null' model side by side.
amino_results[['expt','train','null_with_ml']]

tip3p
rmsd for physics model compared to expt: 1.064
rmsd for physics + ML compared to expt: 0.396
cha
rmsd for physics model compared to expt: 0.866
rmsd for physics + ML compared to expt: 0.386
gbnsr6
rmsd for physics model compared to expt: 0.896
rmsd for physics + ML compared to expt: 0.44
asc
rmsd for physics model compared to expt: 0.815
rmsd for physics + ML compared to expt: 0.398
igb5
rmsd for physics model compared to expt: 1.13
rmsd for physics + ML compared to expt: 0.507
null
rmsd for physics model compared to expt: 5.492
rmsd for physics + ML compared to expt: 1.11


Unnamed: 0,expt,train,null_with_ml
4,2.3,True,1.986661
7,2.0,True,2.180612
46,-5.1,False,-3.2024
99,-9.4,True,-6.988823
146,-9.71,True,-8.689322
241,-1.2,True,-0.843943
247,2.0,True,1.819012
361,-6.13,True,-5.094168
455,2.1,True,1.733532
465,-0.9,True,-0.954215
