In [None]:
#Import Libraries
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem.Scaffolds import MurckoScaffold 
import math
import matplotlib.colors
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from sklearn import metrics
from scipy import stats as stats
from google.colab import files
%matplotlib inline 

In [None]:
### Process Data
# Load the CSV for the selected dataset from the working directory.
# Dataset names used with this notebook:
# names = ['FUBrain', 'RenClear', 'FreeSolv', 'MicroClear', 'HemoTox', 'HepClear', 'Caco2', 'Sol', 'VDss', 'HalfLifeMWAdjusted']
name = "HalfLifeMWAdjusted"
csv_path = "{}.csv".format(name)
dataframe = pd.read_csv(csv_path)

# Prepare Fingerprints: parse every SMILES string, then turn each molecule
# into a Morgan fingerprint (radius 2) held as a numpy array.
mols = list(map(Chem.MolFromSmiles, dataframe.SMILES))
fps = []
for mol in mols:
  bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
  fps.append(np.array(bit_vector))
dataframe["Fingerprint"] = fps

# Only the fingerprints are needed from here on, so keep them in a
# single-column frame and release the full table.
data = pd.DataFrame(data={'FP': dataframe.Fingerprint.to_numpy()})
del dataframe

In [None]:
# Set up for cross validation
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# use garbage collection to save on space
import gc
print(gc.get_count())
gc.collect()
print(gc.get_count())

# Perform cross validation and save the fingerprints from this.
# For every fold, each held-out molecule is paired with every held-out
# molecule of the same fold (cross join, so self-pairs and both orderings
# are included). FA collects the first fingerprint of each pair and FB the
# second, one Series per fold.
FA = []
FB = []

for _, test_index in cv.split(data):
  # KFold yields *positional* indices, so select rows by position; this stays
  # correct even if `data` does not carry a default 0..n-1 index.
  # The training split is not needed here, so it is not materialised.
  test_df = data.iloc[test_index]
  pair_subset_test = pd.merge(test_df, test_df, how='cross')
  FA.append(pair_subset_test.FP_x)
  FB.append(pair_subset_test.FP_y)
  del pair_subset_test

(489, 4, 8)
(44, 0, 0)


In [None]:
# Calculate the similarity values
# Flatten the per-fold fingerprint Series once, up front. Re-concatenating
# inside the loop would rebuild both full arrays on every iteration and make
# the loop quadratic in the number of pairs.
fa_all = np.concatenate(FA)
fb_all = np.concatenate(FB)


def _to_bit_vector(fingerprint):
  """Convert a 0/1 numpy fingerprint array into an RDKit bit vector."""
  bit_string = "".join(fingerprint.astype(str))
  return DataStructs.cDataStructs.CreateFromBitString(bit_string)


# One Tanimoto similarity per (FA[i], FB[i]) pair, in the original pair order.
similarity_list = [
  DataStructs.TanimotoSimilarity(_to_bit_vector(fp_a), _to_bit_vector(fp_b))
  for fp_a, fp_b in zip(fa_all, fb_all)
]


In [None]:
# Export the csv
# Write the similarity scores as a single 'Tanimoto' column (no index column)
# and hand the file to the browser through the Colab download helper.
similarity_csv = '{}_CV_Similarity_Scores.csv'.format(name)
dataframe = pd.DataFrame(similarity_list).rename(columns={0: 'Tanimoto'})
dataframe.to_csv(similarity_csv, index=False)
files.download(similarity_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

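Calculate the Absolute Difference Between True and Predicted Values for CV Results (stored in the `MAD` column). Run only the cell for the model being analysed: each cell below overwrites `name`, `model`, and `scores`.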

In [None]:
# LGBM
# Select the dataset/model pair and load its cross-validation predictions.
name = 'RenClear'
model = 'LGBM'
lgbm_scores_csv = '{}_CV_LGBM_Delta_Scores.csv'.format(name)
scores = pd.read_csv(lgbm_scores_csv)

# Per-row absolute difference between the true and predicted values.
signed_error = scores["trues"] - scores["preds"]
scores["MAD"] = abs(signed_error)


In [None]:
# ChemProp
# Select the dataset/model pair and load its cross-validation results.
name = 'Sol'
model = 'ChemProp'

# Derive the file name from `name` so the loaded scores always belong to the
# same dataset as the similarity file that is read later using `name`.
# NOTE(review): assumes every dataset follows the
# '<name>_DM_10fCV_1_5_sum_comparison_ChemSim.csv' naming - confirm.
chemprop_scores_csv = "{}_DM_10fCV_1_5_sum_comparison_ChemSim.csv".format(name)

# The file is transposed after loading so that the three series become the
# columns named below.
scores = pd.read_csv(chemprop_scores_csv).T
scores.columns = ['True', 'Delta', 'Traditional']

# Per-row absolute difference between the true values and the 'Delta' values.
scores["MAD"] = abs(scores["True"] - scores["Delta"])

Plotting

In [None]:
# Load the Tanimoto similarity scores for the dataset currently held in `name`.
# The file name matches the '<name>_CV_Similarity_Scores.csv' pattern written by
# the export cell, so that file must already exist for this dataset.
sim = pd.read_csv('{}_CV_Similarity_Scores.csv'.format(name))

In [None]:
### Plotting ###
# Hexbin density of prediction error (x) against Tanimoto distance (y),
# with log-scaled counts and a least-squares trend line.
error = scores["MAD"]
tanimoto_distance = 1 - sim['Tanimoto']

fig, ax = plt.subplots()

# Draw the hexbin once and reuse the returned collection for the colorbar;
# calling hexbin a second time would bin and draw the same data twice.
hexbin = ax.hexbin(error, tanimoto_distance, gridsize=(100),
                   norm=matplotlib.colors.LogNorm(), mincnt=1)
fig.colorbar(hexbin)

# Line of best fit (degree-1 polynomial), evaluated at the sorted unique errors.
# Plot on `ax` explicitly so the line cannot land on the colorbar axes.
x_fit = np.unique(error)
best_fit = np.poly1d(np.polyfit(error, tanimoto_distance, 1))
ax.plot(x_fit, best_fit(x_fit), 'k')

# Axis labels and limits
ax.set_xlabel('Error')
ax.set_ylabel('Tanimoto Difference')
ax.set_ylim([0, 1])

# Save the figure and hand it to the browser through the Colab download helper.
figure_path = "Sim-{}-{}-HB.png".format(model, name)
fig.savefig(figure_path, facecolor='white', dpi = 600)

files.download(figure_path)

In [None]:
### Stats ###
# Compare the prediction error with the Tanimoto distance (1 - similarity)
# using Pearson's r, MAE and RMSE, each rounded to three decimals.
observed_error = scores["MAD"]
observed_distance = 1 - sim['Tanimoto']

pearson = stats.pearsonr(observed_error, observed_distance)
MAE = metrics.mean_absolute_error(observed_error, observed_distance)
MSE = metrics.mean_squared_error(observed_error, observed_distance)
RMSE = math.sqrt(MSE)

metric_names = ["Pearson's r", 'MAE', 'RMSE']
metric_values = [round(value, 3) for value in (pearson[0], MAE, RMSE)]
scoring = pd.DataFrame({'Metric': metric_names, 'Value': metric_values})
scoring


Unnamed: 0,Metric,Value
0,Pearson's r,0.025
1,MAE,0.635
2,RMSE,0.669
