In [5]:
import pandas as pd
from rdkit import Chem
# discussion of circular fingerprints: https://pubs.acs.org/doi/10.1021/ci100050t
from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import os

#Other fingerprint types to explore? 
#useful example: https://medium.com/@gurkamaldeol/predicting-environmental-carcinogens-with-logistic-regression-knn-gradient-boosting-and-7973f88eb8b3

In [16]:
datasets=[i for i in os.listdir('data_cleaned') if i[-4:]=='.csv']
datasets

['deepchem_Lipophilicity.csv',
 'sol_del.csv',
 'HIV.csv',
 'clintox.csv',
 'bace.csv',
 'tox21.csv']

In [17]:
data_map={
    'HIV.csv': {'target':'HIV_active','structure':'smiles'},
    'bace.csv':{'target':'active','structure':'mol'},
    'tox21.csv':{'target':'NR-AhR','structure':'smiles'},
    'cintox.csv':{'target':'CT_TOX','structure':'smiles'},
    'Solubility_delaney-processed.csv':{'target':'binned_sol','structure':'smiles'},
    'deepchem_Lipophilicity.csv':{'target':'drug_like','structure':'smiles'}   
}

In [18]:
data_map[datasets[0]]

{'target': 'drug_like', 'structure': 'smiles'}

In [19]:
def generate_fingerprint(smiles,radius,bits):
    mol=Chem.MolFromSmiles(smiles)
    fp=AllChem.GetMorganFingerprintAsBitVect(mol,radius,bits)
    return(np.array(fp))

In [20]:
# test this: 
generate_fingerprint('C=C=C',2,1024)

array([0, 0, 0, ..., 0, 0, 0])

In [21]:
# build a test case with HIV data:
radius=2
bits=1024
df=pd.read_csv('data_cleaned/HIV.csv')
df['fp']=df['smiles'].progress_apply(lambda x: generate_fingerprint(x,radius,bits))
df.head(2)

100%|████████████████████████████████████| 41127/41127 [00:46<00:00, 876.07it/s]


Unnamed: 0.1,Unnamed: 0,smiles,activity,HIV_active,fp
0,0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [24]:
# split the data:
from sklearn.model_selection import train_test_split

X=[generate_fingerprint(mol,2,1024) for mol in tqdm(df['smiles'])]
y=df['HIV_active'].to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

100%|████████████████████████████████████| 41127/41127 [00:46<00:00, 875.91it/s]


In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

# Setup Cross validation:
scoring = ['accuracy', 'f1','roc_auc','neg_log_loss']
clf= LogisticRegression(random_state=0,solver='lbfgs',max_iter=1000,verbose=False)
scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=5,return_train_score=True)

In [37]:
scores

{'fit_time': array([1.62300897, 1.75004601, 1.51937699, 1.66005206, 1.82311893]),
 'score_time': array([0.0514369 , 0.05118299, 0.05176306, 0.05027509, 0.05627394]),
 'test_accuracy': array([0.96887664, 0.96903874, 0.96839034, 0.96660723, 0.96660723]),
 'train_accuracy': array([0.97442859, 0.97483385, 0.97479332, 0.97487437, 0.9747528 ]),
 'test_f1': array([0.34693878, 0.36963696, 0.37299035, 0.33548387, 0.31333333]),
 'train_f1': array([0.47980214, 0.49056604, 0.48595041, 0.49180328, 0.48976249]),
 'test_roc_auc': array([0.78478334, 0.77768512, 0.79928576, 0.79040174, 0.78191513]),
 'train_roc_auc': array([0.92498255, 0.9235879 , 0.9208621 , 0.9240289 , 0.9261204 ]),
 'test_neg_log_loss': array([-0.1262743 , -0.12951498, -0.12225949, -0.12971206, -0.13112324]),
 'train_neg_log_loss': array([-0.08466054, -0.0837571 , -0.08530654, -0.0841092 , -0.0839419 ])}

In [38]:
for score in scores:
    print(score,scores[score].mean(),'+/-',scores[score].std())

fit_time 1.6751205921173096 +/- 0.10460982391149612
score_time 0.05218639373779297 +/- 0.002102802175466697
test_accuracy 0.9679040363105852 +/- 0.0010801317068541325
train_accuracy 0.9747365861565893 +/- 0.0001592387964369198
test_f1 0.34767665944107967 +/- 0.022143670389070835
train_f1 0.487576872571201 +/- 0.0043512371238090255
test_roc_auc 0.7868142181691768 +/- 0.007481654455841829
train_roc_auc 0.9239163694543008 +/- 0.0017574744089441726
test_neg_log_loss -0.12777681412501382 +/- 0.0031831865984096675
train_neg_log_loss -0.08435505797364296 +/- 0.0005634421172886916


In [72]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0,solver='lbfgs',max_iter=1000)
clf.fit(X_train,y_train)
y_train_pred=clf.predict(X_train)
y_test_pred=clf.predict(X_test)



In [74]:
# Score the model
result={}
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

result.update({'train':{'accuracy':accuracy_score(y_train, y_train_pred),
                       'f1':f1_score(y_train, y_train_pred)}})

result.update({'test':{'accuracy':accuracy_score(y_test, y_test_pred),
                       'f1':f1_score(y_test, y_test_pred)}})


In [76]:
print('Logistic Regression Result')
result

Logistic Regression Result


{'train': {'accuracy': 0.9735775652455827, 'f1': 0.45702864756828776},
 'test': {'accuracy': 0.9693639369772418, 'f1': 0.3835616438356164}}

In [77]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_train_pred=dummy_clf.predict(X_train)
y_test_pred=dummy_clf.predict(X_test)

result_dummy={}
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

result_dummy.update({'train':{'accuracy':accuracy_score(y_train, y_train_pred),
                       'f1':f1_score(y_train, y_train_pred)}})

result_dummy.update({'test':{'accuracy':accuracy_score(y_test, y_test_pred),
                       'f1':f1_score(y_test, y_test_pred)}})
print('Dummy Result')
result_dummy

Dummy Result


{'train': {'accuracy': 0.965051061760415, 'f1': 0.0},
 'test': {'accuracy': 0.9645010698307722, 'f1': 0.0}}

In [83]:
df['HIV_active'].value_counts()

0    39684
1     1443
Name: HIV_active, dtype: int64

In [84]:
df['HIV_active'].value_counts()/df.shape[0]

0    0.964914
1    0.035086
Name: HIV_active, dtype: float64