In [4]:
import pandas as pd
from rdkit import Chem
# discussion of circular fingerprints: https://pubs.acs.org/doi/10.1021/ci100050t
from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import os

#Other fingerprint types to explore? 
#useful example: https://medium.com/@gurkamaldeol/predicting-environmental-carcinogens-with-logistic-regression-knn-gradient-boosting-and-7973f88eb8b3

In [17]:
datasets=[i for i in os.listdir('data_cleaned') if i[-4:]=='.csv']
datasets

['deepchem_Lipophilicity.csv',
 'sol_del.csv',
 'HIV.csv',
 'clintox.csv',
 'bace.csv',
 'tox21.csv']

In [18]:
data_map={
    'HIV.csv': {'target':'HIV_active','structure':'smiles'},
    'bace.csv':{'target':'active','structure':'mol'},
    'tox21.csv':{'target':'NR-AhR','structure':'smiles'},
    'cintox.csv':{'target':'CT_TOX','structure':'smiles'},
    'Solubility_delaney-processed.csv':{'target':'binned_sol','structure':'smiles'},
    'deepchem_Lipophilicity.csv':{'target':'drug_like','structure':'smiles'}   
}

In [19]:
data_map[datasets[0]]

{'target': 'drug_like', 'structure': 'smiles'}

In [None]:
AllChem.GetMorganFingerprintAsBitVect(m1,2,nBits=1024)

In [16]:
def generate_fingerprint(smiles,radius,bits):
    mol=Chem.MolFromSmiles(smiles)
    fp=AllChem.GetMorganFingerprintAsBitVect(mol,radius,bits)
    return(np.array(fp))

In [17]:
# test this: 
generate_fingerprint('C=C=C',2,1024)

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
# build a test case with HIV data:
radius=2
bits=1024
df=pd.read_csv('data_cleaned/HIV.csv')
df['fp']=df['smiles'].progress_apply(lambda x: generate_fingerprint(x,radius,bits))
df.head(2)

100%|████████████████████████████████████████████████████| 41127/41127 [00:47<00:00, 861.94it/s]


Unnamed: 0.1,Unnamed: 0,smiles,activity,HIV_active,fp
0,0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [65]:
# split the data:
from sklearn.model_selection import train_test_split

X=[generate_fingerprint(mol,2,1024) for mol in df['smiles']]
y=df['HIV_active'].to_list()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)



In [72]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0,solver='lbfgs',max_iter=1000)
clf.fit(X_train,y_train)
y_train_pred=clf.predict(X_train)
y_test_pred=clf.predict(X_test)



In [74]:
# Score the model
result={}
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

result.update({'train':{'accuracy':accuracy_score(y_train, y_train_pred),
                       'f1':f1_score(y_train, y_train_pred)}})

result.update({'test':{'accuracy':accuracy_score(y_test, y_test_pred),
                       'f1':f1_score(y_test, y_test_pred)}})


In [76]:
print('Logistic Regression Result')
result

Logistic Regression Result


{'train': {'accuracy': 0.9735775652455827, 'f1': 0.45702864756828776},
 'test': {'accuracy': 0.9693639369772418, 'f1': 0.3835616438356164}}

In [77]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_train_pred=dummy_clf.predict(X_train)
y_test_pred=dummy_clf.predict(X_test)

result_dummy={}
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

result_dummy.update({'train':{'accuracy':accuracy_score(y_train, y_train_pred),
                       'f1':f1_score(y_train, y_train_pred)}})

result_dummy.update({'test':{'accuracy':accuracy_score(y_test, y_test_pred),
                       'f1':f1_score(y_test, y_test_pred)}})
print('Dummy Result')
result_dummy

Dummy Result


{'train': {'accuracy': 0.965051061760415, 'f1': 0.0},
 'test': {'accuracy': 0.9645010698307722, 'f1': 0.0}}

In [83]:
df['HIV_active'].value_counts()

0    39684
1     1443
Name: HIV_active, dtype: int64

In [84]:
df['HIV_active'].value_counts()/df.shape[0]

0    0.964914
1    0.035086
Name: HIV_active, dtype: float64