# 1. ADMET

In [3]:
import os
import glob
import shutil
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.rdBase import BlockLogs
import sys
sys.path.append('./Codebase/utility')
from standardize import standardization
from medicinal_chemistry import calculate_ro5_properties, pains_filter

## 1.1. SMILES curation and standardization

In [2]:
# Load the repurposing library, discard rows with a missing SMILES or the
# 'Did not work' placeholder, and renumber the remaining rows from zero.
df = (
    pd.read_csv('./Data/repurDatabase(15235).csv', index_col=0)
    .dropna(subset=['Smiles'])
    .loc[lambda frame: frame['Smiles'] != 'Did not work']
    .reset_index(drop=True)
)

In [None]:
# Silence RDKit logging BEFORE running the standardization: log output is only
# blocked while the BlockLogs object is alive, so creating it after the
# `.apply` call (as before) left the standardization step unsuppressed.
# The object is kept in `block` so that logging stays blocked afterwards too.
block = BlockLogs()
# Convert every SMILES string with the project's `standardization` helper.
# NOTE(review): the next cell drops missing 'Molecules', so the helper
# presumably yields None/NaN when a structure cannot be processed — confirm.
df['Molecules'] = df['Smiles'].apply(standardization)

In [9]:
# Keep only the rows for which a standardized molecule is available.
df = df.dropna(subset=['Molecules'])

## 1.2. Filter

In [None]:
# Evaluate each medicinal-chemistry filter on the standardized molecules and
# store the result in its own column ('ro5' first, then 'pains').
for flag_column, flag_function in (
    ('ro5', calculate_ro5_properties),
    ('pains', pains_filter),
):
    df[flag_column] = df['Molecules'].apply(flag_function)

In [13]:
# Keep the compounds flagged True by the rule-of-five column ...
ro5 = df[df['ro5'] == True]
# ... and, among those, the ones flagged True by the PAINS column.
# `.copy()` makes `pains` an independent frame: adding a column to a filtered
# slice is chained assignment (pandas raises SettingWithCopyWarning, which this
# notebook silences globally), so the copy makes the intent explicit and safe.
pains = ro5[ro5['pains'] == True].copy()
# Serialize the standardized molecules back to SMILES strings.
pains['StandardSmiles'] = pains['Molecules'].apply(Chem.MolToSmiles)

In [29]:
# Only the compound name and its standardized SMILES are carried forward.
screen_columns = ['Name', 'StandardSmiles']
screen_data = pains[screen_columns]

In [30]:
# Persist the filtered library (index included) for the similarity section.
screen_data.to_csv(path_or_buf='./Data/pains.csv')

# 2. SIMILARITY

In [40]:
sys.path.append('./Codebase/similarity')
from SimilarityCalculation import similarity_calculate
from rdkit.Avalon import pyAvalonTools as fpAvalon
from rdkit import Chem, DataStructs

In [35]:
# Reference ligand for the similarity search: BMS-1166.
base = "O=C(O)[C@@H]1N(CC2=CC(Cl)=C(OCC3=CC=CC(C4=CC=C(OCCO5)C5=C4)=C3C)C=C2OCC6=CC=CC(C#N)=C6)C[C@H](O)C1"

# Parse the SMILES and label the resulting molecule with its name.
query = Chem.MolFromSmiles(base)
query.SetProp('_Name', 'BMS-1166')

# 1024-bit Avalon fingerprint of the query (same size as the library fingerprints).
avalon_query = fpAvalon.GetAvalonFP(query, nBits=1024)

In [38]:
# Reload the filtered library from disk and rebuild an RDKit molecule for
# every standardized SMILES string.
screen_data = pd.read_csv('./Data/pains.csv', index_col=0)
screen_data['Molecules'] = [
    Chem.MolFromSmiles(smiles) for smiles in screen_data['StandardSmiles']
]

In [39]:
# One 1024-bit Avalon fingerprint per library molecule, in row order.
avalon_list = [
    fpAvalon.GetAvalonFP(molecule, nBits=1024)
    for molecule in screen_data['Molecules']
]

In [41]:
# Tanimoto similarity of the query fingerprint against every library
# fingerprint, stored alongside the compounds.
tanimoto_scores = DataStructs.BulkTanimotoSimilarity(avalon_query, avalon_list)
screen_data["tanimoto_avalon"] = tanimoto_scores

In [45]:
# Keep the compounds whose Avalon/Tanimoto similarity to the query exceeds 0.32.
is_similar = screen_data["tanimoto_avalon"] > 0.32
simi_screen = screen_data[is_similar]
simi_screen.shape

(834, 4)

In [46]:
# Persist the similarity-filtered compounds (index included) for the QSAR section.
# NOTE(review): the 'Molecules' column holds RDKit objects, which end up in the
# CSV as their text repr only; the QSAR section drops that column after loading.
simi_screen.to_csv(path_or_buf='./Data/simi_screen.csv')

# 3. QSAR

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import os 
import sys
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

sys.path.append('./Codebase/ann')
from Featurizer import featurizer
from TargetNormalize import target_bin
from ModelArchitecture import Net
from annpipeline import model_pipeline

In [21]:
# Read the similarity-filtered compounds back in and renumber the rows from zero.
path = './Data/simi_screen.csv'
df_filter = pd.read_csv(path, index_col=0).reset_index(drop=True)
df_filter.head()

Unnamed: 0,Name,StandardSmiles,Molecules,tanimoto_avalon
0,Ramipril,CCOC(=O)C(CCc1ccccc1)NC(C)C(=O)N1C(C(=O)O)C[C@...,<rdkit.Chem.rdchem.Mol object at 0x7f615f85c040>,0.370253
1,Reserpine,COC(=O)C1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c4CCN...,<rdkit.Chem.rdchem.Mol object at 0x7f615f85ca50>,0.339731
2,Trospium,O=C(O[C@H]1C[C@H]2CC[C@@H](C1)[N+]21CCCC1)C(O)...,<rdkit.Chem.rdchem.Mol object at 0x7f615f85cb30>,0.339869
3,Moxifloxacin,COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc2c(=O)c(C(...,<rdkit.Chem.rdchem.Mol object at 0x7f615f85cf20>,0.346154
4,Nelfinavir,Cc1c(O)cccc1C(=O)N[C@@H](CSc1ccccc1)[C@H](O)CN...,<rdkit.Chem.rdchem.Mol object at 0x7f615f85d040>,0.384393


In [22]:
# The similarity score and the stringified molecule column are not model inputs.
df_filter = df_filter.drop(columns=['tanimoto_avalon', 'Molecules'])
# Constant placeholder target: the featurizer below is given an activity
# column name, but these compounds have no measured pIC50.
df_filter['pIC50'] = 0

In [23]:
# Display the prediction input table: Name, StandardSmiles and the placeholder pIC50.
df_filter

Unnamed: 0,Name,StandardSmiles,pIC50
0,Ramipril,CCOC(=O)C(CCc1ccccc1)NC(C)C(=O)N1C(C(=O)O)C[C@...,0
1,Reserpine,COC(=O)C1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c4CCN...,0
2,Trospium,O=C(O[C@H]1C[C@H]2CC[C@@H](C1)[N+]21CCCC1)C(O)...,0
3,Moxifloxacin,COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc2c(=O)c(C(...,0
4,Nelfinavir,Cc1c(O)cccc1C(=O)N[C@@H](CSc1ccccc1)[C@H](O)CN...,0
...,...,...,...
829,Epelsiban,CC[C@H](C)C1C(=O)NC(C2Cc3ccccc3C2)C(=O)N1C(C(=...,0
830,Delafloxacin,Nc1nc(-n2cc(C(=O)O)c(=O)c3cc(F)c(N4CC(O)C4)c(C...,0
831,Dacomitinib,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)/C=C/C...,0
832,SJG-136,C=C1CC2C=Nc3cc(OCCCOc4cc5c(cc4OC)C(=O)N4CC(=C)...,0


In [24]:
# Configure the project featurizer for 'secfp' fingerprints computed from the
# standardized SMILES; 'Name' identifies each row and 'pIC50' is the
# (placeholder) activity column.
feature = featurizer(
    data=df_filter,
    ID_col='Name',
    smiles_col='StandardSmiles',
    type_fp='secfp',
    active_col='pIC50',
)
# `fit()` returns the frame used as model input below.
df = feature.fit()
df.head()

Unnamed: 0,Name,pIC50,0,1,2,3,4,5,6,7,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Ramipril,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Reserpine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,Trospium,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Moxifloxacin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Nelfinavir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Model inputs: every column except the identifier and the placeholder target.
X = df.drop(columns=['pIC50', 'Name'])
y = df["pIC50"]

# Prefer the first CUDA device and fall back to the CPU when none is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Move features and targets onto the chosen device as float32 tensors.
X_predict = torch.tensor(X.values, dtype=torch.float32, device=device)
y_predict = torch.tensor(y.values, dtype=torch.float32, device=device)

# Wrap the tensors in a dataset and iterate one compound at a time, in the
# original row order (no shuffling), so predictions line up with `df`.
predict_dataset = TensorDataset(X_predict, y_predict)
predict_loader = DataLoader(
    dataset=predict_dataset,
    batch_size=1,
    shuffle=False,
)


cuda:0


In [26]:
# Load the trained weights onto the device selected earlier. Without
# `map_location`, a checkpoint saved from a GPU cannot be deserialized on a
# CPU-only machine, which would defeat the CPU fallback used for `device`.
# SECURITY NOTE: torch.load unpickles the file — only load trusted checkpoints.
checkpoint = torch.load('./ANN_model.pth', map_location=device)
criterion = torch.nn.BCELoss()
pip = model_pipeline(device = device, seed = 42,
                     save_dir = '.')
# Architecture arguments for the project-defined `Net`; in_features=2048
# matches the 2048 fingerprint columns (0..2047) produced by the featurizer.
# NOTE(review): these sizes presumably have to match the saved checkpoint — confirm.
model =  Net(in_features = 2048, hidden_nodes_1 =2048,hidden_nodes_2 =1024, hidden_nodes_3=256, drop_out=0.5)
# Run inference over the prediction loader; the call returns the predicted
# probabilities together with the targets (here the constant placeholder).
y_proba, y_true = pip.predict_proba(predict_loader, model =model, checkpoint=checkpoint,
                                   criterion=criterion)

In [27]:
# First entry of every prediction in `y_proba`, collected into a flat list.
y_prob_flat = [values[0] for values in y_proba]

# Binary label per compound: 0 below the decision threshold, 1 otherwise.
decision_threshold = 0.491328
y_pred_new = [
    0 if probability < decision_threshold else 1
    for probability in y_prob_flat
]

In [28]:
# Inspect the featurized frame: Name, placeholder pIC50 and fingerprint columns 0..2047.
df

Unnamed: 0,Name,pIC50,0,1,2,3,4,5,6,7,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,Ramipril,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Reserpine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,Trospium,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Moxifloxacin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Nelfinavir,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,Epelsiban,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
830,Delafloxacin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
831,Dacomitinib,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
832,SJG-136,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Predicted class label per compound.
pred = pd.DataFrame(y_pred_new, columns=['Pred'])

# Predicted probability as a percentage with one decimal place.
# The scaling is done in float64 and the rounding is applied LAST: rounding
# float32 values and then multiplying by 100 re-introduced representation
# noise in the table (e.g. 72.799995 instead of 72.8).
proba_percent = np.round(100 * np.asarray(y_prob_flat).astype(np.float64), 1)
prob = pd.DataFrame(proba_percent, columns=['Proba'])

In [35]:
# Put name, SMILES, predicted label and probability side by side (the pieces
# are aligned on their row index), then rank by descending probability.
ranking_parts = [df['Name'], df_filter['StandardSmiles'], pred, prob]
sort = pd.concat(ranking_parts, axis=1)
sort = sort.sort_values('Proba', ascending=False)

In [36]:
# Show the compounds whose predicted probability is above 70 %.
above_seventy = sort['Proba'] > 70
sort[above_seventy]

Unnamed: 0,Name,StandardSmiles,Pred,Proba
431,"(2S,4S,5R)-2-ISOBUTYL-5-(2-THIENYL)-1-[4-(TRIF...",CC(C)C[C@@]1(C(=O)O)CC(C(=O)O)[C@H](c2cccs2)N1...,1,75.0
237,"(2s)-2-[(2,4-Dichloro-Benzoyl)-(3-Trifluoromet...",O=C(O)C(Cc1ccccc1)N(Cc1cccc(C(F)(F)F)c1)C(=O)c...,1,72.799995
147,Meticillin,COc1cccc(OC)c1C(=O)NC1C(=O)N2C(C(=O)O)C(C)(C)S...,1,71.599998
430,"(2S,4S,5R)-1-(4-TERT-BUTYLBENZOYL)-2-ISOBUTYL-...",CC(C)C[C@@]1(C(=O)O)CC(C(=O)O)[C@H](c2nccs2)N1...,1,71.300003
588,Podophyllin,COc1cc(C2c3cc4c(cc3C(O)C(CO)C2C(=O)O)OCO4)cc(O...,1,71.100006
786,NT 13,CC(O)C(N)C(=O)N1CCCC1C(=O)N1CCCC1C(=O)NC(C(=O)...,1,70.300003
344,Atrasentan,CCCCN(CCCC)C(=O)CN1C[C@H](c2ccc3c(c2)OCO3)C(C(...,1,70.200005
