In [1]:
import time
import random
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import FilterCatalog, PandasTools
from rdkit.Chem import Crippen,Descriptors



In [39]:
# Load the RSA-selected hits, then write only their SMILES column
# (header row included, no index) to a text file.
df_map = pd.read_csv('./1443_rsa_select.csv')
df_map.to_csv('./smiles.txt', columns=['standardized_smiles'], index=False)

In [None]:
#Calculate PAINS and clogP from rdkit
_PAINS_CATALOG = None  # built lazily on first use, then reused for every molecule


def _get_pains_catalog():
    """Return the shared RDKit PAINS filter catalog, building it on first use."""
    global _PAINS_CATALOG
    if _PAINS_CATALOG is None:
        params = FilterCatalog.FilterCatalogParams()
        params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS)
        _PAINS_CATALOG = FilterCatalog.FilterCatalog(params)
    return _PAINS_CATALOG


def detect_pains(smiles):
    """Report whether a molecule matches any entry of RDKit's PAINS catalog.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to check.

    Returns
    -------
    bool
        True if the parsed molecule matches a PAINS filter. False if it does
        not match, if the SMILES cannot be parsed, or if ``smiles`` is not a
        string (e.g. a NaN coming from a missing CSV value).
    """
    # Missing values arrive from pandas as float NaN / None; passing them to
    # RDKit would raise, so treat them like an unparsable SMILES.
    if not isinstance(smiles, str):
        return False
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False
    # The catalog is built once and shared: rebuilding it per molecule is the
    # dominant cost when this function is applied over a whole column.
    return _get_pains_catalog().HasMatch(mol)

def calculate_properties(smiles):
    """Compute clogP for a molecule with RDKit's ``Crippen.MolLogP``.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    float or None
        The value returned by ``Crippen.MolLogP``, or None when the SMILES
        cannot be parsed or ``smiles`` is not a string (e.g. NaN from a
        missing CSV value).
    """
    # Missing values arrive from pandas as float NaN / None; passing them to
    # RDKit would raise, so handle them like an unparsable SMILES.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return Crippen.MolLogP(mol)

# Annotate every hit with its PAINS flag and clogP, both derived from the
# standardized SMILES column, then display the annotated frame.
smiles_column = df_map['standardized_smiles']
df_map['PAINS'] = smiles_column.apply(detect_pains)
df_map['clogP'] = smiles_column.apply(calculate_properties)
df_map


Unnamed: 0,standardized_smiles,vina_title,glide_title,rmsd,score_vina,score_glide,PharmFit_score,Total_RSA,Side_chain_RSA,PAINS,clogP
0,Cc1nn(CC(=O)N2CCC3(CC2)OCCO3)c(=O)c2c(-n3cccc3...,HIT105860007.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_8.sd...,1.260696,-7.9,-6.01693,2.836370,0.317932,0.044168,False,2.18592
1,CCOc1cc(C2CC(=O)Nc3nc(SCc4ccccc4F)[nH]c(=O)c32...,HIT100531985.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_0.sd...,1.576517,-7.8,-6.68465,2.805660,0.319012,0.038291,False,2.93820
2,Cc1cccc(-c2c3c(=O)n(C)c(=O)n(C)c3c3n2-c2ccccc2...,HIT102285018.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_0.sd...,0.624294,-8.2,-6.26076,2.721380,0.344806,0.065206,False,4.11122
3,CCOc1ccc(C2CC(=O)Nc3nc(SCc4ccccc4)[nH]c(=O)c32...,HIT104280749.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_0.sd...,2.024323,-8.0,-6.49891,2.678040,0.319485,0.038291,False,3.94360
4,O=C1CC(c2ccc(F)c(Br)c2)c2c(nc(SCc3ccccc3)[nH]c...,HIT107574755.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_1.sd...,1.512527,-8.4,-6.20592,2.629000,0.319191,0.038291,False,4.43790
...,...,...,...,...,...,...,...,...,...,...,...
2062,Cc1ccc(-n2c(-c3cccc([N+](=O)[O-])c3)nc(=S)c3c2...,HIT100747872.pdbqt,ligprep_chemdiv-stock1_01_400000_split-10_2.sd...,0.541712,-7.7,-6.49692,0.502771,0.327821,0.048786,False,5.36421
2063,CNC(=O)C1CN(C(=O)c2ccncn2)CC12CCN(c1ccc(C#N)c(...,HIT103047606.pdbqt,ligprep_chemdiv-stock1_03_400000_split-10_8.sd...,2.074108,-8.2,-5.91642,0.502647,0.305068,0.039088,False,2.47188
2064,O=C(Cn1c(=O)c(=O)n(Cc2ccncc2)c2ncccc21)N1CCN(c...,HIT104698140.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_9.sd...,2.094927,-8.1,-5.89543,0.502169,0.317966,0.042929,False,1.35030
2065,O=C1c2[nH]nc(-c3ccccc3O)c2C(c2cccc(OCc3ccccc3)...,HIT101305057.pdbqt,ligprep_chemdiv-stock1_01_400000_split-10_9.sd...,1.795784,-8.2,-6.38039,0.501502,0.325137,0.045199,False,3.89880


In [46]:
# Keep only hits that are not flagged as PAINS and whose clogP is at most 5.0.
# Rows with a missing clogP fail the comparison and are removed as well.
is_pains_free = df_map['PAINS'].eq(False)
within_clogp_limit = df_map['clogP'].le(5.0)
df_map = df_map[is_pains_free & within_clogp_limit]
df_map

Unnamed: 0,standardized_smiles,vina_title,glide_title,rmsd,score_vina,score_glide,PharmFit_score,Total_RSA,Side_chain_RSA,PAINS,clogP
247,Cn1c(=O)c2[nH]c(N3CCN(C4CC(=O)N(c5ccccc5)C4=O)...,HIT105312175.pdbqt,ligprep_chemdiv-stock1_04_414266_split-10_9.sd...,2.175176,-8.1,-5.94517,1.353240,0.317425,0.045812,False,-0.58560
847,O=C(CN1C(=O)NC2(CCS(=O)(=O)CC2)C1=O)Nc1ccc2c(c...,HIT101134610.pdbqt,ligprep_chemdiv-stock1_03_400000_split-10_7.sd...,1.959937,-8.0,-5.90213,0.891289,0.305908,0.045247,False,-0.10450
1204,CS(=O)(=O)N1CCCC(C(=O)Nc2cccnc2)CCNC(=O)CC1,HIT105178186.pdbqt,ligprep_chemdiv-stock1_03_400000_split-10_8.sd...,2.213151,-7.9,-6.58018,0.748621,0.313027,0.044180,False,0.58810
56,Cc1ccc(NC(=O)C2CCCN(c3cc(=O)n(C)c(=O)n3C)C2)nc1,HIT213607767.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_2.sd...,0.728513,-7.8,-6.54781,1.865270,0.313901,0.042760,False,0.64252
1314,Cc1n[nH]c(-n2ccc3nc4ccn(-n5c(C)nnc5C)c(=O)c4cc...,HIT103525818.pdbqt,ligprep_chemdiv-stock1_04_414266_split-10_4.sd...,0.464932,-7.7,-6.17625,0.709489,0.344463,0.064781,False,0.64686
...,...,...,...,...,...,...,...,...,...,...,...
251,CC1=C(C#N)C(C(F)(F)F)(C(F)(F)F)N=C(c2ccc(C(C)(...,HIT102512859.pdbqt,ligprep_chemdiv-stock1_01_400000_split-10_6.sd...,0.790494,-7.8,-7.74184,1.350280,0.308812,0.038291,False,4.99488
1985,O=C(Nc1ccccc1Cl)C1Sc2nnc(-c3ccccc3)n2NC1c1ccccc1,HIT103652901.pdbqt,ligprep_chemdiv-stock1_01_400000_split-10_3.sd...,1.373494,-8.3,-5.87832,0.523340,0.304283,0.042444,False,4.99630
1907,O=C(Nc1ccccc1)C1Sc2nnc(-c3ccccc3)n2NC1c1ccc(Cl...,HIT101524868.pdbqt,ligprep_chemdiv-stock1_01_400000_split-10_3.sd...,1.282145,-8.0,-6.27591,0.540875,0.300560,0.040910,False,4.99630
1503,O=C(Cn1c(=O)n(Cc2ccc3c(c2)OCO3)c(=O)c2oc3ccccc...,HIT100941962.pdbqt,ligprep_chemdiv-stock1_02_400000_split-10_2.sd...,0.788503,-8.7,-6.74043,0.650304,0.333522,0.051896,False,4.99730


In [48]:
# Attach the predicted logS from the ADMET table to each hit and keep only
# hits with logS >= -5.0, ordered from highest to lowest logS.
admet_map = pd.read_csv('./admet_pred.csv', sep='\t')

# Keep a single prediction per SMILES: duplicate keys on the right side of a
# left merge would silently duplicate hit rows.
logs_by_smiles = (
    admet_map[['standardized_smiles', 'logS']]
    .drop_duplicates(subset='standardized_smiles', keep='first')
)

# Remove logS columns left behind by an earlier run of this cell so that a
# re-run merges fresh values instead of filtering on a stale column.
df_map = df_map.drop(columns=['logS_x', 'logS_y', 'logS', 'logS_dup'], errors='ignore')

df_map = df_map.merge(logs_by_smiles, on='standardized_smiles', how='left')
df_map = df_map.sort_values(by='logS', ascending=False)
# Hits without a prediction have NaN logS after the left merge; the
# comparison is False for NaN, so those rows are dropped here too.
df_map = df_map[df_map['logS'] >= -5.0]

In [49]:
# Persist the filtered selection (without the index column) for later steps.
output_csv = './1443_admet_select.csv'
df_map.to_csv(output_csv, index=False)