In [5]:

import time
import random
from pathlib import Path

import pandas as pd
import numpy
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator

In [6]:
# Load the docking/pharmacophore hit table; later cells read the
# "number" and "smiles" columns from it.
compound_df = pd.read_csv(filepath_or_buffer="vinaselect.csv")
# Bare last expression so the notebook renders the frame.
compound_df

Unnamed: 0,number,id,smiles,vina,pharm
0,1,HIT100914341,O=S(=O)(c1ccc2ccccc2c1)N1CCN(c2ncnc3sc(-c4cccc...,-10.1,3.02889
1,2,HIT105213041,CC12C(=O)C(C)(C(c3ccccc3)=C1c1ccccc1)C1C(=O)N(...,-10.8,3.00000
2,3,HIT105865213,CCOc1ccc(N2C(=O)C3C4C=CC(C4=C(c4ccccc4)c4ccccc...,-10.2,3.00000
3,4,HIT101566223,COC(=O)c1sc(N2C(=O)c3oc4ccccc4c(=O)c3C2c2cccc(...,-10.7,2.97849
4,5,HIT102935161,COC(=O)c1cccc(N2C(=O)C3C4C=CC(C4=C(c4ccccc4)c4...,-10.5,2.94148
...,...,...,...,...,...
2260,2261,HIT102828838,O=C1CC(c2ccccc2)CC2=C1C(c1cccc(Oc3ccccc3)c1)Nc...,-10.7,1.20041
2261,2262,HIT100175699,O=C(Nc1ccc(Oc2ccc(Br)c3ccccc23)c(Cl)c1)c1cc2cc...,-10.4,1.20016
2262,2263,HIT211254118,Oc1ccc(/C=N/c2ccc(C34CC5CC(CC(c6ccccc6)(C5)C3)...,-9.9,1.20011
2263,2264,HIT211686313,O=C(O)c1c2c(nc3ccc(S(=O)(=O)N4CCC(C(=O)N5CCc6c...,-10.6,1.20011


In [7]:
# AllChem supplies the Morgan fingerprint call used at the bottom of this cell.
from rdkit.Chem import AllChem

# Three index-aligned lists are built here: position k in `compound`, `smile`
# and `fps` always refers to the same molecule. Cluster members produced from
# `fps` are positions into these lists, so the alignment must be preserved.
compound = []  # (number, RDKit Mol)
smile = []     # (number, SMILES string)
skipped = []   # `number` of every row whose SMILES could not be parsed

for number, smiles in zip(compound_df["number"], compound_df["smiles"]):
    # Chem.MolFromSmiles returns None for SMILES it cannot parse, and a
    # missing cell is read by pandas as a float NaN rather than a string.
    # Either would make the fingerprint call below fail, so drop such rows
    # from all lists together.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        skipped.append(number)
        continue
    compound.append((number, mol))
    smile.append((number, smiles))

if skipped:
    print(f"Skipped {len(skipped)} rows with unparsable SMILES (number): {skipped}")

# Morgan fingerprints with radius 2, folded to 1024 bits.
fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for _, mol in compound]




In [8]:
import numpy as np
def clusterfps(fps, cutoff=0.4):
    """Cluster fingerprints with the Butina algorithm on Tanimoto distance.

    Args:
        fps: sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        cutoff: distance threshold handed to ``Butina.ClusterData``.

    Returns:
        The clusters returned by ``Butina.ClusterData`` as a list ordered
        from the largest cluster to the smallest.
    """
    count = len(fps)

    # Flattened lower triangle of the distance matrix: for every fingerprint,
    # the distances (1 - Tanimoto similarity) to all fingerprints before it.
    distances = []
    for idx in range(1, count):
        similarities = DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx])
        distances += [1 - s for s in similarities]

    clusters = Butina.ClusterData(distances, count, cutoff, isDistData=True)
    return sorted(clusters, key=len, reverse=True)

In [10]:
# Cluster with a distance cutoff of 0.5 (the function default is 0.4).
cluster = clusterfps(fps, cutoff=0.5)
# clusterfps returns clusters largest-first, so index 0 is the biggest one.
print(f"Number of clusters: {len(cluster)}")
print(f"Number of molecules in largest cluster: {len(cluster[0])}")

Number of clusters: 856
Number of molecules in largest cluster: 193


In [11]:
# Inspect the members of the largest cluster (clusterfps sorts clusters by
# size, largest first); members are positions into `fps`.
cluster[0]

(1662,
 91,
 121,
 142,
 143,
 144,
 152,
 155,
 156,
 161,
 166,
 169,
 175,
 180,
 181,
 183,
 185,
 186,
 194,
 197,
 239,
 242,
 253,
 257,
 287,
 295,
 298,
 303,
 319,
 323,
 343,
 385,
 475,
 522,
 535,
 543,
 545,
 562,
 732,
 748,
 793,
 825,
 849,
 983,
 1003,
 1098,
 1105,
 1191,
 1192,
 1242,
 1245,
 1253,
 1261,
 1266,
 1270,
 1285,
 1286,
 1288,
 1294,
 1309,
 1313,
 1316,
 1328,
 1329,
 1351,
 1354,
 1368,
 1375,
 1386,
 1387,
 1395,
 1403,
 1404,
 1407,
 1408,
 1414,
 1433,
 1437,
 1438,
 1442,
 1448,
 1455,
 1457,
 1462,
 1467,
 1470,
 1473,
 1474,
 1491,
 1493,
 1496,
 1497,
 1500,
 1504,
 1505,
 1506,
 1507,
 1534,
 1535,
 1540,
 1542,
 1543,
 1555,
 1557,
 1563,
 1567,
 1571,
 1572,
 1580,
 1591,
 1592,
 1593,
 1594,
 1595,
 1605,
 1615,
 1630,
 1635,
 1637,
 1642,
 1650,
 1653,
 1660,
 1670,
 1674,
 1675,
 1685,
 1694,
 1703,
 1712,
 1724,
 1734,
 1737,
 1750,
 1751,
 1753,
 1759,
 1764,
 1766,
 1769,
 1776,
 1778,
 1783,
 1802,
 1808,
 1809,
 1824,
 1833,
 1834,
 

In [12]:
# Collect the SMILES strings of every cluster, one inner list per cluster.
# NOTE(review): the name `list` shadows the Python builtin. It is kept here
# because the export cell further down reads `list`; rename both together.
list = []
# Iterate over however many clusters were actually produced, rather than a
# count hard-coded from one particular run.
for i in range(len(cluster)):
    # Replace the cluster by its member positions in ascending order
    # (this turns the entry into a NumPy array).
    cluster[i] = np.sort(cluster[i])
    # smile[x] is a (number, SMILES) pair; keep only the SMILES string.
    list.append([smile[x][1] for x in cluster[i]])
   


In [13]:
# Largest cluster again, after the previous cell replaced each cluster with
# its np.sort()-ed array of member positions.
cluster[0]


array([  91,  121,  142,  143,  144,  152,  155,  156,  161,  166,  169,
        175,  180,  181,  183,  185,  186,  194,  197,  239,  242,  253,
        257,  287,  295,  298,  303,  319,  323,  343,  385,  475,  522,
        535,  543,  545,  562,  732,  748,  793,  825,  849,  983, 1003,
       1098, 1105, 1191, 1192, 1242, 1245, 1253, 1261, 1266, 1270, 1285,
       1286, 1288, 1294, 1309, 1313, 1316, 1328, 1329, 1351, 1354, 1368,
       1375, 1386, 1387, 1395, 1403, 1404, 1407, 1408, 1414, 1433, 1437,
       1438, 1442, 1448, 1455, 1457, 1462, 1467, 1470, 1473, 1474, 1491,
       1493, 1496, 1497, 1500, 1504, 1505, 1506, 1507, 1534, 1535, 1540,
       1542, 1543, 1555, 1557, 1563, 1567, 1571, 1572, 1580, 1591, 1592,
       1593, 1594, 1595, 1605, 1615, 1630, 1635, 1637, 1642, 1650, 1653,
       1660, 1662, 1670, 1674, 1675, 1685, 1694, 1703, 1712, 1724, 1734,
       1737, 1750, 1751, 1753, 1759, 1764, 1766, 1769, 1776, 1778, 1783,
       1802, 1808, 1809, 1824, 1833, 1834, 1836, 18

In [14]:
# One row per cluster, one column per member SMILES; `list` is the
# list-of-lists built in the earlier cell (it shadows the builtin there).
df = pd.DataFrame(list)
df.to_csv(path_or_buf="select_cluster.csv")

In [None]:
# Draw one representative molecule for each of the (up to) 50 largest
# clusters. The representative is the member with the smallest position in
# `compound`. Slicing instead of indexing range(50) keeps the cell working
# when fewer than 50 clusters exist.
representatives = [compound[np.min(members)] for members in cluster[:50]]
Draw.MolsToGridImage(
    [mol for _, mol in representatives],
    # Label each drawing with the compound's `number`.
    legends=[str(number) for number, _ in representatives],
    subImgSize=(500, 500),
    molsPerRow=3,
)

In [15]:
from rdkit import Chem
from rdkit.Chem import FilterCatalog, PandasTools
import pandas as pd
from rdkit.Chem import Crippen,Descriptors

# Build a filter catalog that contains only the PAINS substructure filters.
param = FilterCatalog.FilterCatalogParams()
param.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS)
# NOTE(review): `filter` shadows the Python builtin; the name is retained in
# case other cells rely on it.
filter = FilterCatalog.FilterCatalog(param)

# NOTE(review): "aftercluster.csv" is not written anywhere in the visible
# notebook (the clustering cell writes "select_cluster.csv"); presumably it
# is prepared separately. Confirm its provenance.
df = pd.read_csv("aftercluster.csv")

pains = []  # True when the molecule matches any PAINS filter, None if unparsable
logps = []  # Crippen logP, NaN if unparsable
for smi in df.smiles:
    # Chem.MolFromSmiles returns None for SMILES it cannot parse, and a
    # missing cell is a float NaN; both would make the calls below fail.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
        # Keep the row so the new columns stay aligned with `df`.
        logps.append(float("nan"))
        pains.append(None)
        continue
    logps.append(Crippen.MolLogP(mol))
    pains.append(filter.HasMatch(mol))

df['logp'] = logps
df['pain'] = pains
df.to_csv('finalpharm.csv')

