In [43]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import Descriptors
from rdkit import Chem

In [3]:
# Load the generated candidates to analyse. The frame carries a `score`
# column and a `string` column (SMILES, parsed with RDKit further down).
df = pd.read_csv(
    'generated_drugs/high_pop_test.csv',
)
df

Unnamed: 0,score,string
0,-0.025151,CC1=C=CC(NC2CN(CCCC3=CC=CC(Cl)=C3)C2)CCCC1
1,-0.02772,C1(C2=CCN(CCCC3=CC=CC(Cl)=C3)C2)CCCC1
2,-0.030168,CC1(C2=CC=CC(Cl)=C2)C=CCCC1
3,-0.032306,ClC[NH+]1CCC(-C2CN(CCCC3=CC=CC(Cl)=C3)C2)CC1
4,-0.037851,CC1=CC=C=CC=CC(NC2/C(=O)N(CCCC3=CC=CC(Cl)=C3)C...
5,-0.042654,C[NH+]1CC=CC=C=CC(NC2/C(=O)N(CCCC3=CC=CC(Cl)=C...
6,-0.044624,C1(C2=CC=CC=C2)NC(=O)N(CCOC2=CCC=CC=CC(Cl)=C2)...
7,-0.046446,ClC[NH+]1CCC(NC2CC(=O)N(CCC3=CC(Cl)=C3)C2)CC1
8,-0.048542,CC1=C=CC(NC2/C(=O)N(CCCC3=CC=CC(Cl)=C3)C2)CCCC1
9,-0.049678,C[NH+]1CCC(NC2CC(=O)N(CCC3=CC(Cl)=C3)C2)CC1


In [4]:
# Parse every generated SMILES string into an RDKit molecule.
mols = [Chem.MolFromSmiles(smiles) for smiles in df['string']]
# MolFromSmiles reports a parse failure by returning None instead of raising.
# Stop here so the problem is reported at its source rather than inside a
# later fingerprint/descriptor call that receives the None.
assert all(mol is not None for mol in mols), (
    "Chem.MolFromSmiles returned None for at least one SMILES in df['string']"
)

In [5]:
# One 2048-bit fingerprint (path lengths 1-7) per generated molecule.
# Keep these settings identical to the other fingerprint cells: similarities
# are only comparable between fingerprints built the same way.
generated_fps = [
    FingerprintMols.FingerprintMol(
        mol,
        minPath=1,
        maxPath=7,
        fpSize=2048,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0.0,
        minSize=128,
    )
    for mol in mols
]

In [6]:
# DataStructs.BulkTanimotoSimilarity(fps[0], [fps[0], fps[0]])

In [7]:
# Reference ("original") dataset the generated molecules are compared against.
# low_memory=False parses the file in one pass, avoiding chunk-wise dtype
# inference on this large CSV.
og_df = pd.read_csv(
    '../unused/filtered_dataset.csv',
    low_memory=False,
)

In [8]:
# Parse the reference dataset's SMILES column into RDKit molecules.
og_mols = [
    Chem.MolFromSmiles(smiles)
    for smiles in og_df['PUBCHEM_EXT_DATASOURCE_SMILES']
]
# MolFromSmiles returns None for a SMILES it cannot parse. Check now so a bad
# row is reported here instead of failing part-way through the slow
# fingerprinting pass that consumes og_mols.
assert all(mol is not None for mol in og_mols), (
    "Chem.MolFromSmiles returned None for at least one SMILES in "
    "og_df['PUBCHEM_EXT_DATASOURCE_SMILES']"
)

In [9]:
# Fingerprint the whole reference set, with a tqdm progress bar.
# Settings must match those used for the molecules compared against og_fps.
og_fps = [
    FingerprintMols.FingerprintMol(
        mol,
        minPath=1,
        maxPath=7,
        fpSize=2048,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0.0,
        minSize=128,
    )
    for mol in tqdm(og_mols)
]

100%|██████████| 18234/18234 [00:46<00:00, 389.51it/s]


In [18]:
# For each generated molecule: Tanimoto similarity against every reference
# fingerprint, collapsed to a single mean value.
[
    np.average(DataStructs.BulkTanimotoSimilarity(query_fp, og_fps))
    for query_fp in generated_fps
]

[0.20509837417113258,
 0.19862037132431223,
 0.17843027853602703,
 0.17613834927432084,
 0.2353521941919275,
 0.23153580135107404,
 0.2550016527531748,
 0.18493233549404411,
 0.2346177504698924,
 0.18081902553741294,
 0.13857314436467733,
 0.2565360312159961,
 0.2289682813079061,
 0.22855783399301224,
 0.19240826867780986,
 0.21273923811734552,
 0.22823044048895327,
 0.20843186518400975,
 0.23990287082978173,
 0.23686680993559672]

In [28]:
# Gather every result file whose name contains 'custom_lipi' into one frame.
# os.listdir returns entries in arbitrary order, so the names are sorted:
# positional look-ups into naive_df (enumerate / .iloc below) then refer to
# the same molecule on every machine and every run.
naive_df = pd.concat(
    [
        pd.read_csv(os.path.join('./generated_drugs/', fname))
        for fname in sorted(os.listdir('./generated_drugs/'))
        if 'custom_lipi' in fname
    ],
    # Each CSV is read with its own 0-based index; renumber so the row labels
    # of the combined frame are unique.
    ignore_index=True,
)

In [29]:
# Parse the SMILES of the 'custom_lipi' runs into RDKit molecules.
naive_mols = [Chem.MolFromSmiles(smiles) for smiles in naive_df['string']]
# MolFromSmiles returns None for a SMILES it cannot parse; report that here
# rather than letting the None reach the fingerprinting cell.
assert all(mol is not None for mol in naive_mols), (
    "Chem.MolFromSmiles returned None for at least one SMILES in naive_df['string']"
)

In [30]:
# Fingerprints for the 'custom_lipi' molecules, built with the same settings
# as the reference fingerprints they are compared against.
naive_fps = [
    FingerprintMols.FingerprintMol(
        mol,
        minPath=1,
        maxPath=7,
        fpSize=2048,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0.0,
        minSize=128,
    )
    for mol in naive_mols
]

In [34]:
# (mean Tanimoto similarity to the reference set, row position) for each
# 'custom_lipi' molecule; the position allows looking the molecule up in
# naive_df afterwards.
[
    (np.average(DataStructs.BulkTanimotoSimilarity(query_fp, og_fps)), position)
    for position, query_fp in enumerate(naive_fps)
]

[(0.20153296598654852, 0),
 (0.16175403415671952, 1),
 (0.03560039907378216, 2),
 (0.2259865687539655, 3),
 (0.22864893072535805, 4),
 (0.2626413624156172, 5),
 (0.24773483027273033, 6),
 (0.3201097712814645, 7),
 (0.28188426467786, 8),
 (0.2913825062155849, 9),
 (0.2976504830493346, 10),
 (0.30723792241718983, 11),
 (0.05329304347947529, 12),
 (0.027355708615441978, 13),
 (0.1340586104697433, 14),
 (0.015452096953849812, 15),
 (0.048752687603629656, 16),
 (0.10976504364364313, 17),
 (0.053737429082165514, 18),
 (0.0613363268440576, 19),
 (0.25120019215826594, 20),
 (0.2781802069038573, 21),
 (0.023423918020319616, 22),
 (0.29503541045893517, 23),
 (0.2939376592410789, 24),
 (0.307828070649803, 25),
 (0.26125911618372133, 26),
 (0.2469799382309164, 27)]

In [35]:
# Show the SMILES at row position 7: in the recorded output of the previous
# cell that position has the highest mean similarity (~0.320).
# NOTE(review): the 7 is hard-coded and depends on the row order of naive_df.
naive_df['string'].iloc[7]

'O=C1C=C(C2=CC=CS2)CC(C2=CC=CS2)C1N1C=NC([N+](=O)[O-])=CCC1'

In [36]:
# Read the ZINC SMILES file into one string; it is split into individual
# entries in the next cell.
with open('./string_ga/ZINC_first_1000.smi') as smi_file:
    zinc = smi_file.read()

In [39]:
# Every whitespace-separated token of the ZINC file is treated as one SMILES.
zinc_mols = [Chem.MolFromSmiles(smiles) for smiles in zinc.split()]
# MolFromSmiles returns None for a token it cannot parse (e.g. a header or an
# ID column in the .smi file); report that here instead of in the
# fingerprinting cell.
assert all(mol is not None for mol in zinc_mols), (
    "Chem.MolFromSmiles returned None for at least one token of the ZINC file"
)

In [40]:
# Fingerprints for the ZINC molecules, built with the same settings as
# generated_fps so the two sets can be compared.
zinc_fps = [
    FingerprintMols.FingerprintMol(
        mol,
        minPath=1,
        maxPath=7,
        fpSize=2048,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0.0,
        minSize=128,
    )
    for mol in zinc_mols
]

In [41]:
# Mean Tanimoto similarity of each generated molecule to the ZINC sample.
[
    np.average(DataStructs.BulkTanimotoSimilarity(query_fp, zinc_fps))
    for query_fp in generated_fps
]

[0.20527712610728632,
 0.19808249182014168,
 0.17632610169305388,
 0.17710714239465658,
 0.2371368820717831,
 0.23256204200318376,
 0.25229583051470045,
 0.1913128684624799,
 0.2369196226394575,
 0.18811463454703223,
 0.14349863499473672,
 0.253811206670795,
 0.2303500612496228,
 0.23162480382144995,
 0.19219361935417908,
 0.20863028850175425,
 0.22733440998132795,
 0.21073077572033957,
 0.2411473932199905,
 0.2396645760165975]

In [51]:
# RDKit MolLogP descriptor for every generated molecule.
logps = [Descriptors.MolLogP(mol) for mol in mols]
# Rescale each value by shifting with -0.4 and dividing by 4.0.
# NOTE(review): -0.4 and 4.0 look like the lower bound and width of a target
# logP window - confirm against the scoring code and name them as constants.
[
    (logp - (-0.4)) / 4.0
    for logp in logps
]

[1.2476000000000012,
 1.2762000000000011,
 1.1844500000000007,
 0.7738999999999999,
 1.212300000000001,
 0.5068750000000002,
 1.2882750000000012,
 0.31832499999999986,
 1.129250000000001,
 0.1767000000000001,
 -0.1052499999999996,
 1.1907500000000009,
 0.4681000000000003,
 1.132000000000001,
 1.139625000000001,
 1.1772750000000007,
 0.9623000000000005,
 0.5938499999999999,
 1.145200000000001,
 0.6216250000000001]