In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm

In [2]:
# Read the three tab-separated DRD3 tables (amines, acids, hits) in that order.
df_amines, df_acids, df_hits = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../data/DRD3/amines.tsv',
        '../data/DRD3/acids.tsv',
        '../data/DRD3/hits.tsv',
    )
)

In [3]:
# Row counts of the two building-block tables; num_acids is used as the stride
# when an (amine, acid) pair is flattened into a single index.
num_amines, num_acids = len(df_amines), len(df_acids)

In [4]:
# Flat indices of hit (amine, acid) pairs for several IFP cut-offs.
# The np.save call below is commented out, so nothing is written to disk and only
# the arrays for the last threshold remain in the namespace after the loop.
q_polar = 0.5  # polar-IFP threshold; identical for every q, so set once

# Name -> row-position lookups do not depend on q: build them once instead of
# re-indexing both building-block tables on every iteration.
amine_row_by_name = df_amines.reset_index().set_index('name')['index']
acid_row_by_name = df_acids.reset_index().set_index('name')['index']

for q in [0.6, 0.7, 0.8, 0.9]:
    # Keep hits that pass both the overall and the polar IFP thresholds.
    df_ = df_hits[(df_hits['IFP'] >= q) & (df_hits['IFP_polar'] >= q_polar)]
    amines_idxs = np.array(amine_row_by_name.loc[df_['name_1']])
    acids_idxs = np.array(acid_row_by_name.loc[df_['name_2']])
    # Row-major flattening: pair (a, b) -> a * num_acids + b.
    hits_idxs = amines_idxs * num_acids + acids_idxs
    #np.save(f'../data/DRD3/hits_idxs_q_{q}.npy', hits_idxs)

## Generate fingerprints of building blocks

In [7]:
# Shared Morgan fingerprint generator used for both amines and acids.
# fpSize must match the column count of the fingerprint arrays filled below.
fp_gen = AllChem.GetMorganGenerator(
    radius=2,
    fpSize=2048,
    includeChirality=True,
    useBondTypes=True,
)

In [12]:
# Fingerprint every amine: one row per molecule, one int8 column per bit.
# The 2048 columns must match fpSize of fp_gen.
fps = np.zeros((len(df_amines), 2048), np.int8)
for i, smiles in enumerate(tqdm(df_amines['smiles'], desc="Calculating fingerprints", unit="fp")):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with the
        # offending row instead of an opaque error inside GetFingerprint.
        raise ValueError(f"Could not parse amine SMILES at row {i}: {smiles!r}")
    fp = fp_gen.GetFingerprint(mol)
    fps[i] = np.array(fp).astype(np.int8)

np.save('../data/DRD3/amines.npy', fps)

Calculating fingerprints: 100%|█████████████████████████████████████████████████████████████████| 33726/33726 [00:33<00:00, 1015.61fp/s]


In [13]:
# Fingerprint every acid: one row per molecule, one int8 column per bit.
# The 2048 columns must match fpSize of fp_gen.
fps = np.zeros((len(df_acids), 2048), np.int8)
for i, smiles in enumerate(tqdm(df_acids['smiles'], desc="Calculating fingerprints", unit="fp")):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with the
        # offending row instead of an opaque error inside GetFingerprint.
        raise ValueError(f"Could not parse acid SMILES at row {i}: {smiles!r}")
    fp = fp_gen.GetFingerprint(mol)
    fps[i] = np.array(fp).astype(np.int8)

np.save('../data/DRD3/acids.npy', fps)

Calculating fingerprints: 100%|█████████████████████████████████████████████████████████████████| 19887/19887 [00:19<00:00, 1003.10fp/s]


In [None]:
# Distribution of df_hits['tanimoto_IFP']: box plot on top, histogram with KDE below,
# sharing one x axis.
# NOTE(review): the hit-index cell filters on df_hits['IFP'] while this cell reads
# df_hits['tanimoto_IFP'] — confirm which column name hits.tsv actually provides.

# Set the seaborn style BEFORE creating the axes: style rcParams only affect axes
# created afterwards, so setting it later made a first run differ from a re-run.
sns.set(style="ticks")
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

sns.boxplot(data=df_hits['tanimoto_IFP'], orient="h", fliersize=1, ax=ax_box)
sns.histplot(data=df_hits['tanimoto_IFP'], bins=20, kde=True, kde_kws={'bw_adjust':5}, ax=ax_hist)

# Explicit axes instead of the pyplot "current axes"; the x axis is shared, so this
# limits both panels.
ax_hist.set_xlim(0.59, 1.01)
ax_box.set(yticks=[])
ax_hist.set_yticks([0, 500000, 1000000, 1500000, 2000000, 2500000, 3000000], ['0', '0.5M', '1M', '1.5M', '2M', '2.5M', '3M'], fontsize=12)
ax_hist.set_xticks([i*0.1 for i in range(6,11)], [round(i*0.1,1) for i in range(6,11)],  fontsize=12)
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)
ax_box.set(xlabel='')
ax_hist.set_xlabel('IFP Tanimoto similarity', fontsize=14)
ax_hist.set_ylabel('Number of molecules', fontsize=14)
#plt.savefig('/projects/ML-SpaceDock/pictures/DRD3_full_hits_ifp_distribution', dpi=300, bbox_inches='tight')
plt.show()