In [1]:
import random
import pandas as pd
import numpy as np
from scipy import sparse as ss
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem import DataStructs
from tqdm import tqdm

## Read the building blocks and create an index for all pairs

In [27]:
# Load the amine and acid building-block tables (tab-separated files).
# NOTE(review): these come from the DRD3 directory, whereas the hits read
# further down come from CBL-B -- confirm this is intended.
amines_df = pd.read_csv('../data/DRD3/amines.tsv', sep='\t')
acids_df = pd.read_csv('../data/DRD3/acids.tsv', sep='\t')

## Read hits and get indexes of hits

In [14]:
# Load the hit table (tab-separated) and show it as the cell output.
df_hits = pd.read_csv('../data/CBL-B/hits.tsv', sep='\t')
df_hits

Unnamed: 0,bb1,bb2,reaction,IFP,IFP_polar
0,EN300-1878549_i001,EN300-11626_i001,Amide,0.764706,0.50
1,EN300-3575340_i001,EN300-1589742_i001,Amide,0.647059,0.50
2,EN300-3575340_i002,EN300-7455807_i002,Amide,0.631579,0.50
3,EN300-3575340_i002,EN300-384067_i001,Amide,0.666667,0.50
4,EN300-3575340_i002,EN300-343493_i001,Amide,0.631579,0.50
...,...,...,...,...,...
7702851,EN300-103503_i002,EN300-12411_i001,Williamson_ether,0.611111,0.75
7702852,EN300-103503_i002,EN300-10699143_i001,Williamson_ether,0.611111,0.50
7702853,EN300-103503_i003,EN300-10547_i002,Williamson_ether,0.666667,0.50
7702854,EN300-103503_i003,EN300-7354512_i001,Williamson_ether,0.611111,0.50


In [15]:
def get_hits_idxs(df_hits, q):
    """Translate hits with ``IFP >= q`` into flat indices of building-block pairs.

    For every reaction present in the filtered hits, each (bb1, bb2) pair is
    mapped to ``row(bb1) * n_bb2 + row(bb2)`` and then shifted by
    ``reaction_borders[r] - pairs_per_reaction[r] + 1``.
    NOTE(review): that shift is presumably the first global index of the
    reaction's block of pairs -- confirm against where ``reaction_borders``
    is built.

    Relies on notebook-level globals: ``reactions_names``, ``reaction_rules``,
    ``bbs``, ``reaction_borders`` and ``pairs_per_reaction``.

    Parameters
    ----------
    df_hits : pandas.DataFrame
        Needs the columns ``'IFP'``, ``'reaction'``, ``'bb1'`` and ``'bb2'``.
    q : float
        Inclusive lower bound on the ``'IFP'`` column.

    Returns
    -------
    numpy.ndarray
        1-D ``int64`` array of pair indices, grouped by reaction in the order
        produced by ``groupby('reaction')``. Empty when no hit passes ``q``.

    Raises
    ------
    KeyError
        If a hit references a building-block name that is absent from the
        corresponding table in ``bbs``.
    """
    selected = df_hits[df_hits['IFP'] >= q]
    chunks = []
    for name, group in selected.groupby('reaction'):

        reaction_n = reactions_names[name]

        bb1_names = pd.Index(bbs[reaction_rules[reaction_n][0]]['name'])
        bb2_names = pd.Index(bbs[reaction_rules[reaction_n][1]]['name'])

        # Force int64 so the product below cannot overflow on platforms where
        # the native index type is 32-bit.
        bb1_idxs = bb1_names.get_indexer(group['bb1']).astype(np.int64)
        bb2_idxs = bb2_names.get_indexer(group['bb2']).astype(np.int64)

        # get_indexer marks unknown names with -1; letting that flow into the
        # arithmetic would silently yield indices of unrelated pairs.
        unknown = (bb1_idxs < 0) | (bb2_idxs < 0)
        if unknown.any():
            sample = group.loc[unknown, ['bb1', 'bb2']].head(5).values.tolist()
            raise KeyError(
                f"{int(unknown.sum())} hit(s) of reaction {name!r} reference "
                f"building blocks missing from bbs, e.g. {sample}"
            )

        idxs_local = bb1_idxs * len(bb2_names) + bb2_idxs
        idxs = idxs_local + reaction_borders[reaction_n] - pairs_per_reaction[reaction_n] + 1
        chunks.append(idxs)

    if not chunks:
        return np.array([], dtype=np.int64)
    # Concatenate once instead of re-copying the accumulated array per reaction.
    return np.concatenate(chunks).astype(np.int64)

In [16]:
# Derive the hit indices for each IFP cut-off.
# Persisting is currently disabled, so after the loop `hits_idxs` only holds
# the array for the last cut-off.
for q in (0.6, 0.7, 0.8, 0.9):
    hits_idxs = get_hits_idxs(df_hits=df_hits, q=q)
    # To persist each array, re-enable:
    # np.save(f'../data/CBL-B/hits_idxs_q_{q}.npy', hits_idxs)

## Generate fingerprints of building blocks

In [17]:
# Keyword arguments for the Morgan fingerprint call
# (unpacked into rdmd.GetMorganFingerprintAsBitVect).
parameters = dict(
    radius=2,
    nBits=2048,
    invariants=[],
    fromAtoms=[],
    useChirality=True,
    useBondTypes=True,
    useFeatures=False,
)

In [18]:
# Disable RDKit logging for every logger matching 'rdApp.*'.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [None]:
# For every building-block table in `bbs`, compute one Morgan bit-vector
# fingerprint per SMILES and save the resulting (rows, nBits) int8 matrix
# to '../data/CBL-B/bb_{n}.npy'.
for n, df in enumerate(bbs):
    # Matrix width comes from the fingerprint settings so the two cannot drift apart.
    fps = np.zeros((len(df), parameters["nBits"]), np.int8)
    for i, smiles in enumerate(tqdm(df['smiles'], desc="Calculating fingerprints", unit="fp")):

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None when parsing fails; stop with a
            # message that pinpoints the offending entry instead of failing
            # inside the fingerprint call.
            raise ValueError(
                f"Cannot parse SMILES {smiles!r} (building-block table {n}, row {i})"
            )
        fp = rdmd.GetMorganFingerprintAsBitVect(mol, **parameters)

        fps[i] = np.array(fp).astype(np.int8)

    np.save(f'../data/CBL-B/bb_{n}.npy', fps)

Calculating fingerprints: 100%|█████████| 75697/75697 [00:59<00:00, 1264.21fp/s]
Calculating fingerprints: 100%|█████████| 35406/35406 [00:28<00:00, 1231.24fp/s]
Calculating fingerprints: 100%|█████████████| 150/150 [00:00<00:00, 1083.71fp/s]
Calculating fingerprints: 100%|███████████| 4060/4060 [00:03<00:00, 1229.25fp/s]
Calculating fingerprints: 100%|█████████| 66153/66153 [00:52<00:00, 1249.61fp/s]
Calculating fingerprints: 100%|█████████| 17993/17993 [00:14<00:00, 1246.70fp/s]
Calculating fingerprints: 100%|█████████| 30243/30243 [00:23<00:00, 1261.05fp/s]
Calculating fingerprints: 100%|█████████| 14532/14532 [00:11<00:00, 1227.81fp/s]
Calculating fingerprints:  58%|█████▎   | 16827/28785 [00:13<00:09, 1225.02fp/s]