In [2]:
import numpy as np
import pandas as pd
from scipy import sparse as ss
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd
from tqdm import tqdm

## Read the dataframe

In [3]:
#Load the tab-separated table of reagent pairs; each row carries the names and
#SMILES of both reagents plus the IFP and IFP_polar scores of the pair.
df = pd.read_csv('../data/DRD3/1M_DRD3_pairs.tsv', sep='\t')
df

Unnamed: 0,name_1,smiles_1,name_2,smiles_2,IFP,IFP_polar
0,EN300-7418644_i001,C[C@@H](c1ccc(cc1)C#N)[NH3+],EN300-71903_i001,C1CCN([C@@H](C1)C(=O)[O-])C(=O)C2CCCC2,0.000000,0.0
1,EN300-7418644_i001,C[C@@H](c1ccc(cc1)C#N)[NH3+],EN300-219577_i001,c1c(onc1C(=O)[O-])C[NH+]2CCCC2,0.411765,0.0
2,EN300-7418644_i001,C[C@@H](c1ccc(cc1)C#N)[NH3+],EN300-6830151_i001,c1c(c(no1)C(=O)[O-])C2CC2,0.000000,0.0
3,EN300-7418644_i001,C[C@@H](c1ccc(cc1)C#N)[NH3+],EN300-1601163_i004,C1C[C@@H](C[C@@H]1CC(=O)[O-])O,0.000000,0.0
4,EN300-7418644_i001,C[C@@H](c1ccc(cc1)C#N)[NH3+],EN300-202201_i001,CCCc1c2c(cc(nc2on1)CC)C(=O)[O-],0.500000,0.5
...,...,...,...,...,...,...
999995,EN300-1237271_i001,C1C[NH2+]C[C@@H]1OCC2CC2,EN300-51359_i001,C1C[C@H](Cn2c1nnc2C3CC3)C(=O)[O-],0.000000,0.0
999996,EN300-1237271_i001,C1C[NH2+]C[C@@H]1OCC2CC2,EN300-57589_i003,C[N@H+]1CCC[C@H](C1)C(=O)[O-],0.000000,0.0
999997,EN300-1237271_i001,C1C[NH2+]C[C@@H]1OCC2CC2,EN300-343801_i002,C[C@@H](CCC(=O)[O-])[NH+](C)C,0.000000,0.0
999998,EN300-1237271_i001,C1C[NH2+]C[C@@H]1OCC2CC2,EN300-244267_i001,c1c(c2n(n1)CCC2)C(=O)[O-],0.000000,0.0


## Generate fingerprints 

In [4]:
#Keyword arguments forwarded unchanged (via **parameters) to
#rdmd.GetMorganFingerprintAsBitVect for every reagent.
parameters = dict(
    radius=2,            #Morgan radius
    nBits=2048,          #length of a single-reagent bit vector
    invariants=[],       #empty list is passed as-is
    fromAtoms=[],        #empty list is passed as-is
    useChirality=True,
    useBondTypes=True,
    useFeatures=False,
)

In [5]:
#Turn off the RDKit log channels matching 'rdApp.*' before the fingerprint loop
#runs over every pair (presumably to keep per-molecule RDKit messages out of
#the cell output - confirm this is the intent).
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [5]:
#Generate fingerprints for every pair of reagents, convert to sparse matrix
#
#Row i of the matrix describes the i-th (smiles_1, smiles_2) pair of df. The two
#Morgan bit vectors are combined as (fp1 AND fp2) followed by (fp1 XOR fp2), so
#a row is twice as wide as a single-reagent fingerprint.
n_pairs = len(df)
#Derive the width from the fingerprint settings instead of hard-coding 4096, so
#that changing parameters["nBits"] cannot leave the matrix shape out of sync.
n_columns = 2 * parameters["nBits"]

row_idx = list()
col_idx = list()

smiles_pairs = zip(df['smiles_1'], df['smiles_2'])
#zip() has no length, so total is passed explicitly to give tqdm a percentage/ETA
for count, (smi_1, smi_2) in enumerate(tqdm(smiles_pairs, total=n_pairs, desc='Generating fingerprints', unit='fp')):

    mol1 = Chem.MolFromSmiles(smi_1)
    mol2 = Chem.MolFromSmiles(smi_2)
    #MolFromSmiles returns None for SMILES it cannot parse; stop here and name
    #the offending row instead of failing opaquely inside the fingerprint call.
    if mol1 is None or mol2 is None:
        raise ValueError(f'Could not parse SMILES in row {count}: {smi_1!r} / {smi_2!r}')

    fp1 = rdmd.GetMorganFingerprintAsBitVect(mol1, **parameters)
    fp2 = rdmd.GetMorganFingerprintAsBitVect(mol2, **parameters)

    #shared bits first, differing bits second ('+' concatenates the bit vectors)
    fp_and = fp1 & fp2
    fp_xor = fp1 ^ fp2
    fp = fp_and + fp_xor

    onbits = list(fp.GetOnBits())
    #these bits all have the same row:
    row_idx.extend([count] * len(onbits))
    #and the column indices of those bits:
    col_idx.extend(onbits)

#generate a sparse matrix out of the row,col indices.
#The row count comes from df rather than max(row_idx)+1: the latter raises on an
#empty table and drops trailing pairs that have no on-bits, which would leave
#the matrix shorter than the label vector built from the same df.
fingerprint_matrix = ss.coo_matrix(
    (np.ones(len(row_idx), dtype=bool), (row_idx, col_idx)),
    shape=(n_pairs, n_columns),
)
#convert to csr matrix:
fingerprint_matrix = ss.csr_matrix(fingerprint_matrix)
#Save fingerprints as sparse matrix
#ss.save_npz('../data/DRD3/1M_DRD3_fingerprints.npz', fingerprint_matrix)

Generating fingerprints: 1000000fp [04:54, 3391.14fp/s]


## Generate training labels

In [6]:
#IFP similarity cut-offs: q is applied to the 'IFP' column and q_polar to the
#'IFP_polar' column when the training labels are built.
q, q_polar = 0.6, 0.5

In [7]:
#Binary training label per pair: 1 (hit) when both the IFP and the IFP_polar
#scores reach their thresholds, 0 otherwise.
hit_mask = (df['IFP'] >= q) & (df['IFP_polar'] >= q_polar)
y = np.where(hit_mask, 1, 0)
#Save labels as np array
#np.save('../data/DRD3/1M_DRD3_labels.npy', y)