In [1]:
import numpy as np
import pandas as pd
from scipy import sparse as ss
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

## Read the dataframe

In [2]:
# Load the tab-separated table of reagent pairs, then show it as the cell output.
df = pd.read_csv(
    '../data/1M_CBL-B/1M_CBL-B_pairs.tsv',
    sep='\t',
)
df

Unnamed: 0,name_1,smiles_1,name_2,smiles_2,IFP,IFP_polar,reaction
0,EN300-1273273_i001,COc1cc(c(c(c1)Br)O)C(=O)[O-],EN300-80433_i001,c1ccc(c(c1)CBr)OC(F)F,0.333333,0.0,Negishi
1,EN300-35822484_i001,C1C(CC1(F)F)(CF)CBr,EN300-8688089_i001,c1cc(cc(c1)[C@@H]2CC[NH2+]C2)C#N,0.550000,0.0,Grignard_Nitrile
2,EN300-53744_i001,c1cc(ccc1CN)[NH+]2CCOCC2,EN300-310211_i001,Cc1nnc2n1C[C@@H]([N@@](C2)C)C(=O)[O-],0.500000,0.0,Amide
3,EN300-7437778_i001,C1CN(CC[NH2+]1)c2[nH]nc(n2)Br,EN300-1620610_i001,CC1(CC[C@@H](C1)CC(=O)[O-])C,0.450000,0.0,Amide
4,EN300-317553_i001,Cc1ccc(cc1CNC(=O)OC(C)(C)C)Br,EN300-249270_i001,CC(c1cccc(c1)C#N)(F)F,0.523810,0.2,Grignard_Nitrile
...,...,...,...,...,...,...,...
999995,EN300-201019_i001,COCC1(CCC1)C[NH3+],EN300-313500_i001,CC(C)n1cc(c(n1)C2CCOCC2)C(=O)[O-],0.000000,0.0,Amide
999996,EN300-25616_i002,Cc1ccc(cc1)[C@H](C2CC2)[NH3+],EN300-11663_i002,C[C@H](C(=O)c1ccc(cc1)NS(=O)(=O)C)Cl,0.000000,0.0,Reductive_Amination
999997,EN300-1707771_i001,CCn1cnnc1CCl,EN300-91321_i002,Cc1ccc(cc1)[N@@H+]2CCC[C@H](C2=O)Br,0.000000,0.0,Negishi
999998,EN300-14055_i001,c1ccc(cc1)CNC(=O)C2CC[NH2+]CC2,EN300-92028_i001,Cc1c(cn(n1)c2ccc(cc2)Cl)C(=O)[O-],0.000000,0.0,Amide


## Generate fingerprints

In [3]:
# Keyword arguments forwarded unchanged to every Morgan fingerprint call below.
parameters = dict(
    radius=2,
    nBits=2048,          # bit-vector length of a single fingerprint
    invariants=[],
    fromAtoms=[],
    useChirality=True,
    useBondTypes=True,
    useFeatures=False,
)

In [4]:
# One-hot encode the 'reaction' column into a sparse int8 matrix
# (one column per distinct reaction label seen in df).
en = OneHotEncoder(sparse_output=True, dtype=np.int8)
reaction_column = df[['reaction']]
reaction_features = en.fit_transform(reaction_column)

In [5]:
#Generate fingerprints for every pair of reagents, convert to sparse matrix
#
# Each row of the result describes one reagent pair as the concatenation
#   [ fp1 AND fp2 | fp1 XOR fp2 ]
# so a row is twice as wide as a single Morgan fingerprint.
n_pairs = len(df)
# Derive the width from the fingerprint settings instead of hard-coding 4096,
# so changing parameters["nBits"] cannot silently desynchronise the shape.
n_fp_cols = 2 * parameters["nBits"]

row_idx = list()
col_idx = list()

for count, (smi_1, smi_2) in enumerate(tqdm(zip(df['smiles_1'], df['smiles_2']),
                                            total=n_pairs,
                                            desc='Generating fingerprints',
                                            unit='fp')):

    mol1 = Chem.MolFromSmiles(smi_1)
    mol2 = Chem.MolFromSmiles(smi_2)
    # MolFromSmiles returns None for SMILES it cannot parse; fail with the
    # offending row instead of an opaque argument error in the fingerprint call.
    if mol1 is None or mol2 is None:
        raise ValueError(
            f"Row {count}: RDKit could not parse SMILES pair ({smi_1!r}, {smi_2!r})"
        )

    fp1 = rdmd.GetMorganFingerprintAsBitVect(mol1, **parameters)
    fp2 = rdmd.GetMorganFingerprintAsBitVect(mol2, **parameters)

    # Shared bits and differing bits; `+` on RDKit bit vectors concatenates them.
    fp_and = fp1 & fp2
    fp_xor = fp1 ^ fp2
    fp = fp_and + fp_xor

    onbits = list(fp.GetOnBits())
    #these bits all have the same row:
    row_idx.extend([count] * len(onbits))
    #and the column indices of those bits:
    col_idx.extend(onbits)

#generate a sparse matrix out of the row,col indices.
# The row count comes from the dataframe (not max(row_idx)+1) so the matrix
# always lines up with the other per-row features, even for empty input or
# trailing rows without any on-bits.
fingerprint_matrix = ss.coo_matrix(
    (np.ones(len(row_idx), dtype=bool), (row_idx, col_idx)),
    shape=(n_pairs, n_fp_cols),
)
#convert to csr matrix:
fingerprint_matrix = fingerprint_matrix.tocsr()

Generating fingerprints: 1000000fp [05:00, 3326.85fp/s]


In [6]:
# Place the one-hot reaction columns to the right of the fingerprint columns.
fps = ss.hstack(blocks=(fingerprint_matrix, reaction_features))
#Save fingerprints as sparse matrix
# NOTE(review): the disabled line below would save fingerprint_matrix, not the
# combined fps built above - confirm which one is intended before re-enabling.
#ss.save_npz('../data/1M_CBL-B/1M_CBL-B_fingerprints.npz', fingerprint_matrix)

## Generate training labels

In [7]:
# Minimum similarity for a pair to count as a hit:
# q applies to the 'IFP' column, q_polar to the 'IFP_polar' column.
q, q_polar = 0.6, 0.5

In [8]:
# Training labels as a NumPy integer array: 1 where a pair reaches both
# thresholds (IFP >= q and IFP_polar >= q_polar), 0 otherwise.
y = ((df['IFP'] >= q) & (df['IFP_polar'] >= q_polar)).astype(int).to_numpy()
#Save labels as np array
#np.save('../data/1M_CBL-B/1M_CBL-B_labels.npy', y)