In [8]:
import numpy as np
import pandas as pd
from scipy import sparse as ss
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Load the tab-separated table of building-block pairs. The file is read with
# header=None, so the column names are supplied explicitly.
df = pd.read_csv(
    '/projects/ML-SpaceDock/data/data_BBs_pairs_polar_CACHE4.tsv',
    sep='\t',
    header=None,
    names=['name_1', 'smiles_1', 'name_2', 'smiles_2', 'IFP', 'IFP_polar', 'reaction'],
)

In [10]:
# Display the loaded table for a quick visual check.
df

Unnamed: 0,name_1,smiles_1,name_2,smiles_2,IFP,IFP_polar,reaction
0,EN300-1273273_i001,COc1cc(c(c(c1)Br)O)C(=O)[O-],EN300-80433_i001,c1ccc(c(c1)CBr)OC(F)F,0.333333,0.0,Negishi
1,EN300-35822484_i001,C1C(CC1(F)F)(CF)CBr,EN300-8688089_i001,c1cc(cc(c1)[C@@H]2CC[NH2+]C2)C#N,0.550000,0.0,Grignard_Nitrile
2,EN300-53744_i001,c1cc(ccc1CN)[NH+]2CCOCC2,EN300-310211_i001,Cc1nnc2n1C[C@@H]([N@@](C2)C)C(=O)[O-],0.500000,0.0,Amide
3,EN300-7437778_i001,C1CN(CC[NH2+]1)c2[nH]nc(n2)Br,EN300-1620610_i001,CC1(CC[C@@H](C1)CC(=O)[O-])C,0.450000,0.0,Amide
4,EN300-317553_i001,Cc1ccc(cc1CNC(=O)OC(C)(C)C)Br,EN300-249270_i001,CC(c1cccc(c1)C#N)(F)F,0.523810,0.2,Grignard_Nitrile
...,...,...,...,...,...,...,...
999995,EN300-201019_i001,COCC1(CCC1)C[NH3+],EN300-313500_i001,CC(C)n1cc(c(n1)C2CCOCC2)C(=O)[O-],0.000000,0.0,Amide
999996,EN300-25616_i002,Cc1ccc(cc1)[C@H](C2CC2)[NH3+],EN300-11663_i002,C[C@H](C(=O)c1ccc(cc1)NS(=O)(=O)C)Cl,0.000000,0.0,Reductive_Amination
999997,EN300-1707771_i001,CCn1cnnc1CCl,EN300-91321_i002,Cc1ccc(cc1)[N@@H+]2CCC[C@H](C2=O)Br,0.000000,0.0,Negishi
999998,EN300-14055_i001,c1ccc(cc1)CNC(=O)C2CC[NH2+]CC2,EN300-92028_i001,Cc1c(cn(n1)c2ccc(cc2)Cl)C(=O)[O-],0.000000,0.0,Amide


In [11]:
# Minimum 'IFP' and 'IFP_polar' values a pair must reach to be labelled a hit.
q, q_polar = 0.6, 0.25

In [12]:
# A pair is a hit only when both scores reach their respective thresholds.
df['hit'] = df['IFP'].ge(q) & df['IFP_polar'].ge(q_polar)

In [16]:
# Count the rows flagged as hits (True values sum as 1).
df['hit'].sum()

4010

In [13]:
# Persist the table in its current state as a tab-separated file, without the
# row index.
df.to_csv(
    '/projects/ML-SpaceDock/1M_CACHE4/data_1M_CACHE4_pairs.tsv',
    sep='\t',
    index=False,
)

In [7]:
# Keyword arguments forwarded to rdmd.GetMorganFingerprintAsBitVect for every
# molecule; nBits is the length of each per-molecule bit vector.
parameters = dict(
    radius=2,
    nBits=2048,
    invariants=[],
    fromAtoms=[],
    useChirality=True,
    useBondTypes=True,
    useFeatures=False,
)

In [27]:
# One-hot encoder for the 'reaction' column, producing a dense int8 matrix.
en = OneHotEncoder(dtype=np.int8, sparse_output=False)
en.fit(df.loc[:, ['reaction']])

In [28]:
# Encode every row's reaction label with the fitted encoder.
reaction_features = en.transform(df.loc[:, ['reaction']])

In [29]:
# Pre-allocate the descriptor matrix: one row per pair in `df`, and two
# nBits-wide segments per row (the AND and XOR of the two fingerprints).
# The shape is derived from the data and fingerprint settings rather than
# hard-coded, so a different input size or bit width cannot cause an
# IndexError or leave unfilled all-zero rows behind.
fps = np.zeros((len(df), 2 * parameters["nBits"]), dtype=np.int8)

In [31]:
# Fill `fps` row by row: each pair is described by the bitwise AND (bits shared
# by both building blocks) followed by the bitwise XOR (bits set in exactly one
# of them) of the two Morgan fingerprints.
# NOTE(review): GetMorganFingerprintAsBitVect appears to be deprecated in recent
# RDKit releases in favour of rdFingerprintGenerator; confirm before upgrading.
for i, (smi1, smi2) in tqdm(
    enumerate(zip(df['smiles_1'], df['smiles_2'])),
    total=len(df),  # enumerate/zip have no length, so tqdm needs it for a bar/ETA
    desc='Generating descriptors',
    unit='pairs',
):
    mol1 = Chem.MolFromSmiles(smi1)
    mol2 = Chem.MolFromSmiles(smi2)

    # MolFromSmiles returns None for SMILES it cannot parse; fail with the
    # offending row instead of an opaque argument error from the fingerprint call.
    if mol1 is None or mol2 is None:
        raise ValueError(
            f"Row {i}: could not parse SMILES pair ({smi1!r}, {smi2!r})"
        )

    fp1 = np.array(rdmd.GetMorganFingerprintAsBitVect(mol1, **parameters)).astype(np.int8)
    fp2 = np.array(rdmd.GetMorganFingerprintAsBitVect(mol2, **parameters)).astype(np.int8)

    fp_ = np.hstack([np.logical_and(fp1, fp2), np.logical_xor(fp1, fp2)])

    # The boolean values are cast to the int8 dtype of `fps` on assignment.
    fps[i] = fp_

Generating descriptors: 1000000pairs [26:25, 630.65pairs/s]


In [32]:
# Store the fingerprint-only descriptors as a compressed-sparse-row matrix.
ss.save_npz(
    '/projects/ML-SpaceDock/1M_CACHE4/1M_CACHE4_fingerprints_no_reaction.npz',
    ss.csr_matrix(fps),
)

In [34]:
# Append the one-hot reaction columns to the fingerprint columns.
# Only the first 2 * nBits columns of `fps` are kept before stacking, so
# re-running this cell does not append the reaction columns a second time.
fps = np.hstack([fps[:, :2 * parameters["nBits"]], reaction_features])

In [36]:
# Store the descriptors (fingerprints plus reaction one-hot columns) as a
# compressed-sparse-row matrix.
ss.save_npz(
    '/projects/ML-SpaceDock/1M_CACHE4/1M_CACHE4_fingerprints_with_reaction.npz',
    ss.csr_matrix(fps),
)

In [17]:
# Integer training labels: 1 where the pair is a hit, 0 otherwise.
y = np.where(df['hit'].to_numpy(), 1, 0)

In [18]:
# Write the label vector to disk in NumPy's .npy format.
np.save(
    '/projects/ML-SpaceDock/1M_CACHE4/training_labels.npy',
    y,
)