In [17]:
!pip3 install tqdm


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [44]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np 
import pandas as pd
from rdkit import Chem 
import matplotlib.pyplot as plt
from tqdm import tqdm
from rdkit.Chem import AllChem

In [53]:
def generate_morgan_fingerprints(data_frame, is_train = True):
    
    data_frame['molecules'] = data_frame['Chemical_Compound'].apply(lambda smile: Chem.MolFromSmiles(smile))
    
    if is_train:
        data_frame.dropna(axis=0,inplace=True) #dropping silicon based elemets for train dataset
        data_frame.reset_index(drop=True, inplace=True)
    
    onBits = {}
    
    morgan_fp = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, bitInfo= onBits)) for mol in tqdm(data_frame['molecules'])
    ]
    
    data_frame.drop(['Chemical_Compound', 'molecules'], axis=1, inplace=True)
    
    COLUMNS = [f'FP{i}' for i in range(0, 1024)]
    
    finger_prints = pd.DataFrame(data=morgan_fp, columns=COLUMNS)
    
    morgan_fp_data = pd.concat([data_frame, finger_prints], axis=1)
    
    return morgan_fp_data

In [57]:
train_data = pd.read_csv("./data/extracted_compounds.csv")

In [58]:
len(train_data)

75383

In [54]:
mog_train = generate_morgan_fingerprints(train_data,False)

100%|████████████████████████████████████| 75377/75377 [01:42<00:00, 732.47it/s]


In [55]:
len(mog_train)

75383

In [59]:
mog_train.to_csv("Morgan_fingers_train.csv",index=False)

In [35]:
test_data = pd.read_csv("./data/test_data_new.csv")

In [36]:
test_data

Unnamed: 0.1,Unnamed: 0,x,Chemical_Compound,assay_ids
0,0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30
...,...,...,...,...
10989,10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,38
10990,10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,34
10991,10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1640
10992,10992,COP(=O)(OC)OC=C(Cl)Cl;28,COP(=O)(OC)OC=C(Cl)Cl,28


In [37]:
mog_test = generate_morgan_fingerprints(test_data)


  0%|                                                 | 0/10994 [00:00<?, ?it/s][A
  1%|▎                                      | 74/10994 [00:00<00:14, 733.60it/s][A
  1%|▌                                     | 148/10994 [00:00<00:14, 734.72it/s][A
  2%|▊                                     | 222/10994 [00:00<00:14, 733.39it/s][A
  3%|█                                     | 296/10994 [00:00<00:15, 696.65it/s][A
  3%|█▎                                    | 367/10994 [00:00<00:15, 698.87it/s][A
  4%|█▌                                    | 440/10994 [00:00<00:14, 708.79it/s][A
  5%|█▊                                    | 514/10994 [00:00<00:14, 717.29it/s][A
  5%|██                                    | 588/10994 [00:00<00:14, 723.47it/s][A
  6%|██▎                                   | 662/10994 [00:00<00:14, 727.14it/s][A
  7%|██▌                                   | 736/10994 [00:01<00:14, 729.90it/s][A
  7%|██▊                                   | 810/10994 [00:01<00:13, 731.25

 65%|████████████████████████▏            | 7174/10994 [00:09<00:05, 731.90it/s][A
 66%|████████████████████████▍            | 7248/10994 [00:09<00:05, 732.57it/s][A
 67%|████████████████████████▋            | 7322/10994 [00:09<00:05, 732.49it/s][A
 67%|████████████████████████▉            | 7396/10994 [00:10<00:04, 733.39it/s][A
 68%|█████████████████████████▏           | 7470/10994 [00:10<00:04, 734.25it/s][A
 69%|█████████████████████████▍           | 7544/10994 [00:10<00:04, 734.56it/s][A
 69%|█████████████████████████▋           | 7618/10994 [00:10<00:04, 731.95it/s][A
 70%|█████████████████████████▉           | 7692/10994 [00:10<00:04, 730.30it/s][A
 71%|██████████████████████████▏          | 7766/10994 [00:10<00:04, 730.06it/s][A
 71%|██████████████████████████▍          | 7840/10994 [00:10<00:04, 731.87it/s][A
 72%|██████████████████████████▋          | 7914/10994 [00:10<00:04, 733.86it/s][A
 73%|██████████████████████████▉          | 7988/10994 [00:10<00:04, 734.71i

In [38]:
mog_test.to_csv("Morgan_fingers_test.csv",index=False)

In [39]:
mog_test

Unnamed: 0.1,Unnamed: 0,x,assay_ids,FP0,FP1,FP2,FP3,FP4,FP5,FP6,...,FP1014,FP1015,FP1016,FP1017,FP1018,FP1019,FP1020,FP1021,FP1022,FP1023
0,0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,1682,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,1850,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,1,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10989,10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,38,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
10990,10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,34,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10991,10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,1640,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10992,10992,COP(=O)(OC)OC=C(Cl)Cl;28,28,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
