In [7]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
import pickle
from tqdm import tqdm
tqdm.pandas()
from rdkit.Chem import DataStructs
from sklearn.impute import KNNImputer
from feature_engine.selection import DropCorrelatedFeatures 
import pickle

In [3]:
# Load the grouped table, replace the stored index with a fresh 0..n-1 index,
# and drop the 'Is_organic' and 'num_fragments' columns.
grouped_table = (
    pd.read_csv('grouped_table.csv', index_col=0)
    .reset_index(drop=True)
    .drop(columns=['Is_organic', 'num_fragments'])
)
# Parse every SMILES string into an RDKit molecule object, element by element.
grouped_table['mol'] = grouped_table['SMILES'].map(Chem.MolFromSmiles)

In [4]:
def mols_to_descriptors(list_of_mols):
    """Evaluate every descriptor in ``Descriptors.descList`` on each molecule.

    Parameters
    ----------
    list_of_mols : iterable
        Molecules to featurise. Each item is passed unchanged to every
        descriptor function.

    Returns
    -------
    pandas.DataFrame
        One row per molecule (input order preserved) and one column per
        descriptor name. A cell is NaN when the descriptor function raised
        an exception for that molecule.
    """
    # Snapshot the (name, function) pairs once so the column set and the inner
    # loop are guaranteed to walk the same descriptors.
    descriptors = list(Descriptors.descList)
    descriptor_dict = {name: [] for name, _ in descriptors}
    for mol in tqdm(list_of_mols):
        for name, func in descriptors:
            try:
                value = func(mol)
            except Exception:
                # Best-effort featurisation: a descriptor that fails on this
                # molecule becomes a missing value. Only ``Exception`` is
                # caught so that KeyboardInterrupt / SystemExit still stop
                # the (long-running) loop instead of being recorded as NaN.
                value = np.nan
            descriptor_dict[name].append(value)
    return pd.DataFrame(descriptor_dict)

In [5]:
# Build the descriptor table: one row per molecule in grouped_table['mol'],
# one column per RDKit descriptor (NaN where a descriptor could not be computed).
desc_df = mols_to_descriptors(grouped_table['mol'])

100%|██████████| 7104/7104 [00:40<00:00, 175.44it/s]


In [6]:
# Fit feature_engine's DropCorrelatedFeatures (threshold 0.9) on the descriptor
# table and keep only the columns it retains; the column count is printed
# before and after.
# NOTE(review): this cell overwrites desc_df, so running it a second time
# refits the dropper on the already-reduced table.
print('Before dropping', len(desc_df.columns))
dropper = DropCorrelatedFeatures(threshold=0.9)
dropper.fit(desc_df)
desc_df = dropper.transform(desc_df)
print('After dropping', len(desc_df.columns))

Before dropping 208
After dropping 156


In [10]:
# Serialise the fitted dropper's list of removed features to disk
# (presumably so the same columns can be removed from other data later).
with open('features_to_drop.pickle', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(dropper.features_to_drop_)

In [19]:
def mols_to_fingerprints(list_of_mols, radius=3, n_bits=2048):
    """Compute Morgan fingerprint bit vectors for each molecule.

    Parameters
    ----------
    list_of_mols : iterable
        RDKit molecules to fingerprint.
    radius : int, default 3
        Morgan radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 2048
        Length of the bit vector. The same value is used for the RDKit call,
        the destination array and the column names so they cannot drift apart.

    Returns
    -------
    pandas.DataFrame
        One row per molecule (input order preserved) with float columns
        ``fp0`` .. ``fp{n_bits - 1}``. An empty input yields a frame with
        zero rows and the same columns.
    """
    fp_list = []
    for mol in tqdm(list_of_mols):
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        dest_array = np.zeros(n_bits)
        # Copies the bit vector into the pre-allocated numpy array in place.
        DataStructs.ConvertToNumpyArray(fp, dest_array)
        fp_list.append(dest_array)
    columns = ['fp{}'.format(i) for i in range(n_bits)]
    if not fp_list:
        # np.stack raises on an empty list; return an empty, correctly shaped table.
        return pd.DataFrame(np.empty((0, n_bits)), columns=columns)
    return pd.DataFrame(np.stack(fp_list), columns=columns)

In [20]:
# Build the fingerprint table: one row per molecule in grouped_table['mol'],
# columns fp0..fp2047 holding the Morgan fingerprint bits.
fp_df = mols_to_fingerprints(grouped_table['mol'])

100%|██████████| 7104/7104 [00:00<00:00, 9218.16it/s]


In [22]:
# Fill the missing descriptor values with a default-configured KNNImputer.
# fit_transform returns a plain array, so the column labels are captured first
# and re-attached afterwards.
# NOTE(review): this assumes the imputer returns as many columns as it was
# given - confirm for tables that contain an all-NaN column.
desc_df_columns = desc_df.columns
imputer = KNNImputer()
desc_df = pd.DataFrame(
    data=imputer.fit_transform(desc_df),
    columns=desc_df_columns,
)

In [24]:
# Serialise the fitted KNN imputer to disk
# (presumably so it can be reused on other data later).
with open('imputer.pickle', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(imputer)

In [23]:
# Join the source table, the imputed descriptors and the fingerprint bits
# side by side; rows are aligned on the index.
concatenated_table = pd.concat(
    objs=[grouped_table.reset_index(drop=True), desc_df, fp_df],
    axis='columns',
)

In [171]:
# Write the combined table (source columns + descriptors + fingerprints) to CSV.
# NOTE(review): as the cells are written, the 'mol' column of RDKit objects is
# part of this table and will be saved as text, not as a re-loadable molecule -
# consider dropping it before saving.
# NOTE(review): the filename is spelled 'desriptors'; left unchanged because
# other code may read this exact path.
concatenated_table.to_csv('table_with_desriptors.csv')