# Feature engineering

In this notebook, we will try a few different methods of featurizing the structure data. 
1. Structural similarity profile (SSP): the Tanimoto similarity computed between every pair of drugs
2. Molecular Descriptor Profile [Check Package: mordred]
3. Molecular Autoencoder [Automatic chemical design using a data-driven continuous representation of molecules.]
4. Mol2vec: [Check Paper: Mol2vec: unsupervised machine learning approach with chemical intuition]


In [51]:
import rdkit as rd
import pandas as pd
import csv 
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import Chem
from rdkit.Chem import AllChem 
import numpy as np

In [52]:
# Load the processed molecular-feature table and give the unnamed first column a real name.
molecular_features = pd.read_csv(
    "/Users/shristi/Documents/InsightDataProject/Data/ProcessedData/molecular_features.csv"
).rename(columns={'Unnamed: 0': 'DrugName'})

# Keep only the name/SMILES pair and report how many values are missing per column.
smiles_strings = molecular_features[['DrugName', 'SMILES']]
print(len(smiles_strings))
print(smiles_strings.isna().sum())

# Drop rows with a missing value, then index the remaining rows by drug name.
smiles_strings = smiles_strings.dropna().set_index('DrugName')
print(len(smiles_strings))

8226
DrugName      0
SMILES      774
dtype: int64
7452


In [53]:
# Preview the first rows of the name-indexed SMILES table.
# NOTE(review): in the saved output of this cell, the (truncated) SMILES shown for the
# first five drugs all look identical -- worth confirming the upstream table is correct.
smiles_strings.head()

Unnamed: 0_level_0,SMILES
DrugName,Unnamed: 1_level_1
Lepirudin,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...
Cetuximab,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...
Dornase alfa,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...
Denileukin diftitox,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...
Etanercept,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...


#### Now let's convert the SMILES to feature vectors

#### 1.  Structural similarity profile

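The structural similarity between two drugs is defined as the Tanimoto coefficient of their chemical fingerprints: the number of fingerprint features the two drugs have in common divided by the number of distinct features present in either drug, $T(A, B) = |A \cap B| / |A \cup B|$. A drug's structural similarity profile is the vector of its similarities to every drug in the reference list.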

In [15]:
def get_profile(drug):
    """Parse a SMILES string into an RDKit molecule with explicit hydrogens.

    Parameters
    ----------
    drug : str
        SMILES representation of the molecule.

    Returns
    -------
    The molecule returned by ``Chem.AddHs`` for the parsed SMILES, or ``None``
    when ``drug`` is not a string (e.g. ``None`` or a pandas NaN) or when
    ``Chem.MolFromSmiles`` cannot parse it.
    """
    # Missing values reach this function as None/NaN; report them the same way
    # as an unparseable SMILES instead of letting the parser raise.
    if not isinstance(drug, str):
        return None

    molecule = Chem.MolFromSmiles(drug)
    if molecule is None:
        return None
    return Chem.AddHs(molecule)

def compute_similarity(profile_1, profile_2, radius=2):
    """Return the Tanimoto similarity of two molecules' Morgan fingerprints.

    The notebook defines structural similarity as shared fingerprint features
    divided by all features of the two drugs (Tanimoto), so the Tanimoto
    coefficient is used here rather than the Dice coefficient.

    Parameters
    ----------
    profile_1, profile_2 :
        Molecules as produced by ``get_profile``; either may be ``None``.
    radius : int, optional
        Morgan fingerprint radius passed to ``AllChem.GetMorganFingerprint``.
        Defaults to 2.

    Returns
    -------
    The similarity score, or ``None`` when either molecule or either
    fingerprint is missing.
    """
    # Guard before fingerprinting: the fingerprint call cannot accept None.
    if profile_1 is None or profile_2 is None:
        return None

    fps_1 = AllChem.GetMorganFingerprint(profile_1, radius)
    fps_2 = AllChem.GetMorganFingerprint(profile_2, radius)
    if fps_1 is None or fps_2 is None:
        return None

    return DataStructs.TanimotoSimilarity(fps_1, fps_2)

def calculate_tanimoto_similarity(input_drug_list, drugbank_list):
    """Score every drug in ``drugbank_list`` against every drug in ``input_drug_list``.

    Both arguments are DataFrames indexed by drug name whose first column holds
    the SMILES string. Scores come from ``compute_similarity``.

    Parameters
    ----------
    input_drug_list : pandas.DataFrame
        Drugs to compare against; first column is the SMILES string.
    drugbank_list : pandas.DataFrame
        Reference drugs; first column is the SMILES string.

    Returns
    -------
    dict
        ``{drugbank_drug: {input_drug: score}}``. A reference drug whose SMILES
        cannot be parsed maps to an empty dict; input drugs whose SMILES cannot
        be parsed, and pairs for which no score is returned, are omitted.
    """
    # Parse each input SMILES once up front. Doing this inside the nested loop
    # would repeat the same parse for every reference drug.
    input_profiles = []
    for input_name, input_smiles in zip(input_drug_list.index, input_drug_list.iloc[:, 0]):
        input_profile = get_profile(input_smiles)
        if input_profile is not None:
            input_profiles.append((input_name, input_profile))

    structure_similarity_scores = {}
    for curr_drug, smiles in zip(drugbank_list.index, drugbank_list.iloc[:, 0]):
        # Every reference drug gets an entry, even if nothing can be scored for it.
        scores = structure_similarity_scores[curr_drug] = {}
        profile = get_profile(smiles)
        if profile is None:
            continue

        for curr_input_drug, input_profile in input_profiles:
            score = compute_similarity(input_profile, profile)
            if score is not None:
                scores[curr_input_drug] = score
    return structure_similarity_scores

# Score every drug against every other drug: the same table is passed as both the
# input list and the reference list, so the work grows with the square of its row count.
structure_similarities = calculate_tanimoto_similarity(smiles_strings, smiles_strings)

In [17]:
# Turn the nested {drug: {other_drug: score}} dict into a table: the outer keys become
# the columns and the inner keys become the row index.
structure_similarity_profiles = pd.DataFrame(structure_similarities)

In [19]:
# Save the similarity table as CSV.
# NOTE(review): this is an absolute, machine-specific path -- consider a configurable data directory.
structure_similarity_profiles.to_csv("/Users/shristi/Documents/InsightDataProject/Data/ProcessedData/structural_similarity_profiles.csv")

#### 2. Featurize using molecular fingerprints only (ECFP)



In [95]:
def featurize_smiles_mol_fingerprint(drug_list, radius=2, n_bits=2048):
    """Add a Morgan bit-vector fingerprint column to a table of SMILES strings.

    Parameters
    ----------
    drug_list : pandas.DataFrame
        Table indexed by drug name whose first column holds the SMILES string.
    radius : int, optional
        Morgan fingerprint radius. Defaults to 2.
    n_bits : int, optional
        Length of the bit vector. Defaults to 2048.

    Returns
    -------
    pandas.DataFrame
        A copy of ``drug_list`` with an extra ``'Molecular_Fingerprint'`` column
        holding one float array of length ``n_bits`` per row. Rows whose SMILES
        cannot be parsed get an all-zero array of the same length, so every
        row has a vector of identical shape. The input table is not modified.
    """
    fingerprints = []
    for smiles in drug_list.iloc[:, 0]:
        # Allocate at full length so an unparseable row is not left with a
        # shorter vector than the rows that were fingerprinted.
        arr = np.zeros((n_bits,))
        profile = get_profile(smiles)
        if profile is not None:
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(profile, radius, nBits=n_bits)
            DataStructs.ConvertToNumpyArray(fingerprint, arr)
        fingerprints.append(arr)

    # Work on a copy so the caller's table does not silently gain a column.
    featurized = drug_list.copy()
    featurized['Molecular_Fingerprint'] = fingerprints
    return featurized
# Compute a Morgan fingerprint vector for every drug in the SMILES table.
fingerprint_list = featurize_smiles_mol_fingerprint(smiles_strings)

In [106]:
# Preview the first rows of the table with its fingerprint column.
fingerprint_list.head()

Unnamed: 0_level_0,SMILES,Molecular_Fingerprint
DrugName,Unnamed: 1_level_1,Unnamed: 2_level_1
Lepirudin,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Cetuximab,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Dornase alfa,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Denileukin diftitox,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Etanercept,COCCOC[C@H](CC1(CCCC1)C(=O)N[C@H]1CC[C@H](CC1)...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [107]:
# Persist the featurized table as a pickle. The CSV export is left disabled;
# presumably because the array-valued column would be written out as text -- TODO confirm.
#fingerprint_list.to_csv('/Users/shristi/Documents/InsightDataProject/Data/ProcessedData/fingerprints.csv')
fingerprint_list.to_pickle('/Users/shristi/Documents/InsightDataProject/Data/ProcessedData/fingerprints.pkl')

#### 3. Featurize using mol2vec

In [21]:
from mol2vec import features
from mol2vec import helpers

In [None]:
## NEEDS DOING: 
def featurize_smiles_mol2vec(drug_list):
    """Work-in-progress featurizer intended to hold mol2vec embeddings.

    TODO: the mol2vec embedding itself is not implemented yet. For now this
    stores the Morgan bit-vector fingerprint (radius 2) of each molecule.

    Parameters
    ----------
    drug_list : pandas.DataFrame
        Table indexed by drug name whose first column holds the SMILES string.

    Returns
    -------
    pandas.DataFrame
        A copy of ``drug_list`` with an extra ``'fingerprints'`` column. Rows
        whose SMILES cannot be parsed hold ``None`` in that column. The input
        table is not modified.
    """
    drug_vectors = []
    for smiles in drug_list.iloc[:, 0]:
        profile = get_profile(smiles)
        if profile is not None:
            drug_vectors.append(AllChem.GetMorganFingerprintAsBitVect(profile, 2))
        else:
            drug_vectors.append(None)

    # Attach the collected vectors to a copy of the table; the list itself
    # cannot be indexed by column name.
    featurized = drug_list.copy()
    featurized['fingerprints'] = drug_vectors
    return featurized

# Run the work-in-progress mol2vec featurizer over the SMILES table.
mol2vec_features = featurize_smiles_mol2vec(smiles_strings)