# RDKit Feature Engineering

In [1]:
# IN TERMINAL:
# $source activate my-rdkit-env
# $pip install jupyter
# $pip install pandas

#Import packages
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

## FEATURE ENGINEERING - PART 1

In [20]:
def make_features(incsv, outcsv):
    '''
    Append RDKit descriptor columns f01-f13 to a CSV of molecules.

    INPUTS:
    incsv : string name for input csv file; its first column is read as the
            index and it must contain a `smiles` column
    outcsv: string name for output csv file with new features

    RETURNS:
    The string 'Finished' once the output file has been written.

    RAISES:
    ValueError if a SMILES string cannot be parsed into a molecule.
    '''
    df_read = pd.read_csv(incsv, index_col=0)

    # (column name, descriptor function) pairs, listed in output column order.
    # The column names must match the keys of `features` below; the earlier
    # version built keys f14-f39 here, so every append raised KeyError.
    descriptors = [
        ('f01', rdMolDescriptors.CalcExactMolWt),
        ('f02', rdMolDescriptors.CalcFractionCSP3),
        ('f03', rdMolDescriptors.CalcHallKierAlpha),
        ('f04', rdMolDescriptors.CalcKappa1),
        ('f05', rdMolDescriptors.CalcKappa2),
        ('f06', rdMolDescriptors.CalcKappa3),
        ('f07', rdMolDescriptors.CalcLabuteASA),
        ('f08', rdMolDescriptors.CalcNumAliphaticCarbocycles),
        ('f09', rdMolDescriptors.CalcNumAliphaticHeterocycles),
        ('f10', rdMolDescriptors.CalcNumAliphaticRings),
        ('f11', rdMolDescriptors.CalcNumAmideBonds),
        ('f12', rdMolDescriptors.CalcNumAromaticCarbocycles),
        ('f13', rdMolDescriptors.CalcNumRings),
    ]
    features = {key: [] for key, _ in descriptors}

    for row, smiles in enumerate(df_read.smiles):
        m = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail with
        # the offending input instead of an opaque error from the descriptor.
        if m is None:
            raise ValueError('Could not parse SMILES %r (row %d of %s)'
                             % (smiles, row, incsv))
        for key, calc in descriptors:
            features[key].append(calc(m))

    # Add new columns to dataframe
    for key, _ in descriptors:
        df_read[key] = features[key]

    df_read.to_csv(outcsv)
    return 'Finished'

## FEATURE ENGINEERING - PART 2

In [20]:
def make_features2(incsv, outcsv):
    '''
    Append RDKit descriptor columns f14-f39 to a CSV of molecules.

    INPUTS:
    incsv : string name for input csv file; its first column is read as the
            index and it must contain a `smiles` column
    outcsv: string name for output csv file with new features

    RETURNS:
    The string 'Finished' once the output file has been written.

    RAISES:
    ValueError if a SMILES string cannot be parsed into a molecule.
    '''
    df_read = pd.read_csv(incsv, index_col=0)

    # Output columns, in the order they are added to the dataframe.
    keys = ['f%d' % i for i in range(14, 40)]
    features = {key: [] for key in keys}

    # Descriptors that yield one value per molecule. f24/f25 are absent on
    # purpose: both come from a single CalcCrippenDescriptors call below.
    single_valued = [
        ('f14', rdMolDescriptors.CalcChi0n),
        ('f15', rdMolDescriptors.CalcChi0v),
        ('f16', rdMolDescriptors.CalcChi1n),
        ('f17', rdMolDescriptors.CalcChi1v),
        ('f18', rdMolDescriptors.CalcChi2n),
        ('f19', rdMolDescriptors.CalcChi2v),
        ('f20', rdMolDescriptors.CalcChi3n),
        ('f21', rdMolDescriptors.CalcChi3v),
        ('f22', rdMolDescriptors.CalcChi4n),
        ('f23', rdMolDescriptors.CalcChi4v),
        ('f26', rdMolDescriptors.CalcNumAromaticHeterocycles),
        ('f27', rdMolDescriptors.CalcNumBridgeheadAtoms),
        ('f28', rdMolDescriptors.CalcNumHBA),
        ('f29', rdMolDescriptors.CalcNumHBD),
        ('f30', rdMolDescriptors.CalcNumHeteroatoms),
        ('f31', rdMolDescriptors.CalcNumHeterocycles),
        ('f32', rdMolDescriptors.CalcNumLipinskiHBA),
        ('f33', rdMolDescriptors.CalcNumLipinskiHBD),
        ('f34', rdMolDescriptors.CalcNumRotatableBonds),
        ('f35', rdMolDescriptors.CalcNumSaturatedCarbocycles),
        ('f36', rdMolDescriptors.CalcNumSaturatedHeterocycles),
        ('f37', rdMolDescriptors.CalcNumSaturatedRings),
        ('f38', rdMolDescriptors.CalcNumSpiroAtoms),
        ('f39', rdMolDescriptors.CalcTPSA),
    ]

    for row, smiles in enumerate(df_read.smiles):
        m = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail with
        # the offending input instead of an opaque error from the descriptor.
        if m is None:
            raise ValueError('Could not parse SMILES %r (row %d of %s)'
                             % (smiles, row, incsv))

        for key, calc in single_valued:
            features[key].append(calc(m))

        # Compute the Crippen pair once and split it across two columns
        # (per the RDKit documentation the elements are logP and MR).
        crippen = rdMolDescriptors.CalcCrippenDescriptors(m)
        features['f24'].append(crippen[0])
        features['f25'].append(crippen[1])

    # Add new columns to dataframe
    for key in keys:
        df_read[key] = features[key]

    df_read.to_csv(outcsv)
    return 'Finished'

In [17]:
# Part 1 on new_train.csv; the file written here (new_tr_feat.csv) is the
# input to the make_features2 call further down.
make_features('new_train.csv', 'new_tr_feat.csv')

In [21]:
# Part 1 on new_test.csv; the file written here (new_te_feat.csv) is the
# input to the make_features2 call further down.
make_features('new_test.csv', 'new_te_feat.csv')

'Finished'

In [22]:
# Part 1 on new_val.csv; the file written here (new_val_feat.csv) is the
# input to the make_features2 call further down.
make_features('new_val.csv', 'new_val_feat.csv')

'Finished'

In [21]:
# Part 2 on the file written by make_features above (new_tr_feat.csv);
# the result is written to new_xtr_feat.csv.
make_features2('new_tr_feat.csv', 'new_xtr_feat.csv')

'Finished'

In [22]:
# Part 2 on the file written by make_features above (new_te_feat.csv);
# the result is written to new_xte_feat.csv.
make_features2('new_te_feat.csv', 'new_xte_feat.csv')

'Finished'

In [23]:
# Part 2 on the file written by make_features above (new_val_feat.csv);
# the result is written to new_xval_feat.csv.
make_features2('new_val_feat.csv', 'new_xval_feat.csv')

'Finished'