In [None]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib

In [12]:
# Load the five training descriptor tables; every CSV is indexed by its CASRN column.
train_ecfp6_bits, train_ecfp6_counts, train_maccs, train_rdkit2d, train_mordred = (
    pd.read_csv(csv_path, index_col='CASRN')
    for csv_path in (
        '../data/Descriptors/train_ecfp6_bits.csv',
        '../data/Descriptors/train_ecfp6_counts.csv',
        '../data/Descriptors/train_maccs.csv',
        '../data/Descriptors/train_rdkit2d.csv',
        '../data/Descriptors/train_mordred.csv',
    )
)

## Filter out the zero variance and highly correlated features

In [45]:
def feature_selection(df, nonzero_thrd = 0.0, cor_thrd = 0.95):
    '''
    Remove the zero variance and highly correlated features.

    df: train features (DataFrame, one column per feature)
    nonzero_thrd: threshold handed to VarianceThreshold; the default 0.0
        removes the zero variance columns
    cor_thrd: absolute correlation above which a column is dropped in favour
        of an earlier column it is correlated with

    Returns a new DataFrame with the retained columns; df itself is not modified.
    '''
    selector = VarianceThreshold(nonzero_thrd)
    selector.fit(df)
    # get_support() returns a boolean mask over the columns of df.
    nonzero_df = df.loc[:, selector.get_support()]

    # Absolute pairwise correlations between the surviving columns.
    corr_matrix = nonzero_df.corr().abs()

    # Keep only the strict upper triangle, so each column is compared solely
    # against the columns that precede it and every pair is checked once.
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Flag every column whose correlation with an earlier column exceeds cor_thrd
    # (masked-out cells are NaN and therefore never compare as greater).
    exceeds = (upper > cor_thrd).any(axis=0)
    to_drop = exceeds[exceeds].index

    return nonzero_df.drop(columns=to_drop)

In [49]:
# Run the variance/correlation filter over each training descriptor table, in order.
(
    filtered_train_ecfp6_bits,
    filtered_train_ecfp6_counts,
    filtered_train_maccs,
    filtered_train_rdkit2d,
    filtered_train_mordred,
) = map(
    feature_selection,
    (train_ecfp6_bits, train_ecfp6_counts, train_maccs, train_rdkit2d, train_mordred),
)

In [53]:
filtered_train_ecfp6_bits.shape, filtered_train_ecfp6_counts.shape,filtered_train_maccs.shape

((8221, 2048), (8221, 2046), (8221, 145))

In [54]:
filtered_train_mordred.shape, filtered_train_rdkit2d.shape

((8221, 459), (8221, 159))

In [65]:
# Descriptor-set names, listed in the same order as the filtered tables below.
desc = ['ecfp6_bits', 'ecfp6_counts', 'maccs', 'rdkit2d', 'mordred']

# Column labels that survived filtering, one list per descriptor set.
filtered_features = [
    list(frame)
    for frame in (
        filtered_train_ecfp6_bits,
        filtered_train_ecfp6_counts,
        filtered_train_maccs,
        filtered_train_rdkit2d,
        filtered_train_mordred,
    )
]

# store the filtered features
dict_features = {name: columns for name, columns in zip(desc, filtered_features)}

In [66]:
with open('../data/Descriptors/filtered_features.json', 'w') as f:
    json.dump(dict_features, f)

In [None]:
# # read the json file
# with open('../data/Descriptors/filtered_features.json') as f:
#     dict_features = json.load(f)

# Normalize the numerical features

This only applies to the mordred and rdkit2d descriptors. Here, we use [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html) to scale all the features of the training data into the range [0, 1], and then scale the test sets with the scaler fitted on the training set.

Here, we define two functions: (1) `feature_norm_fit` fits the scaler on the training data and returns both the normalized training dataframe and the fitted scaler; (2) `feature_norm_transform` uses the fitted scaler to transform the test data. Both functions will be moved into utils.py.

In [82]:
def feature_norm_fit(train_df, scaler = None):
    '''
    Fit a scaler on the training features and return the scaled features.

    train_df: training data
    scaler: unfitted scaler exposing fit_transform; when None a fresh
        MinMaxScaler is created for this call

    Returns (df_norm, scaler): the scaled training DataFrame, carrying the same
    columns and index as train_df, and the fitted scaler which will be used for
    the test set.
    '''
    # Build the default inside the call. A default written as MinMaxScaler() in
    # the signature is evaluated once, so every call would refit and return the
    # very same object, silently overwriting a scaler returned earlier.
    if scaler is None:
        scaler = MinMaxScaler()

    scaled_values = scaler.fit_transform(train_df.values)
    df_norm = pd.DataFrame(scaled_values, columns=train_df.columns, index=train_df.index)
    return df_norm, scaler

def feature_norm_transform(test_df, scaler):
    '''
    Scale a feature table with an already fitted scaler.

    test_df: test features
    scaler: fitted scaler

    Returns a new DataFrame with the scaled values under the same column and
    row labels as test_df.
    '''
    scaled_values = scaler.transform(test_df.values)
    return pd.DataFrame(scaled_values, index=test_df.index, columns=test_df.columns)
    

In [72]:
# The helper is named feature_norm_fit; a dedicated MinMaxScaler instance is passed
# so this fit cannot share state with the scaler used for another descriptor set.
norm_train_rdkit2d, scaler_rdkit2d = feature_norm_fit(filtered_train_rdkit2d, MinMaxScaler())

In [73]:
scaler_rdkit2d.get_params

<bound method BaseEstimator.get_params of MinMaxScaler(copy=True, feature_range=(0, 1))>

In [74]:
norm_train_rdkit2d.head(1)

Unnamed: 0_level_0,BalabanJ,BertzCT,Chi0,Chi3v,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.286258,0.112142,0.133237,0.157641,0.018166,0.040446,0.0,0.0,0.086767,0.05297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382204


save the scaler for future use

In [79]:
# 
joblib.dump(scaler_rdkit2d, '../data/Descriptors/scaler_rdkit2d.pkl') 

['../data/Descriptors/scaler_rdkit2d.pkl']

reload and test 

In [80]:
# 
test_scaler = joblib.load('../data/Descriptors/scaler_rdkit2d.pkl')

In [83]:
test_rdkit2d = feature_norm_transform(filtered_train_rdkit2d, test_scaler)

In [84]:
test_rdkit2d.head(1) 

Unnamed: 0_level_0,BalabanJ,BertzCT,Chi0,Chi3v,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.286258,0.112142,0.133237,0.157641,0.018166,0.040446,0.0,0.0,0.086767,0.05297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382204


OK, it works

Fit and transform the mordred descriptors

In [86]:
# The helper is named feature_norm_fit; a dedicated MinMaxScaler instance is passed
# so this fit cannot share state with the scaler used for another descriptor set.
norm_train_mordred, scaler_mordred = feature_norm_fit(filtered_train_mordred, MinMaxScaler())
joblib.dump(scaler_mordred, '../data/Descriptors/scaler_mordred.pkl')

['../data/Descriptors/scaler_mordred.pkl']

Again, test the scaler

In [87]:
test_scaler_mordred = joblib.load('../data/Descriptors/scaler_mordred.pkl')

In [90]:
test_mordred = feature_norm_transform(filtered_train_mordred, test_scaler_mordred)
test_mordred.head(1)

Unnamed: 0_level_0,ABC,nAcid,nBase,SpMax_A,SpMAD_A,LogEE_A,VE1_A,VR1_A,nAromAtom,nAtom,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,SRW03,SRW05,TSRW10,WPath
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.133141,0.0,0.0,0.749405,0.859231,0.622256,0.397997,1.990715e-08,0.25,0.096346,...,0.363075,0.241979,0.338535,0.456364,0.471193,0.540906,0.0,0.0,0.244308,0.004741


In [91]:
norm_train_mordred.head(1)

Unnamed: 0_level_0,ABC,nAcid,nBase,SpMax_A,SpMAD_A,LogEE_A,VE1_A,VR1_A,nAromAtom,nAtom,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,SRW03,SRW05,TSRW10,WPath
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.133141,0.0,0.0,0.749405,0.859231,0.622256,0.397997,1.990715e-08,0.25,0.096346,...,0.363075,0.241979,0.338535,0.456364,0.471193,0.540906,0.0,0.0,0.244308,0.004741


Save the training features

In [92]:
# Write each modeling-ready training table to its CSV file (fingerprints are
# saved as filtered, rdkit2d and mordred in their normalized form).
for _train_frame, _train_csv in (
    (filtered_train_ecfp6_bits, '../data/Bmodel_features/modeling_train_ecfp6_bits.csv'),
    (filtered_train_ecfp6_counts, '../data/Bmodel_features/modeling_train_ecfp6_counts.csv'),
    (filtered_train_maccs, '../data/Bmodel_features/modeling_train_maccs.csv'),
    (norm_train_rdkit2d, '../data/Bmodel_features/modeling_train_rdkit2d.csv'),
    (norm_train_mordred, '../data/Bmodel_features/modeling_train_mordred.csv'),
):
    _train_frame.to_csv(_train_csv)

## Now Let's handle the test features

In [114]:
def test_feature(df, feature, scaler = None):
    '''
    transform the raw (computed) feature into the fowmat ready for modeling.
    
    df: test features
    scaler: for rdkit2d and mordred
    feature: name of the feature ['ecfp6_bits', 'ecfp6_counts', 'maccs', 'rdkit2d', 'mordred']
    '''
    
    with open('../data/Descriptors/filtered_features.json') as f:
        dict_features = json.load(f)
        
    if feature not in dict_features.keys():
        raise Exception(f'The feature **{feature}** is not support, please choose from [ecfp6_bits, ecfp6_counts, maccs, rdkit2d, mordred]')
        
    filtered_desc = dict_features[feature]
    df = df[filtered_desc]
    
    if scaler:
        df = feature_norm_transform(df, scaler)
    
    return df

In [109]:
# Load the five test descriptor tables; every CSV is indexed by its CASRN column.
test_ecfp6_bits, test_ecfp6_counts, test_maccs, test_rdkit2d, test_mordred = (
    pd.read_csv(csv_path, index_col='CASRN')
    for csv_path in (
        '../data/Descriptors/test_ecfp6_bits.csv',
        '../data/Descriptors/test_ecfp6_counts.csv',
        '../data/Descriptors/test_maccs.csv',
        '../data/Descriptors/test_rdkit2d.csv',
        '../data/Descriptors/test_mordred.csv',
    )
)

In [112]:
# Fingerprint descriptors only need column filtering, so no scaler is passed.
filtered_test_ecfp6_bits, filtered_test_ecfp6_counts, filtered_test_maccs = (
    test_feature(raw_table, feature_name)
    for raw_table, feature_name in (
        (test_ecfp6_bits, 'ecfp6_bits'),
        (test_ecfp6_counts, 'ecfp6_counts'),
        (test_maccs, 'maccs'),
    )
)

In [113]:
filtered_test_ecfp6_bits.shape, filtered_test_ecfp6_counts.shape, filtered_test_maccs.shape

((2849, 2048), (2849, 2046), (2849, 145))

In [115]:
scaler_mordred = joblib.load('../data/Descriptors/scaler_mordred.pkl')
scaler_rdkit2d = joblib.load('../data/Descriptors/scaler_rdkit2d.pkl') 

In [116]:
filtered_test_rdkit2d = test_feature(test_rdkit2d, 'rdkit2d', scaler_rdkit2d)

In [117]:
filtered_test_rdkit2d.head(1)

Unnamed: 0_level_0,BalabanJ,BertzCT,Chi0,Chi3v,EState_VSA1,EState_VSA10,EState_VSA11,EState_VSA2,EState_VSA3,EState_VSA4,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,qed
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130209-82-4,0.207313,0.100235,0.185335,0.22792,0.055726,0.16968,0.0,0.195699,0.129578,0.100139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.254728


In [118]:
filtered_test_mordred = test_feature(test_mordred, 'mordred', scaler_mordred)

In [119]:
filtered_test_mordred.head(1)

Unnamed: 0_level_0,ABC,nAcid,nBase,SpMax_A,SpMAD_A,LogEE_A,VE1_A,VR1_A,nAromAtom,nAtom,...,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,SRW03,SRW05,TSRW10,WPath
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130209-82-4,0.18358,0.0,0.0,0.750389,0.867067,0.682724,0.335865,2.993004e-07,0.125,0.225914,...,0.252108,0.1689,0.266486,0.273758,0.249418,0.412305,0.0,0.437189,0.351957,0.015544


In [120]:
filtered_test_mordred.shape, filtered_test_rdkit2d.shape

((2849, 459), (2849, 159))

Save the modeling ready data.

In [121]:
# Write each modeling-ready test table to its CSV file.
for _test_frame, _test_csv in (
    (filtered_test_ecfp6_bits, '../data/Bmodel_features/modeling_test_ecfp6_bits.csv'),
    (filtered_test_ecfp6_counts, '../data/Bmodel_features/modeling_test_ecfp6_counts.csv'),
    (filtered_test_maccs, '../data/Bmodel_features/modeling_test_maccs.csv'),
    (filtered_test_rdkit2d, '../data/Bmodel_features/modeling_test_rdkit2d.csv'),
    (filtered_test_mordred, '../data/Bmodel_features/modeling_test_mordred.csv'),
):
    _test_frame.to_csv(_test_csv)