In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('/Users/tanmaysharma/projects/Jak2Biotech/Datasets/jak2train.csv')

In [3]:
df_train.head()

Unnamed: 0,name,cls,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,C1=CC=C(C=C1)C(=O)OC2=CC=CC3=C2C=CC=C3O,1,,,0,0.0,26.533788,2.417568,4.835136,26.533788,...,9.904237,53.680768,264.078644,8.252458,816,31,104,122.0,5.666667,4.444444
1,CCCN1C(=NC2=C1C(=O)N(C(=O)N2C)C)CN(C)CC3=CC=CC=C3,1,,,0,1.0,33.119478,2.547941,4.982317,33.119478,...,10.256501,75.463984,355.200825,6.964722,1663,44,136,163.0,9.611111,5.805556
2,C1=CC=C(C=C1)CCCNC(=O)/C(=C/C2=CC(=C(C=C2)O)O)...,1,,,0,0.0,30.775285,2.30647,4.61294,30.775285,...,9.722625,57.631782,322.131742,7.669803,1632,32,114,127.0,8.166667,5.555556
3,C1=CC=C(C=C1)CCCNC(=O)/C(=C/C2=CC(=C(C=C2)O)O)...,1,,,0,0.0,30.775285,2.30647,4.61294,30.775285,...,9.722625,57.631782,322.131742,7.669803,1632,32,114,127.0,8.166667,5.555556
4,C1CCC2(CC1)N=C3C=C(C=CC3=[N+]2[O-])NC4=CC=CC=C4,1,,,0,0.0,29.437477,2.545352,4.897891,29.437477,...,10.187199,71.277759,293.152812,7.150069,1074,36,122,147.0,5.368056,4.763889


## What do we want to do?

1. Remove all the features with more than 70% data absent.
2. Imputation of missing data.
3. Remove constant and quasi-constant features
4. Remove duplicated features
5. Remove correlated features with a brute force approach or selecting features smartly
6. Select important features by feature shuffling
7. Select features based on a univariate model performance
8. Select features recursively

In [4]:
# checking how many columns have different percentages of data absent.

percent_absent = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
absent_mean = df_train.isna().mean()

for i in percent_absent:
    temp_df = df_train.loc[:, absent_mean >= (i/100)]
    print('Number of data with {}% data missing is : {}'.format(i, temp_df.shape[1]))

Number of data with 0% data missing is : 1615
Number of data with 10% data missing is : 450
Number of data with 20% data missing is : 184
Number of data with 30% data missing is : 176
Number of data with 40% data missing is : 176
Number of data with 50% data missing is : 170
Number of data with 60% data missing is : 163
Number of data with 70% data missing is : 157
Number of data with 80% data missing is : 155
Number of data with 90% data missing is : 133
Number of data with 100% data missing is : 89


So, we remove all data which has more than 30% values missing.

In [5]:
df_train.dropna(axis=1, thresh=(len(df_train)*.3), inplace=True)
df_train.shape[1]

1458

In [6]:
absent_mean = df_train.isna().mean()

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
names = df_train['name']
names = le.fit_transform(names)
df_train['name'] = names


In [8]:
df_train['name']

0        285
1       4915
2        372
3        372
4       1190
        ... 
9435    2787
9436    2922
9437    5696
9438    5747
9439    3056
Name: name, Length: 9440, dtype: int64

In [9]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=4, weights='distance')
temp_df = pd.DataFrame(imputer.fit_transform(df_train), columns=df_train.columns)
temp_df.shape

(9440, 1458)

In [10]:
df_train = temp_df

In [11]:
from feature_engine.selection import (
    DropDuplicateFeatures,
    DropConstantFeatures,
    DropDuplicateFeatures,
    DropCorrelatedFeatures,
    SmartCorrelatedSelection,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    RecursiveFeatureElimination,
)

In [12]:
constant = DropConstantFeatures(tol=0.98)
constant.fit(df_train)

In [13]:
len(constant.features_to_drop_)

202

In [14]:
df_train = constant.transform(df_train)

In [15]:
df_train

Unnamed: 0,name,cls,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,285.0,1.0,0.0,0.0,26.533788,2.417568,4.835136,26.533788,1.326689,3.931384,...,9.904237,53.680768,264.078644,8.252458,816.0,31.0,104.0,122.0,5.666667,4.444444
1,4915.0,1.0,0.0,1.0,33.119478,2.547941,4.982317,33.119478,1.273826,4.184367,...,10.256501,75.463984,355.200825,6.964722,1663.0,44.0,136.0,163.0,9.611111,5.805556
2,372.0,1.0,0.0,0.0,30.775285,2.306470,4.612940,30.775285,1.282304,4.070065,...,9.722625,57.631782,322.131742,7.669803,1632.0,32.0,114.0,127.0,8.166667,5.555556
3,372.0,1.0,0.0,0.0,30.775285,2.306470,4.612940,30.775285,1.282304,4.070065,...,9.722625,57.631782,322.131742,7.669803,1632.0,32.0,114.0,127.0,8.166667,5.555556
4,1190.0,1.0,0.0,0.0,29.437477,2.545352,4.897891,29.437477,1.338067,4.053737,...,10.187199,71.277759,293.152812,7.150069,1074.0,36.0,122.0,147.0,5.368056,4.763889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9435,2787.0,0.0,0.0,0.0,33.463594,2.444828,4.738693,33.463594,1.239392,4.225655,...,10.211303,78.253383,384.161997,7.532588,2035.0,41.0,144.0,169.0,10.562500,5.777778
9436,2922.0,0.0,0.0,0.0,41.589381,2.455832,4.910792,41.589381,1.299668,4.394866,...,10.373210,81.140664,431.184506,7.564640,2934.0,51.0,168.0,198.0,10.222222,7.083333
9437,5696.0,0.0,0.0,0.0,32.030349,2.531373,5.059508,32.030349,1.231936,4.190962,...,10.372459,75.349253,356.173607,7.123472,1555.0,45.0,140.0,168.0,10.312500,5.625000
9438,5747.0,0.0,2.0,0.0,31.249932,2.363275,4.640138,31.249932,1.302080,4.098340,...,9.756321,73.225218,343.110296,8.368544,1647.0,29.0,120.0,136.0,6.666667,5.444444


In [16]:
# Remove Duplicated Features

duplicates = DropDuplicateFeatures()

duplicates.fit(df_train)
duplicates.duplicated_feature_sets_

[{'NsF', 'nF'}, {'n7AHRing', 'n7HRing'}, {'n9FaHRing', 'n9FaRing'}]

In [17]:
# Let's check if the above observation is correct or not

df_train['NsF'].equals(df_train['nF'])

True

In [18]:
df_train = duplicates.transform(df_train)

In [19]:
df_train

Unnamed: 0,name,cls,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,285.0,1.0,0.0,0.0,26.533788,2.417568,4.835136,26.533788,1.326689,3.931384,...,9.904237,53.680768,264.078644,8.252458,816.0,31.0,104.0,122.0,5.666667,4.444444
1,4915.0,1.0,0.0,1.0,33.119478,2.547941,4.982317,33.119478,1.273826,4.184367,...,10.256501,75.463984,355.200825,6.964722,1663.0,44.0,136.0,163.0,9.611111,5.805556
2,372.0,1.0,0.0,0.0,30.775285,2.306470,4.612940,30.775285,1.282304,4.070065,...,9.722625,57.631782,322.131742,7.669803,1632.0,32.0,114.0,127.0,8.166667,5.555556
3,372.0,1.0,0.0,0.0,30.775285,2.306470,4.612940,30.775285,1.282304,4.070065,...,9.722625,57.631782,322.131742,7.669803,1632.0,32.0,114.0,127.0,8.166667,5.555556
4,1190.0,1.0,0.0,0.0,29.437477,2.545352,4.897891,29.437477,1.338067,4.053737,...,10.187199,71.277759,293.152812,7.150069,1074.0,36.0,122.0,147.0,5.368056,4.763889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9435,2787.0,0.0,0.0,0.0,33.463594,2.444828,4.738693,33.463594,1.239392,4.225655,...,10.211303,78.253383,384.161997,7.532588,2035.0,41.0,144.0,169.0,10.562500,5.777778
9436,2922.0,0.0,0.0,0.0,41.589381,2.455832,4.910792,41.589381,1.299668,4.394866,...,10.373210,81.140664,431.184506,7.564640,2934.0,51.0,168.0,198.0,10.222222,7.083333
9437,5696.0,0.0,0.0,0.0,32.030349,2.531373,5.059508,32.030349,1.231936,4.190962,...,10.372459,75.349253,356.173607,7.123472,1555.0,45.0,140.0,168.0,10.312500,5.625000
9438,5747.0,0.0,2.0,0.0,31.249932,2.363275,4.640138,31.249932,1.302080,4.098340,...,9.756321,73.225218,343.110296,8.368544,1647.0,29.0,120.0,136.0,6.666667,5.444444


In [20]:
# Drop correlated features
# Before dropping, let's split the dataset into training data and label data

X = df_train.drop(['cls'], axis = 1)
y = df_train['cls']

In [21]:
y = y.astype(int)

In [22]:
y

0       1
1       1
2       1
3       1
4       1
       ..
9435    0
9436    0
9437    0
9438    0
9439    0
Name: cls, Length: 9440, dtype: int64

In [23]:
# deleting correlated features

correlated = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.8)
correlated.fit(X)

In [24]:
correlated.correlated_feature_sets_

[{'ETA_alpha',
  'ETA_beta',
  'ETA_beta_s',
  'ETA_eta',
  'ETA_eta_F',
  'ETA_eta_FL',
  'ETA_eta_L',
  'ETA_eta_R',
  'ETA_eta_RL',
  'LogEE_A',
  'LogEE_D',
  'LogEE_DzZ',
  'LogEE_Dzare',
  'LogEE_Dzi',
  'LogEE_Dzm',
  'LogEE_Dzp',
  'LogEE_Dzpe',
  'LogEE_Dzse',
  'LogEE_Dzv',
  'MID',
  'MID_C',
  'SpAD_A',
  'SpAD_D',
  'SpAD_DzZ',
  'SpAD_Dzare',
  'SpAD_Dzi',
  'SpAD_Dzm',
  'SpAD_Dzp',
  'SpAD_Dzpe',
  'SpAD_Dzse',
  'SpAD_Dzv',
  'SpAbs_A',
  'SpAbs_D',
  'SpAbs_DzZ',
  'SpAbs_Dzare',
  'SpAbs_Dzi',
  'SpAbs_Dzm',
  'SpAbs_Dzp',
  'SpAbs_Dzpe',
  'SpAbs_Dzse',
  'SpAbs_Dzv',
  'SpDiam_D',
  'SpDiam_DzZ',
  'SpDiam_Dzare',
  'SpDiam_Dzi',
  'SpDiam_Dzm',
  'SpDiam_Dzp',
  'SpDiam_Dzpe',
  'SpDiam_Dzse',
  'SpDiam_Dzv',
  'SpMax_D',
  'SpMax_DzZ',
  'SpMax_Dzare',
  'SpMax_Dzi',
  'SpMax_Dzm',
  'SpMax_Dzp',
  'SpMax_Dzpe',
  'SpMax_Dzse',
  'SpMax_Dzv',
  'VE1_D',
  'VE1_DzZ',
  'VE1_Dzare',
  'VE1_Dzi',
  'VE1_Dzm',
  'VE1_Dzp',
  'VE1_Dzpe',
  'VE1_Dzse',
  'VE1_Dzv',
  '

In [25]:
correlated.features_to_drop_

{'AATS0are',
 'AATS0m',
 'AATS0pe',
 'AATS0se',
 'AATS1are',
 'AATS1d',
 'AATS1dv',
 'AATS1i',
 'AATS1m',
 'AATS1p',
 'AATS1pe',
 'AATS1se',
 'AATS1v',
 'AATS2Z',
 'AATS2are',
 'AATS2d',
 'AATS2dv',
 'AATS2i',
 'AATS2m',
 'AATS2p',
 'AATS2pe',
 'AATS2s',
 'AATS2se',
 'AATS2v',
 'AATS3Z',
 'AATS3are',
 'AATS3d',
 'AATS3dv',
 'AATS3m',
 'AATS3pe',
 'AATS3s',
 'AATS3se',
 'AATS3v',
 'AATS4Z',
 'AATS4are',
 'AATS4dv',
 'AATS4m',
 'AATS4p',
 'AATS4pe',
 'AATS4s',
 'AATS4se',
 'AATS4v',
 'AATS5Z',
 'AATS5are',
 'AATS5d',
 'AATS5dv',
 'AATS5m',
 'AATS5p',
 'AATS5pe',
 'AATS5se',
 'AATS6are',
 'AATS6m',
 'AATS6pe',
 'AATS6se',
 'AATS6v',
 'AATS7Z',
 'AATS7are',
 'AATS7d',
 'AATS7dv',
 'AATS7m',
 'AATS7pe',
 'AATS7se',
 'AATS7v',
 'AATS8are',
 'AATS8m',
 'AATS8p',
 'AATS8pe',
 'AATS8v',
 'AATSC0Z',
 'AATSC0are',
 'AATSC0dv',
 'AATSC0i',
 'AATSC0m',
 'AATSC0p',
 'AATSC0pe',
 'AATSC0s',
 'AATSC0se',
 'AATSC1Z',
 'AATSC1are',
 'AATSC1c',
 'AATSC1d',
 'AATSC1dv',
 'AATSC1i',
 'AATSC1m',
 'AATSC1p',

In [26]:
len(correlated.features_to_drop_)

821

In [27]:
X = correlated.transform(X)

In [28]:
X

Unnamed: 0,name,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpMAD_A,VE1_A,VE2_A,VR1_A,...,JGI5,JGI6,JGI7,JGI8,JGI9,JGI10,VAdjMat,SRW07,TSRW10,WPath
0,285.0,0.0,0.0,26.533788,2.417568,4.835136,1.326689,3.856019,0.192801,164.037696,...,0.024306,0.012975,0.009163,0.007849,0.005893,0.006173,5.459432,0.000000,53.680768,816.0
1,4915.0,0.0,1.0,33.119478,2.547941,4.982317,1.273826,3.909209,0.150354,658.589840,...,0.032117,0.021938,0.013776,0.009565,0.009926,0.007947,5.807355,4.844187,75.463984,1663.0
2,372.0,0.0,0.0,30.775285,2.306470,4.612940,1.282304,3.856805,0.160700,452.104341,...,0.017222,0.018605,0.015700,0.006790,0.007390,0.005224,5.643856,0.000000,57.631782,1632.0
3,372.0,0.0,0.0,30.775285,2.306470,4.612940,1.282304,3.856805,0.160700,452.104341,...,0.017222,0.018605,0.015700,0.006790,0.007390,0.005224,5.643856,0.000000,57.631782,1632.0
4,1190.0,0.0,0.0,29.437477,2.545352,4.897891,1.338067,3.776860,0.171675,366.793239,...,0.020022,0.015478,0.015423,0.011003,0.003678,0.003186,5.643856,4.948760,71.277759,1074.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9435,2787.0,0.0,0.0,33.463594,2.444828,4.738693,1.239392,4.621410,0.171163,214.993825,...,0.031372,0.017806,0.019677,0.011058,0.013396,0.009059,5.857981,5.416100,78.253383,2035.0
9436,2922.0,0.0,0.0,41.589381,2.455832,4.910792,1.299668,4.325253,0.135164,571.413793,...,0.024028,0.014695,0.013131,0.009001,0.007480,0.007505,6.129283,4.442651,81.140664,2934.0
9437,5696.0,0.0,0.0,32.030349,2.531373,5.059508,1.231936,4.217002,0.162192,249.080885,...,0.035791,0.027001,0.014142,0.013051,0.008479,0.008843,5.807355,4.727388,75.349253,1555.0
9438,5747.0,2.0,0.0,31.249932,2.363275,4.640138,1.302080,3.969563,0.165398,312.724861,...,0.023921,0.018854,0.009507,0.006061,0.009121,0.005133,5.700440,5.209486,73.225218,1647.0


In [29]:
print('Number of features in our dataset : {} '.format(X.shape[1]))

Number of features in our dataset : 431 


In [30]:
# Now we remove the correlated features which produce the same result

from sklearn.ensemble import GradientBoostingClassifier

smart_corr = SmartCorrelatedSelection(
    variables=None, 
    method="pearson", 
    threshold=0.7, 
    selection_method="model_performance", 
    estimator=GradientBoostingClassifier(), 
)

In [31]:
smart_corr.fit(X, y)

In [32]:
smart_corr.correlated_feature_sets_

[{'AETA_eta_R', 'ETA_beta_ns', 'SpAbs_A', 'SpMAD_DzZ', 'VE1_A', 'VE2_A'},
 {'BCUTp-1l', 'JGI10', 'SpDiam_A', 'Xc-6dv'},
 {'AXp-0d', 'SpMAD_A', 'fMF'},
 {'BertzCT', 'C2SP2', 'n6aRing', 'nAromAtom', 'piPC6'},
 {'ATS1Z', 'ATSC0i', 'GGI10', 'GGI8', 'nAtom'},
 {'ATS0dv', 'TopoPSA(NO)', 'nHetero'},
 {'AMID_N', 'NaaN', 'nN'},
 {'MID_O', 'nO'},
 {'BCUTs-1l', 'nS'},
 {'MID_X', 'SssssC', 'nF'},
 {'NsBr', 'nBr'},
 {'ATS0s', 'ATS4s', 'EState_VSA10'},
 {'AATS0Z', 'ATS0Z'},
 {'AATS0p', 'ATS0p'},
 {'AATS0d', 'AATS0dv', 'AATS0s', 'AATS1s', 'AATS6dv', 'AATS6s', 'AATSC0v'},
 {'AATS8Z', 'AATS8d', 'AATS8dv', 'AATS8se'},
 {'AATS4d', 'AATS5v', 'GATS2d'},
 {'AATS6d', 'AATS6p', 'AATS7p'},
 {'AATS7s', 'AATS8s'},
 {'AATS0v', 'AATS1Z', 'AATS6Z', 'SIC0'},
 {'AATS3i', 'AATS3p'},
 {'AATS0i', 'AATS4i', 'AATS5i', 'AATS6i'},
 {'AATSC0c', 'ATSC0c'},
 {'ATSC2c', 'ATSC3c'},
 {'AATSC6c', 'ATSC6c'},
 {'AATSC7c', 'ATSC7c'},
 {'ATSC3dv', 'GATS3dv'},
 {'ATSC4dv', 'GATS4dv'},
 {'ATSC5dv', 'GATS5dv'},
 {'ATSC6dv', 'GATS6dv'},
 

In [33]:
smart_corr.features_to_drop_

['SpDiam_A',
 'SpMAD_A',
 'VE1_A',
 'VE2_A',
 'nAromAtom',
 'nAtom',
 'nHetero',
 'nN',
 'nO',
 'nS',
 'nF',
 'ATS0dv',
 'ATS0s',
 'ATS4s',
 'ATS0Z',
 'ATS1Z',
 'ATS0p',
 'AATS0dv',
 'AATS6dv',
 'AATS8dv',
 'AATS0d',
 'AATS6d',
 'AATS8d',
 'AATS0s',
 'AATS1s',
 'AATS6s',
 'AATS7s',
 'AATS1Z',
 'AATS6Z',
 'AATS8Z',
 'AATS5v',
 'AATS3p',
 'AATS6p',
 'AATS4i',
 'AATS5i',
 'AATS6i',
 'ATSC3c',
 'ATSC5d',
 'ATSC6d',
 'ATSC7d',
 'ATSC1s',
 'ATSC3s',
 'ATSC8s',
 'ATSC4Z',
 'ATSC7Z',
 'ATSC2v',
 'ATSC3v',
 'ATSC6v',
 'ATSC7v',
 'ATSC2se',
 'ATSC3se',
 'ATSC1are',
 'ATSC1p',
 'ATSC1i',
 'ATSC6i',
 'ATSC7i',
 'AATSC0c',
 'AATSC6c',
 'AATSC7c',
 'AATSC7dv',
 'AATSC8dv',
 'AATSC0d',
 'AATSC8d',
 'AATSC7s',
 'AATSC8Z',
 'AATSC7v',
 'AATSC8v',
 'AATSC4se',
 'AATSC5se',
 'AATSC6se',
 'AATSC6p',
 'MATS8s',
 'GATS2dv',
 'GATS3dv',
 'GATS4dv',
 'GATS5dv',
 'GATS6dv',
 'GATS2d',
 'GATS3d',
 'GATS4d',
 'GATS1Z',
 'GATS6se',
 'GATS8se',
 'GATS3p',
 'GATS4p',
 'GATS4i',
 'GATS5i',
 'BCUTdv-1h',
 'SpMAD_DzZ'

In [34]:
len(smart_corr.features_to_drop_)

136

In [35]:
smart_corr.variables_

['name',
 'nAcid',
 'nBase',
 'SpAbs_A',
 'SpMax_A',
 'SpDiam_A',
 'SpMAD_A',
 'VE1_A',
 'VE2_A',
 'VR1_A',
 'nAromAtom',
 'nAtom',
 'nHetero',
 'nN',
 'nO',
 'nS',
 'nF',
 'nCl',
 'nBr',
 'ATS0dv',
 'ATS0s',
 'ATS1s',
 'ATS4s',
 'ATS0Z',
 'ATS1Z',
 'ATS0p',
 'AATS0dv',
 'AATS6dv',
 'AATS8dv',
 'AATS0d',
 'AATS4d',
 'AATS6d',
 'AATS8d',
 'AATS0s',
 'AATS1s',
 'AATS5s',
 'AATS6s',
 'AATS7s',
 'AATS8s',
 'AATS0Z',
 'AATS1Z',
 'AATS6Z',
 'AATS8Z',
 'AATS0v',
 'AATS5v',
 'AATS8se',
 'AATS0p',
 'AATS3p',
 'AATS6p',
 'AATS7p',
 'AATS0i',
 'AATS3i',
 'AATS4i',
 'AATS5i',
 'AATS6i',
 'AATS7i',
 'AATS8i',
 'ATSC0c',
 'ATSC1c',
 'ATSC2c',
 'ATSC3c',
 'ATSC4c',
 'ATSC5c',
 'ATSC6c',
 'ATSC7c',
 'ATSC1dv',
 'ATSC2dv',
 'ATSC3dv',
 'ATSC4dv',
 'ATSC5dv',
 'ATSC6dv',
 'ATSC7dv',
 'ATSC8dv',
 'ATSC1d',
 'ATSC2d',
 'ATSC3d',
 'ATSC4d',
 'ATSC5d',
 'ATSC6d',
 'ATSC7d',
 'ATSC8d',
 'ATSC1s',
 'ATSC2s',
 'ATSC3s',
 'ATSC4s',
 'ATSC5s',
 'ATSC6s',
 'ATSC7s',
 'ATSC8s',
 'ATSC1Z',
 'ATSC2Z',
 'ATSC3Z',
 'A

In [36]:
X = smart_corr.transform(X)
X

Unnamed: 0,name,nAcid,nBase,SpAbs_A,SpMax_A,VR1_A,nCl,nBr,ATS1s,AATS4d,...,JGI2,JGI3,JGI4,JGI5,JGI6,JGI7,JGI8,JGI9,TSRW10,WPath
0,285.0,0.0,0.0,26.533788,2.417568,164.037696,0.0,0.0,124.444444,3.060606,...,0.081481,0.042339,0.037607,0.024306,0.012975,0.009163,0.007849,0.005893,53.680768,816.0
1,4915.0,0.0,1.0,33.119478,2.547941,658.589840,0.0,0.0,158.805556,2.771429,...,0.077778,0.062027,0.052209,0.032117,0.021938,0.013776,0.009565,0.009926,75.463984,1663.0
2,372.0,0.0,0.0,30.775285,2.306470,452.104341,0.0,0.0,159.472222,2.452632,...,0.076389,0.039062,0.038632,0.017222,0.018605,0.015700,0.006790,0.007390,57.631782,1632.0
3,372.0,0.0,0.0,30.775285,2.306470,452.104341,0.0,0.0,159.472222,2.452632,...,0.076389,0.039062,0.038632,0.017222,0.018605,0.015700,0.006790,0.007390,57.631782,1632.0
4,1190.0,0.0,0.0,29.437477,2.545352,366.793239,0.0,0.0,124.305556,2.816514,...,0.067901,0.062114,0.035575,0.020022,0.015478,0.015423,0.011003,0.003678,71.277759,1074.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9435,2787.0,0.0,0.0,33.463594,2.444828,214.993825,0.0,0.0,161.077160,2.593985,...,0.087855,0.059621,0.052164,0.031372,0.017806,0.019677,0.011058,0.013396,78.253383,2035.0
9436,2922.0,0.0,0.0,41.589381,2.455832,571.413793,0.0,0.0,196.888889,2.925170,...,0.081633,0.041122,0.051377,0.024028,0.014695,0.013131,0.009001,0.007480,81.140664,2934.0
9437,5696.0,0.0,0.0,32.030349,2.531373,249.080885,0.0,0.0,158.722222,2.970149,...,0.100529,0.074537,0.045222,0.035791,0.027001,0.014142,0.013051,0.008479,75.349253,1555.0
9438,5747.0,2.0,0.0,31.249932,2.363275,312.724861,0.0,0.0,141.987654,2.973684,...,0.058824,0.032567,0.031329,0.023921,0.018854,0.009507,0.006061,0.009121,73.225218,1647.0


In [37]:
df_test = pd.read_csv('/Users/tanmaysharma/projects/Jak2Biotech/Datasets/jak2test.csv')

In [38]:
X_test = df_test.drop(['cls'], axis = 1)
y_test = df_test['cls']

In [39]:
X_test = X_test[X.columns]

In [40]:
temp_name = le.fit_transform(X_test['name'])
X_test['name'] = temp_name
X_test = pd.DataFrame(imputer.fit_transform(X_test), columns=X_test.columns)
X_test.isna().sum().sum()

0

In [52]:
from sklearn.utils import shuffle

X, y = shuffle(X, y)

### Feature selection is done. Now it's time to test on various decision trees. Let's see.

In [53]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb = XGBClassifier()
xgb_model = xgb.fit(X, y)
xgb_predict = xgb.predict(X_test)
accuracy_score(y_test, xgb_predict)

0.9173728813559322

In [54]:
from catboost import CatBoostClassifier

In [55]:
catboost = CatBoostClassifier()
catboost_model = catboost.fit(X, y)
catboost_predict = catboost_model.predict(X_test)
accuracy_score(y_test, catboost_predict)

Learning rate set to 0.026869
0:	learn: 0.6835128	total: 11.3ms	remaining: 11.2s
1:	learn: 0.6742734	total: 19.1ms	remaining: 9.55s
2:	learn: 0.6662555	total: 26.8ms	remaining: 8.91s
3:	learn: 0.6584121	total: 34.5ms	remaining: 8.58s
4:	learn: 0.6497295	total: 41.9ms	remaining: 8.33s
5:	learn: 0.6423741	total: 49.8ms	remaining: 8.25s
6:	learn: 0.6356018	total: 57.7ms	remaining: 8.19s
7:	learn: 0.6283948	total: 65.3ms	remaining: 8.09s
8:	learn: 0.6222039	total: 72.8ms	remaining: 8.01s
9:	learn: 0.6168340	total: 80.2ms	remaining: 7.93s
10:	learn: 0.6102580	total: 87.6ms	remaining: 7.87s
11:	learn: 0.6045642	total: 94.9ms	remaining: 7.81s
12:	learn: 0.5993414	total: 103ms	remaining: 7.8s
13:	learn: 0.5940858	total: 110ms	remaining: 7.77s
14:	learn: 0.5880429	total: 118ms	remaining: 7.75s
15:	learn: 0.5832273	total: 126ms	remaining: 7.74s
16:	learn: 0.5785857	total: 133ms	remaining: 7.69s
17:	learn: 0.5738531	total: 140ms	remaining: 7.66s
18:	learn: 0.5691560	total: 148ms	remaining: 7.63s


0.9084745762711864

In [67]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier()
abc_model = abc.fit(X, y)
abc_predict = abc_model.predict(X_test)
accuracy_score(y_test, abc_predict)

0.8182203389830508

In [68]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier()
etc_model = etc.fit(X, y)
etc_predict = etc_model.predict(X_test)
accuracy_score(y_test, etc_predict)

0.9271186440677966

In [69]:
from sklearn.ensemble import BaggingClassifier

bc = BaggingClassifier()
bc_model = bc.fit(X, y)
bc_predict = bc_model.predict(X_test)
accuracy_score(y_test, bc_predict)

0.9012711864406779

In [71]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc_model = gbc.fit(X, y)
gbc_predict = gbc_model.predict(X_test)
accuracy_score(y_test, gbc_predict)

0.8550847457627119

In [73]:
from sklearn.ensemble import IsolationForest

If = IsolationForest()
If_model = If.fit(X, y)
If_predict = If_model.predict(X_test)
accuracy_score(y_test, If_predict)

0.47966101694915253

In [74]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc_model = rfc.fit(X, y)
rfc_predict = rfc_model.predict(X_test)
accuracy_score(y_test, rfc_predict)

0.9139830508474577

In [83]:
from sklearn.ensemble import StackingClassifier

estimators = [
    ('etc', ExtraTreesClassifier()),
    ('rfc', RandomForestClassifier()),
    ('bc', BaggingClassifier())
]
sc = StackingClassifier(estimators=estimators, final_estimator = AdaBoostClassifier(), cv = 5)

In [84]:
sc.fit(X, y).score(X_test, y_test)

0.9233050847457627

In [86]:
from sklearn.ensemble import VotingClassifier

estimators = [
    ('etc', ExtraTreesClassifier()),
    ('rfc', RandomForestClassifier()),
    ('bc', BaggingClassifier())
]
vch = VotingClassifier(estimators=estimators, voting = 'hard')
vcs = VotingClassifier(estimators=estimators, voting = 'soft')

In [87]:
vch.fit(X, y).score(X_test, y_test)

0.923728813559322

In [88]:
vcs.fit(X, y).score(X_test, y_test)

0.9186440677966101

In [90]:
from sklearn.ensemble import HistGradientBoostingClassifier

hgbc = HistGradientBoostingClassifier()
hgbc_model = hgbc.fit(X, y)
hgbc_predict = hgbc_model.predict(X_test)
accuracy_score(y_test, hgbc_predict)

0.9114406779661017

Final Remarks : This pipeline works good. Feature Engineering was good. Accuaracy was decent. We shall tweak some things and try and find other methods to try and see what works the best.  