In [1]:
import matplotlib
matplotlib.use('Agg')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
sys.path.insert(0,'../../')
from utils import data_path,results_path,grid_search,estimator_result,cross_validate,evaluate_param
from scipy.sparse import csr_matrix,save_npz,load_npz
from sklearn.model_selection import cross_val_score,LeaveOneOut,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pickle

# Feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFECV,VarianceThreshold

# Algorithm
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Load data

In [2]:
train_X = load_npz(data_path + 'version_1/'+ 'train_per.npz')
test_X = load_npz(data_path + 'version_1/'+ 'test_per.npz')

train_y = pd.read_csv(data_path + 'train_Y.csv')['CMV_status']
test_y = pd.read_csv(data_path + 'test_Y.csv')['CMV_status']

## Feature selection using SelectFromModel

### Tree-based

### ExtraTrees

In [14]:
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier(n_estimators=200,random_state=0)
et.fit(train_X,train_y)
et_im = np.argsort(et.feature_importances_)[::-1]

In [19]:
model3 = SelectFromModel(et,prefit=True,threshold='1500*mean')
train_new3 = model3.transform(train_X)
test_new3 = model3.transform(test_X)
print(train_new3.shape)

(641, 212)


In [20]:
et2 = ExtraTreesClassifier(n_estimators=200,random_state=0)
et2.fit(train_new3,train_y)
accuracy_score(test_y,et2.predict(test_new3))

0.7333333333333333

In [21]:
ind3 = model3.get_support(indices=True)

In [22]:
with open(data_path + 'feature_selection/'+'per/'+'et', 'wb') as f:
    pickle.dump(ind3, f, pickle.HIGHEST_PROTOCOL)
    
with open(data_path + 'feature_selection/'+'per/'+'et_im', 'wb') as f:
    pickle.dump(et_im, f, pickle.HIGHEST_PROTOCOL)

### RandomForest

In [23]:
rf = RandomForestClassifier(n_estimators=250,random_state=0)
rf.fit(train_X,train_y)
rf_im = np.argsort(rf.feature_importances_)[::-1]

In [25]:
model4 = SelectFromModel(rf,prefit=True,threshold='2000*mean')
train_new4 = model4.transform(train_X)
test_new4 = model4.transform(test_X)
print(train_new4.shape)

(641, 130)


In [26]:
rf2 = RandomForestClassifier(n_estimators=250,random_state=0)
rf2.fit(train_new4,train_y)
accuracy_score(test_y,rf2.predict(test_new4))

0.8083333333333333

In [27]:
ind4 = model4.get_support(indices=True)

In [28]:
with open(data_path + 'feature_selection/'+'per/'+'rf', 'wb') as f:
    pickle.dump(ind4, f, pickle.HIGHEST_PROTOCOL)

with open(data_path + 'feature_selection/'+'per/'+'rf_im', 'wb') as f:
    pickle.dump(rf_im, f, pickle.HIGHEST_PROTOCOL)

### Intersection

In [29]:
inter = list(set(ind3).intersection(ind4))
len(inter)

10

## Test Performance

In [30]:
inds = {'et':ind3,'rf':ind4,'inter_et_rf':inter}

In [31]:
def test_sub_features(clf,dict_inds):
    for name in dict_inds:
        print('Features selected by {}'.format(name))
        print('Len: ',len(dict_inds[name]))
        ind = dict_inds[name]
        clf.fit(train_X[:,ind],train_y)
        print(accuracy_score(test_y,clf.predict(test_X[:,ind])))
        print(roc_auc_score(test_y,clf.predict_proba(test_X[:,ind])[:,1]))
        print()

In [32]:
test_sub_features(lr,inds)

Features selected by et
Len:  212
0.575
0.5

Features selected by rf
Len:  130
0.575
0.5

Features selected by inter_et_rf
Len:  10
0.575
0.5



In [33]:
test_sub_features(et,inds)

Features selected by et
Len:  212
0.7333333333333333
0.7998010798522308

Features selected by rf
Len:  130
0.7666666666666667
0.8618925831202047

Features selected by inter_et_rf
Len:  10
0.7
0.6383915885194659



In [34]:
test_sub_features(rf,inds)

Features selected by et
Len:  212
0.7583333333333333
0.8209718670076727

Features selected by rf
Len:  130
0.8083333333333333
0.8650184711565785

Features selected by inter_et_rf
Len:  10
0.7083333333333334
0.6747655583972719

