In [1]:
import matplotlib
matplotlib.use('Agg')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
sys.path.insert(0,'../../')
from utils import data_path,results_path,grid_search,estimator_result,cross_validate,evaluate_param
from scipy.sparse import csr_matrix,save_npz,load_npz
from sklearn.model_selection import cross_val_score,LeaveOneOut,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pickle

# Feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFECV,VarianceThreshold

# Algorithm
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Load data

In [2]:
# Feature matrices are stored as scipy sparse .npz files under version_1/
version_dir = data_path + 'version_1/'
train_X, test_X = [
    load_npz(version_dir + fname) for fname in ('train_freq.npz', 'test_freq.npz')
]

# Labels: the 'CMV_status' column of each split's CSV
train_y, test_y = [
    pd.read_csv(data_path + fname)['CMV_status'] for fname in ('train_Y.csv', 'test_Y.csv')
]

## Feature selection using SelectFromModel

### L1-based

In [3]:
# Logistic Regression: L1-penalised, with the regularisation strength picked
# from the candidate grid by the estimator's built-in cross-validation.
# `Cs` is passed by keyword: recent scikit-learn releases declare the
# constructor arguments keyword-only, so a positional grid raises a TypeError.
lr = LogisticRegressionCV(
    Cs=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty='l1',
    solver='liblinear',
    random_state=0,
).fit(train_X, train_y)
# Linear SVC
lsvc = LinearSVC(C=10, penalty="l1", dual=False, random_state=0).fit(train_X, train_y)

# Feature indices ordered by signed coefficient value, largest first.
# NOTE(review): this ranks strongly negative coefficients last; if the intent
# is a ranking by importance, np.abs(...) is probably wanted - confirm against
# whatever consumes the pickled 'lr_coef' / 'lsvc_coef' files.
lr_coef = np.argsort(lr.coef_.flatten())[::-1]
lsvc_coef = np.argsort(lsvc.coef_.flatten())[::-1]

In [4]:
# Feature selection driven by the already-fitted LR model
# (threshold string: 7000 times the mean importance)
model = SelectFromModel(lr, threshold='7000*mean', prefit=True)
train_new, test_new = model.transform(train_X), model.transform(test_X)
train_new.shape

(641, 283)

In [5]:
# Train a plain logistic regression on the LR-selected columns, then report test accuracy
lr2 = LogisticRegression(C=0.1, random_state=0)
lr2.fit(train_new, train_y)
lr2_test_pred = lr2.predict(test_new)
accuracy_score(test_y, lr2_test_pred)

0.7833333333333333

In [6]:
# Feature selection driven by the already-fitted LinearSVC model
# (threshold string: 20000 times the mean importance)
model2 = SelectFromModel(lsvc, threshold='20000*mean', prefit=True)
train_new2, test_new2 = model2.transform(train_X), model2.transform(test_X)
train_new2.shape

(641, 184)

In [7]:
# Train an L1 LinearSVC on the LinearSVC-selected columns and report test accuracy.
# The chained .fit() already trains the model, so no second fit call is needed.
lsvc2 = LinearSVC(C=10, penalty="l1", dual=False, random_state=0).fit(train_new2, train_y)
accuracy_score(test_y, lsvc2.predict(test_new2))

0.7416666666666667

In [8]:
# Column indices retained by the LR-based selector (ind) and the LinearSVC-based selector (ind2)
ind, ind2 = [selector.get_support(indices=True) for selector in (model, model2)]

In [9]:
# Pickle the selected indices and the coefficient rankings of both L1 models
fs_dir = data_path + 'feature_selection/' + 'freq/'
for fname, payload in (('lr', ind), ('lsvc', ind2), ('lr_coef', lr_coef), ('lsvc_coef', lsvc_coef)):
    with open(fs_dir + fname, 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

### Tree-based

### ExtraTrees

In [10]:
# Fit an ExtraTrees ensemble on the full feature matrix.
# (ExtraTreesClassifier is imported with the other estimators in the notebook's import cell.)
et = ExtraTreesClassifier(n_estimators=200, random_state=0)
et.fit(train_X, train_y)

# Feature indices ordered by importance, most important first
et_im = np.argsort(et.feature_importances_)[::-1]

In [11]:
# Feature selection driven by the already-fitted ExtraTrees model
# (threshold string: 2000 times the mean importance)
model3 = SelectFromModel(et, threshold='2000*mean', prefit=True)
train_new3, test_new3 = model3.transform(train_X), model3.transform(test_X)
print(train_new3.shape)

(641, 111)


In [12]:
# Train a fresh ExtraTrees ensemble on the ExtraTrees-selected columns and report test accuracy
et2 = ExtraTreesClassifier(random_state=0, n_estimators=200)
et2.fit(train_new3, train_y)
et2_test_pred = et2.predict(test_new3)
accuracy_score(test_y, et2_test_pred)

0.625

In [13]:
# Column indices of the features kept by the ExtraTrees-based selector (model3)
ind3 = model3.get_support(indices=True)

In [14]:
# Pickle the ExtraTrees-selected indices and the ExtraTrees importance ranking
fs_dir = data_path + 'feature_selection/' + 'freq/'
for fname, payload in (('et', ind3), ('et_im', et_im)):
    with open(fs_dir + fname, 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

### RandomForest

In [15]:
# Fit a random forest on the full feature matrix
rf = RandomForestClassifier(random_state=0, n_estimators=250)
rf.fit(train_X, train_y)

# Feature indices ordered by importance, most important first
rf_importances = rf.feature_importances_
rf_im = np.argsort(rf_importances)[::-1]

In [16]:
# Feature selection driven by the already-fitted random forest
# (threshold string: 2000 times the mean importance)
model4 = SelectFromModel(rf, threshold='2000*mean', prefit=True)
train_new4, test_new4 = model4.transform(train_X), model4.transform(test_X)
print(train_new4.shape)

(641, 85)


In [17]:
# Train a fresh random forest on the forest-selected columns and report test accuracy
rf2 = RandomForestClassifier(random_state=0, n_estimators=250)
rf2.fit(train_new4, train_y)
rf2_test_pred = rf2.predict(test_new4)
accuracy_score(test_y, rf2_test_pred)

0.75

In [18]:
# Column indices of the features kept by the RandomForest-based selector (model4)
ind4 = model4.get_support(indices=True)

In [19]:
# Pickle the forest-selected indices and the forest importance ranking
fs_dir = data_path + 'feature_selection/' + 'freq/'
for fname, payload in (('rf', ind4), ('rf_im', rf_im)):
    with open(fs_dir + fname, 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

### Intersection

In [20]:
# Pairwise overlaps between the four selected index sets
# (ind: LR, ind2: LinearSVC, ind3: ExtraTrees, ind4: RandomForest)
selector_pairs = [
    (ind, ind2), (ind, ind3), (ind, ind4),
    (ind2, ind3), (ind2, ind4),
    (ind3, ind4),
]
inter, inter2, inter3, inter4, inter5, inter6 = [
    list(set(first).intersection(second)) for first, second in selector_pairs
]

## Test Performance

In [21]:
# Label -> feature indices: the four selectors first, then each pairwise intersection
inds = {
    'lr': ind,
    'lsvc': ind2,
    'et': ind3,
    'rf': ind4,
    'inter_lr_lsvc': inter,
    'inter_lr_et': inter2,
    'inter_lr_rf': inter3,
    'inter_lsvc_et': inter4,
    'inter_lsvc_rf': inter5,
    'inter_et_rf': inter6,
}

In [22]:
def test_sub_features(clf,dict_inds):
    """Fit and score an estimator on each feature subset, printing the results.

    For every entry of ``dict_inds`` an unfitted copy of ``clf`` is trained on
    the matching columns of the notebook-level ``train_X`` / ``train_y`` and
    evaluated on ``test_X`` / ``test_y``. For each subset the name, the number
    of features, the test accuracy and the test ROC AUC are printed, followed
    by a blank line.

    Parameters
    ----------
    clf : estimator
        Classifier providing ``fit``, ``predict`` and ``predict_proba``.
        It is cloned before fitting, so the object passed in is not refitted.
    dict_inds : dict
        Maps a subset name to a sequence of column indices.

    Returns
    -------
    None
        Results are only printed.
    """
    # Imported here so the function works on its own; clone() builds an
    # unfitted estimator with the same hyper-parameters.
    from sklearn.base import clone

    for name, cols in dict_inds.items():
        print('Features selected by {}'.format(name))
        print('Len: ',len(cols))
        if len(cols) == 0:
            # An empty subset (e.g. a disjoint intersection) cannot be fitted.
            print('No features selected; skipping.')
            print()
            continue

        # Slice each split once and reuse it for both metrics.
        sub_train = train_X[:,cols]
        sub_test = test_X[:,cols]

        # Fit a copy: the caller's estimator may back a
        # SelectFromModel(prefit=True) selector and must keep its fitted state.
        fitted = clone(clf)
        fitted.fit(sub_train,train_y)
        print(accuracy_score(test_y,fitted.predict(sub_test)))
        print(roc_auc_score(test_y,fitted.predict_proba(sub_test)[:,1]))
        print()

In [23]:
# Score every feature subset in `inds` using the L1 logistic-regression-CV model as the classifier
test_sub_features(lr,inds)

Features selected by lr
Len:  283
0.7
0.7507814720090935

Features selected by lsvc
Len:  184
0.7333333333333333
0.8076157999431656

Features selected by et
Len:  111
0.6333333333333333
0.6760443307757886

Features selected by rf
Len:  85
0.725
0.7956805910770106

Features selected by inter_lr_lsvc
Len:  50
0.7333333333333333
0.7647058823529412

Features selected by inter_lr_et
Len:  13
0.675
0.6864165956237569

Features selected by inter_lr_rf
Len:  19
0.7333333333333333
0.7450980392156863

Features selected by inter_lsvc_et
Len:  2
0.6083333333333333
0.548877522023302

Features selected by inter_lsvc_rf
Len:  8
0.725
0.7939755612389884

Features selected by inter_et_rf
Len:  6
0.7083333333333334
0.6705029838022165



In [24]:
# Score every feature subset in `inds` using the ExtraTrees model as the classifier
test_sub_features(et,inds)

Features selected by lr
Len:  283
0.875
0.9288150042625746

Features selected by lsvc
Len:  184
0.7833333333333333
0.881074168797954

Features selected by et
Len:  111
0.625
0.6408070474566638

Features selected by rf
Len:  85
0.7416666666666667
0.783461210571185

Features selected by inter_lr_lsvc
Len:  50
0.7583333333333333
0.8492469451548735

Features selected by inter_lr_et
Len:  13
0.675
0.6872691105427678

Features selected by inter_lr_rf
Len:  19
0.675
0.6486217675475988

Features selected by inter_lsvc_et
Len:  2
0.6083333333333333
0.548877522023302

Features selected by inter_lsvc_rf
Len:  8
0.6666666666666666
0.6764705882352942

Features selected by inter_et_rf
Len:  6
0.7083333333333334
0.6686558681443592



In [25]:
# Score every feature subset in `inds` using the random-forest model as the classifier
test_sub_features(rf,inds)

Features selected by lr
Len:  283
0.875
0.9440181869849389

Features selected by lsvc
Len:  184
0.7833333333333333
0.9009661835748792

Features selected by et
Len:  111
0.675
0.6761864165956237

Features selected by rf
Len:  85
0.75
0.8022165387894289

Features selected by inter_lr_lsvc
Len:  50
0.825
0.8653026427962489

Features selected by inter_lr_et
Len:  13
0.675
0.6872691105427678

Features selected by inter_lr_rf
Len:  19
0.6833333333333333
0.6592782040352372

Features selected by inter_lsvc_et
Len:  2
0.6083333333333333
0.548877522023302

Features selected by inter_lsvc_rf
Len:  8
0.6666666666666666
0.6868428530832623

Features selected by inter_et_rf
Len:  6
0.7083333333333334
0.6678033532253481

