# [SPLEX] TME8 : Feature Selection - Model Selection
---
PERRIN Jérémie & PODLEJSKI Witold

In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn import linear_model
import copy

In [52]:
# Reading Golub data
GX = pd.read_csv('data/Golub_X',sep=' ',header=None) # Observations
Gy = pd.read_csv('data/Golub_y',sep=' ',header=None) # Classes

# Reading Breast cancer data
BX = pd.read_csv('data/Breast.txt',sep=' ',header=None)
By = BX.values[:,30] # Classes
BX = BX.values[:,0:29] # Observations

print("Size of Golub data : ", GX.shape)
print("Size of Breast data : ", BX.shape)

Size of Golub data :  (72, 3562)
Size of Breast data :  (569, 29)


## Question 1

In [53]:
from sklearn.feature_selection import VarianceThreshold

print("With variance threshold 0.05")
sel = VarianceThreshold(threshold=0.05)
print("Size of Golub data : ", sel.fit_transform(GX).shape)
print("Size of Breast data : ", sel.fit_transform(BX).shape)

print("With variance threshold 0.99")
sel = VarianceThreshold(threshold=0.99)
#print("Size of Golub data : ", sel.fit_transform(GX).shape)
print("Size of Breast data : ", sel.fit_transform(BX).shape)

With variance threshold 0.05
Size of Golub data :  (72, 387)
Size of Breast data :  (569, 29)
With variance threshold 0.99
Size of Breast data :  (569, 29)


## Question 2

In [55]:
from sklearn.feature_selection import SelectFdr

print("With univariate selection from ANOVA :")
sel = SelectFdr(alpha=0.05)
print("Size of Golub data : ", sel.fit_transform(GX,np.ravel(Gy)).shape)
print("Size of Breast data : ", sel.fit_transform(BX,np.ravel(By)).shape)

With univariate selection from ANOVA :
Size of Golub data :  (72, 545)
Size of Breast data :  (569, 24)


## Question 3
a) Lasso 

In [81]:
print("Lasso :")
lasso = {}
clf = linear_model.Lasso(alpha=0.01)
clf.fit(GX,Gy)
lasso['G'] = copy.copy(clf)
print("Number of non zero features  for Golub data: ",len(np.where(clf.coef_ != 0)[0]))
clf.fit(BX,By)
lasso['B'] = copy.copy(clf)
print("Number of non zero features  for Breast cancer data: ",len(np.where(clf.coef_ != 0)[0]))


Lasso :
Number of non zero features  for Golub data:  27
Number of non zero features  for Breast cancer data:  16


b) SVM with L1 penalty

In [84]:
print("SVM with L1 penalty :")
svm = {}
clf = LinearSVC(C = 0.1, penalty="l1", dual=False)
clf.fit(GX,np.ravel(Gy))
svm['G'] = copy.copy(clf)
print("Number of non zero features  for Golub data: ",len(np.where(clf.coef_ != 0)[0]))
clf.fit(BX,np.ravel(By))
svm['B'] = copy.copy(clf)
print("Number of non zero features  for Breast Cancer data: ",len(np.where(clf.coef_ != 0)[0]))

SVM with L1 penalty :
Number of non zero features  for Golub data:  12
Number of non zero features  for Breast Cancer data:  14


c) Elastic Net


In [85]:
clf = ElasticNet(alpha=0.01, l1_ratio=0.7)
EN = {}
clf.fit(GX,np.ravel(Gy))
EN['G'] = copy.copy(clf)
print("Number of non zero features  for Golub data: ",len(np.where(clf.coef_ != 0)[0]))
clf.fit(BX,np.ravel(By))
EN['B'] = copy.copy(clf)
print("Number of non zero features  for Breast Cancer data: ",len(np.where(clf.coef_ != 0)[0]))

Number of non zero features  for Golub data:  40
Number of non zero features  for Breast Cancer data:  18


## Assessing the quality of the classifications

In [89]:
clfs = {'Lasso':lasso,'SVM':svm,'Elastic Net' : EN}
data = {'G':(GX,Gy),'B':(BX,By)}
for i in ['G','B']:
    best = ""
    bestacc = 0
    for name,clf in clfs.items():
        s = clf[i].score(*data[i])
        if s > bestacc:
            best = name
            bestacc = s
    print("Best Classification for " + i + " dataset is " + best + " method. With accuracy :",bestacc)

Best Classification for G dataset is SVM method. With accuracy : 0.9861111111111112
Best Classification for B dataset is SVM method. With accuracy : 0.9876977152899824
