Feature selection is important because it can:
1. reduce overfitting
2. improve accuracy
3. reduce training time

Different methods for feature selection are:
1. Univariate selection
2. Recursive feature elimination
3. Principal component analysis (PCA)
4. Feature importance

In [1]:
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, RFE, chi2
from sklearn.linear_model import LogisticRegression 
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
## Univariate selection
print("Feature selection with Univariate statistical tests (Chi-square for classification)")

# Read the dataset; column names are supplied explicitly through `names`.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
file = '../data/pima-indians-diabetes.data.csv'
df = read_csv(file, names=names)

# Split the raw matrix: the first eight columns are the predictors and the
# ninth column ('class') is the target.
array = df.to_numpy()
X, Y = array[:, :8], array[:, 8]

# Score every predictor against the target with the chi-squared test and keep
# the four highest-scoring ones.
selector = SelectKBest(chi2, k=4).fit(X, Y)

# Chi-squared score of each predictor, in column order.
print(selector.scores_)

# Reduced feature matrix; show only its first five rows.
features = selector.transform(X)
print(features[:5])

Feature selection with Univariate statistical tests (Chi-square for classification)
[ 111.51969064 1411.88704064   17.60537322   53.10803984 2175.56527292
  127.66934333    5.39268155  181.30368904]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


Recursive Feature Elimination

In [3]:
# Recursive feature elimination: wrap a logistic regression in RFE, which
# discards one feature per iteration (step=1) until three are left.
estimator = LogisticRegression(solver='lbfgs', max_iter=1000)
rfe = RFE(estimator, n_features_to_select=3, step=1)
rfe.fit(X, Y)

# Report the number of retained features, the boolean keep-mask, and the rank
# assigned to each feature (retained features carry rank 1).
print("Num Features: %d" % rfe.n_features_)
print("Selected Features: %s" % rfe.support_)
print("Feature  Ranking: %s " % rfe.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature  Ranking: [1 2 4 6 5 1 1 3] 


Principal Component Analysis

In [4]:
# Seed for the randomized estimator below, so the cell prints the same numbers
# on every Restart & Run All.
SEED = 42

# feature extraction
# NOTE: PCA is fit on the raw, unscaled columns, so the leading components are
# driven by whichever columns have the largest numeric spread. Standardize X
# first if the columns should contribute on an equal footing.
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components: share of total variance explained by each component,
# followed by the loading of every original column on each component
print("Explained variance\n %s" % fit.explained_variance_ratio_)
print(fit.components_)

print("-- FEATURE IMPORTANCE -- ")
# feature extraction
# Extra-trees draws random split thresholds (and random feature subsets), so
# without a fixed random_state the importances differ from run to run.
model = ExtraTreesClassifier(random_state=SEED)
model.fit(X, Y)
# impurity-based importance of each column, in column order
print(model.feature_importances_)

Explained variance
 [0.88854663 0.06159078 0.02579012]
[[-2.02176587e-03  9.78115765e-02  1.60930503e-02  6.07566861e-02
   9.93110844e-01  1.40108085e-02  5.37167919e-04 -3.56474430e-03]
 [ 2.26488861e-02  9.72210040e-01  1.41909330e-01 -5.78614699e-02
  -9.46266913e-02  4.69729766e-02  8.16804621e-04  1.40168181e-01]
 [ 2.24649003e-02 -1.43428710e-01  9.22467192e-01  3.07013055e-01
  -2.09773019e-02  1.32444542e-01  6.39983017e-04  1.25454310e-01]]
-- FEATURE IMPORTANCE -- 
[0.11038804 0.2366037  0.10048613 0.07789462 0.07405263 0.13982619
 0.12040095 0.14034774]
