In [1]:
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import os

In [2]:
# Load the diabetes CSV from the current working directory.
# The file is read with explicit column labels: eight inputs followed by 'class'.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = os.getcwd() + '/datasets_228_482_diabetes.csv'
dataframe = read_csv(filename, names=names)

# Split the raw values into the input matrix (first eight columns)
# and the target vector (ninth column, labelled 'class').
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

## Univariate Feature selection with SelectKBest

In [3]:
# Univariate selection: score every column of X against Y with f_classif
# and keep the k=4 best-scoring columns. f_classif can be swapped for
# another scoring function (e.g. chi squared).
test = SelectKBest(f_classif, k=4)
fit = test.fit(X, Y)

In [4]:
# Reduce X to the columns kept by the fitted selector.
features = fit.transform(X)

# Show the per-column scores, rounded to three decimals for readability.
set_printoptions(precision=3)
print(fit.scores_)

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]


In [5]:
# Preview the first five rows of the reduced feature matrix (all selected columns).
print(features[:5])

[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


## Feature selection using Recursive Feature elimination

In [21]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import warnings
# NOTE(review): called with no category/module filter, so this silences EVERY
# warning for the rest of the kernel session (deprecation and convergence
# warnings included). Consider narrowing it to the specific warning category
# that is known to be noise, or removing it once the cause is addressed.
warnings.filterwarnings('ignore')

In [22]:
# Recursive feature elimination driven by a logistic-regression model:
# keep the 3 features the model ranks highest.
model = LogisticRegression(solver='lbfgs')
# n_features_to_select must be passed by keyword. scikit-learn deprecated the
# positional form in 0.23 and made the argument keyword-only in 1.0, where
# `RFE(model, 3)` raises TypeError.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# support_ is a boolean mask over the columns of X; ranking_ assigns 1 to
# every selected column and larger numbers to columns eliminated earlier.
print('Num of Features:', fit.n_features_)
print('Selected Features: ', fit.support_)
print('Feature Ranking: ',fit.ranking_)

Num of Features: 3
Selected Features:  [ True False False False False  True  True False]
Feature Ranking:  [1 2 4 5 6 1 1 3]


In [23]:
# Show the column labels in file order so positional outputs (boolean masks,
# rankings) can be matched to feature names. The first eight labels are the
# columns of X; the last one, 'class', is the target Y and not a feature.
print(names)

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']


So it selected 3 features: "preg", "mass" and "pedi" (the positions marked `True` in the support mask, i.e. the ones ranked 1).

## Feature importance using an ensemble of randomized decision trees (Extra Trees)

In [24]:
from sklearn.ensemble import ExtraTreesClassifier

In [26]:
# Fit an Extra-Trees ensemble and report its impurity-based feature importances
# (one value per column of X, in column order).
# random_state is fixed because the trees are built with randomized splits:
# without a seed the importances change on every run and the notebook is not
# reproducible. Numbers will differ from outputs saved by earlier unseeded runs.
model = ExtraTreesClassifier(n_estimators=10, random_state=0)
model.fit(X, Y)
print('Feature importances:',model.feature_importances_)

Feature importances: [0.111 0.223 0.105 0.095 0.079 0.132 0.121 0.133]


## Reducing dimensionality using PCA

In [30]:
from sklearn.decomposition import PCA

In [32]:
# Fit a 3-component PCA on X. `fit` is bound to the same fitted estimator as
# `pca` (fit() returns the estimator itself).
# NOTE(review): X is passed exactly as loaded; no scaling step is visible in
# this notebook, so confirm that unscaled inputs are intended.
fit = pca = PCA(n_components=3).fit(X)

# Report the share of variance captured by each component, followed by the
# component vectors (one row per component, one entry per column of X).
print('Explained Variance: ',fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance:  [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


## Implementing LASSO regression (uses L1 penalty)

In [35]:
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [37]:
# Grid-search the Lasso regularisation strength (alpha) over three candidate
# values, using 5-fold cross-validation scored by negative mean squared error.
# NOTE(review): Y is the 'class' column that the earlier cells treat as a
# classification target; here it is fit as a regression target. Confirm intended.
parameters = {
    'alpha': [0.01, 0.001, 0.0001],
}
lasso_regressor = GridSearchCV(
    estimator=Lasso(),
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_regressor.fit(X, Y)

# Best alpha found and its mean cross-validated score (negated MSE, so closer
# to zero is better).
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)

{'alpha': 0.0001}
-0.1625266035931419
