# Feature Selection Methods

## 1. Univariate Feature Selection

In [48]:
# out of 8 features of dataset which one is most important in predicting class?
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from pandas import read_csv
import pandas as pd
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest # for univariate selection
from sklearn.feature_selection import chi2

In [49]:
# load data, Patient's details taken, Identify which variable is most important in identifying whether patient has dibetes or not
filename ='C:\\Users\\pawar\\Documents\\Downloads\\pima-indians-diabetes.data.csv'
# In actual dataset no column names so give col names, Class - Y variable  
name = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe= read_csv(filename, names=name)
array = dataframe.values
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [50]:
dataframe.shape

(768, 9)

In [51]:
X = array[:,0:8]
Y = array[:,8] # separate them in X and Y

# feature extraction 
test = SelectKBest(score_func=chi2, k=5) # get top 5 features
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=4)
print(fit.scores_) # will get chi squares for each feature
features = fit.transform(X)
#For regression: f_regression, mutual_info_regression
#For classification: chi2, f_classif, mutual_info_classif
# for test: chi square is 2175.5653, so this is most imp feature, then plas,pedi,....

[ 111.5197 1411.887    17.6054   53.108  2175.5653  127.6693    5.3927
  181.3037]


In [52]:
name

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [53]:
type(test)

sklearn.feature_selection._univariate_selection.SelectKBest

In [54]:
features=fit.transform(X)
features.shape

(768, 5)

In [55]:
x

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,111.519691,1411.887041,17.605373,53.10804,2175.565273,127.669343,5.392682,181.303689


In [56]:
x.columns=name[:8]
x
# As per chi square test, test is most important feature, Next..plas, age, mass, preg

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,111.519691,1411.887041,17.605373,53.10804,2175.565273,127.669343,5.392682,181.303689


In [57]:
#sort chi-square value in ascending or decending order
sorted_x= x.sort_values(by = 0,ascending= False, axis=1)
sorted_x

Unnamed: 0,test,plas,age,mass,preg,skin,pres,pedi
0,2175.565273,1411.887041,181.303689,127.669343,111.519691,53.10804,17.605373,5.392682


In [58]:
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## 2.Recursive Feature Elimination

#### Feature Scaling (standardization, Normalization etc.)

In [59]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data, Patient's details taken, Identify which variable is most important in identifying whether patient has dibetes or not
filename ='C:\\Users\\pawar\\Documents\\Downloads\\pima-indians-diabetes.data.csv'
# In actual dataset no column names so give col names, Class - Y variable  
name = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe= read_csv(filename, names=name)
array = dataframe.values
# feature extraction
model = LogisticRegression(max_iter=400)# to find out coefficients of logistic regression, for that it need to iterate
# inside logistic regression optimization algorithm is there. Using that coefficients will be estimated.
# For that it has to iterate multiple times
rfe = RFE(model)# see best 3 features
fit = rfe.fit(X, Y)

In [60]:
#Num Features: 4 top features are available
fit.n_features_

4

In [61]:
#Selected Features:
fit.support_

array([ True,  True, False, False, False,  True,  True, False])

In [62]:
name

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [63]:
# Feature Ranking:
fit.ranking_

array([1, 1, 3, 5, 4, 1, 1, 2])

## 3.Feature Importance using Decision Tree

In [64]:
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.tree import  DecisionTreeClassifier
# load data, Patient's details taken, Identify which variable is most important in identifying whether patient has dibetes or not
filename ='C:\\Users\\pawar\\Documents\\Downloads\\pima-indians-diabetes.data.csv'
# In actual dataset no column names so give col names, Class - Y variable  
name = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe= read_csv(filename, names=name)
array = dataframe.values
# feature extraction
model = DecisionTreeClassifier()
model.fit(X, Y)
print(model.feature_importances_) # feature importance score

[0.0688 0.346  0.0846 0.019  0.0295 0.2157 0.1243 0.1121]


In [65]:
sorted_list=model.feature_importances_
print("Feature importance=",sorted_list) # Before soting
sorted_list.sort()
print("Sorted Feature importance=",sorted_list) # After sorting  


Feature importance= [0.0688 0.346  0.0846 0.019  0.0295 0.2157 0.1243 0.1121]
Sorted Feature importance= [0.019  0.0295 0.0688 0.0846 0.1121 0.1243 0.2157 0.346 ]


In [66]:
names # plas is most imp feature, then mass,age,pedi
# find common amongst all methods

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Model Validation Methods

### 1.Evaluate using a train and a test set

In [71]:
# Evaluate using a train and a test set
from pandas import read_csv
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
filename ='C:\\Users\\pawar\\Documents\\Downloads\\pima-indians-diabetes.data.csv'
# In actual dataset no column names so give col names, Class - Y variable  
name = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe= read_csv(filename, names=name)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [72]:
result

0.7874015748031497

In [73]:
result*100.0   # 78% is the accuracy of the model

78.74015748031496

 dataframe

### 2.Evaluate using K-Fold Cross Validation

In [75]:
# Evaluate using Cross Validation
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [76]:
filename ='C:\\Users\\pawar\\Documents\\Downloads\\pima-indians-diabetes.data.csv'
# In actual dataset no column names so give col names, Class - Y variable  
name = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe= read_csv(filename, names=name)

In [77]:
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [78]:
array = dataframe.values

# Split data in train and test data
X = array[:,0:8]
Y = array[:,8]

num_folds = 10
seed = 7

kfold = KFold(n_splits=num_folds)
model = LogisticRegression(max_iter=200)
results = cross_val_score(model, X, Y, cv=kfold)

In [79]:
results # accuracy of 10 models

array([0.6883, 0.8182, 0.7662, 0.6883, 0.7792, 0.7922, 0.8442, 0.8312,
       0.75  , 0.8026])

In [80]:
results.mean()*100.0 # Final accuracy is the mean of all accuracies

77.60423786739577

In [81]:
results.std()*100.0 
# + or - 5% standard deviation for accuracy. If Std is very high means models are very inconsistent for this d

5.157545262086822

### 3.Evaluate using Leave One Out Cross Validation

In [82]:
# Evaluate using Leave One Out Cross Validation
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
filename ='C:\\Users\\pawar\\Documents\\Downloads\\pima-indians-diabetes.data.csv'
# In actual dataset no column names so give col names, Class - Y variable  
name = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe= read_csv(filename, names=name)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
loocv = LeaveOneOut()
model = LogisticRegression(max_iter=300)
results = cross_val_score(model, X, Y, cv=loocv)

In [83]:
results

array([1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
       1., 1., 0., 1., 1.

In [84]:
results.mean()

0.7760416666666666

In [85]:
X.shape

(768, 8)

In [86]:
results.mean()*100.0

77.60416666666666

In [87]:
results.std()*100.0 
# Here accuracy is either 0% or 100% so we are getting high std.
# So don't consider Std. here

41.68944689773287

In [88]:
results # 1 indicates 100 % accuracy and 0 indicates 0 % accuracy here

array([1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
       1., 1., 0., 1., 1.

In [89]:
import numpy as np
np.array([100,100,100,0,0]).std() # check std of values 100 and 0

48.98979485566356