In [1]:
# Importing libraries
import numpy as np
import pandas as pd
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
import warnings
warnings.filterwarnings('ignore')

In [2]:
# sample example ************************************************************

In [3]:
# Load the Pima Indians diabetes data set.
# The column labels are supplied here via `names`, so the file is parsed
# without consuming any row as a header.
url = "https://raw.githubusercontent.com/Santanukolkata/Data_Science/master/datasets/pima-indians-diabetes.csv"

# Every column label, with the target ('class') in the last position.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# Predictor labels only: everything except the trailing target column.
colnames = names[:-1]

df = pd.read_csv(url, header=None, names=names)

In [4]:
# Preview the first rows to check that the supplied column names line up with the data.
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Pull the frame's values out as a plain ndarray, then split it column-wise:
# columns 0..7 are the predictors (X), column 8 is the target (y).
array = df.values
X, y = array[:, :8], array[:, 8]

In [6]:
# sklearn.feature_selection.RFE (Recursive Feature Elimination) -- Parameters:
# estimator : A supervised learning estimator 
#             with a fit method that provides information about feature importance 
#             either through a coef_ attribute or through a feature_importances_ attribute.

# n_features_to_select : int or None (default=None)
#                        The number of features to select. 
#                        If None, half of the features are selected.

# step : int or float, optional (default=1)
#        If greater than or equal to 1, then step corresponds to the (integer) 
#        number of features to remove at each iteration. 
#        If within (0.0, 1.0), then step corresponds to the percentage (rounded down) 
#        of features to remove at each iteration.

# verbose : int, (default=0)
#            Controls verbosity of output.

# Attributes:	
# n_features_ : int
# The number of selected features.

# support_ : array of shape [n_features]
#            The mask of selected features.

# ranking_ : array of shape [n_features]
#            The feature ranking, such that ranking_[i] corresponds to the ranking position 
#            of the i-th feature. 
#            Selected (i.e., estimated best) features are assigned rank 1.

In [7]:
# feature extraction
# The example below uses RFE with the logistic regression algorithm to select the top 4 features.
# The choice of algorithm does not matter too much as long as it is skillful and consistent.
# NOTE(review): warnings are globally silenced in the imports cell, so a solver
# convergence warning from LogisticRegression would not be shown here -- confirm
# the fit converges before trusting the ranking.
model = LogisticRegression()

# n_features_to_select is passed by keyword: recent scikit-learn releases make
# every RFE argument after `estimator` keyword-only, so the positional form
# RFE(model, 4) raises a TypeError there.
rfe   = RFE(model, n_features_to_select=4, verbose=10)
selector = rfe.fit(X, y)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.


In [8]:
# Summary of the fitted logistic-regression RFE selector.
#   n_features_            -> how many features were kept
#   support_               -> boolean mask over the input features
#   get_support(indices=1) -> positions of the kept features
#   ranking_               -> ranking_[i] is the rank of the i-th feature;
#                             selected (i.e., estimated best) features have rank 1
report = [
    ('Number of selected features     : ', selector.n_features_),
    ('Mask of selected features       : ', selector.support_),
    ('Indices of selected features    : ', selector.get_support(indices=True)),
    ('Rankings of selected features   : ', selector.ranking_),
]
for label, value in report:
    print(label, value)

# Per-feature verdict: rank 1 means the feature survived the elimination.
for feature_name, rank in zip(colnames, selector.ranking_):
    verdict = ' to be included in the model' if rank == 1 else ' to be DROPPED from the model'
    print('feature : ', feature_name, verdict)

Number of selected features     :  4
Mask of selected features       :  [ True  True False False False  True  True False]
Indices of selected features    :  [0 1 5 6]
Rankings of selected features   :  [1 1 3 4 5 1 1 2]
feature :  preg  to be included in the model
feature :  plas  to be included in the model
feature :  pres  to be DROPPED from the model
feature :  skin  to be DROPPED from the model
feature :  test  to be DROPPED from the model
feature :  mass  to be included in the model
feature :  pedi  to be included in the model
feature :  age  to be DROPPED from the model


In [9]:
# Repeat the feature elimination with a decision tree as the ranking estimator
# (RFE uses its feature_importances_ instead of coef_).
# A fixed random_state makes the tree -- and therefore the resulting feature
# ranking -- reproducible across Restart & Run All; without it, ties between
# equally good splits are broken randomly and the selection can change per run.
model_dt = DecisionTreeClassifier(random_state=42)

# n_features_to_select is passed by keyword: recent scikit-learn releases make
# every RFE argument after `estimator` keyword-only, so the positional form
# RFE(model_dt, 4) raises a TypeError there.
rfe_dt   = RFE(model_dt, n_features_to_select=4, verbose=10)
selector_dt = rfe_dt.fit(X, y)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.


In [10]:
# Summary of the fitted decision-tree RFE selector.
#   n_features_            -> how many features were kept
#   support_               -> boolean mask over the input features
#   get_support(indices=1) -> positions of the kept features
#   ranking_               -> ranking_[i] is the rank of the i-th feature;
#                             selected (i.e., estimated best) features have rank 1
summary_dt = (
    ('Number of selected features     : ', selector_dt.n_features_),
    ('Mask of selected features       : ', selector_dt.support_),
    ('Indices of selected features    : ', selector_dt.get_support(indices=True)),
    ('Rankings of selected features   : ', selector_dt.ranking_),
)
for heading, content in summary_dt:
    print(heading, content)

# Per-feature verdict: rank 1 means the feature survived the elimination.
for column, position in zip(colnames, selector_dt.ranking_):
    if position == 1:
        outcome = ' to be included in the model'
    else:
        outcome = ' to be DROPPED from the model'
    print('feature : ', column, outcome)

Number of selected features     :  4
Mask of selected features       :  [False  True False False False  True  True  True]
Indices of selected features    :  [1 5 6 7]
Rankings of selected features   :  [3 1 2 5 4 1 1 1]
feature :  preg  to be DROPPED from the model
feature :  plas  to be included in the model
feature :  pres  to be DROPPED from the model
feature :  skin  to be DROPPED from the model
feature :  test  to be DROPPED from the model
feature :  mass  to be included in the model
feature :  pedi  to be included in the model
feature :  age  to be included in the model
