Step Forward, Step Backward, and Exhaustive Feature Selection

-Use combinations of variables to determine predictive power

-Find the best combination of variables

-More computationally expensive than filter methods

-Usually performs better than filter methods

-Not recommended when the number of features is large

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [4]:
from sklearn.datasets import load_wine

In [5]:
# Load the wine dataset bundle and list the fields it exposes.
data = load_wine()
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [6]:
# Print the description text that ships with the dataset.
print(data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [8]:
# Class labels, and the feature matrix wrapped in a DataFrame so that the
# columns carry the dataset's feature names.
y = data.target
x = pd.DataFrame(data=data.data, columns=data.feature_names)

In [31]:
# Preview the first rows of the feature table.
x.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [32]:
# Count missing values per column (isna is the same check as isnull).
x.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [33]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
x_train.shape, x_test.shape

((142, 13), (36, 13))

## Step Forward Feature Selection (SFS)

In [34]:
# Forward selection: features are added one at a time until exactly 7 are
# selected. Each candidate subset is scored by accuracy with cv=4.
sfs = SFS(
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    k_features=7,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    1.5s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    2.7s finished

[2020-11-27 15:58:26] Features: 1/7 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    1.6s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    2.7s finished

[2020-11-27 15:58:29] Features: 2/7 -- score: 0.9718253968253968[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    1.5s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    1.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    2.9s finished

[2020-11-27 15:58:32] Features: 3/7 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend Lok

In [35]:
# Names of the features kept by the forward selection.
sfs.k_feature_names_

('alcohol',
 'ash',
 'magnesium',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'proline')

In [36]:
# Column indices of the selected features.
sfs.k_feature_idx_

(0, 2, 4, 6, 8, 9, 12)

In [37]:
# Average cross-validation score of the selected feature subset.
sfs.k_score_

0.9791666666666666

In [38]:
# Per-step summary: one row per subset size, listing the chosen features,
# the per-fold scores and their average/spread.
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7222222222222222, 0.8333333333333334, 0.742...",0.76746,"(flavanoids,)",0.0670901,0.0418533,0.024164
2,"(6, 9)","[0.9444444444444444, 1.0, 0.9714285714285714, ...",0.971825,"(flavanoids, color_intensity)",0.031492,0.0196459,0.0113425
3,"(4, 6, 9)","[0.9722222222222222, 1.0, 0.9714285714285714, ...",0.985913,"(magnesium, flavanoids, color_intensity)",0.0225862,0.0140901,0.00813492
4,"(4, 6, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(magnesium, flavanoids, color_intensity, proline)",0.0194714,0.012147,0.00701308
5,"(2, 4, 6, 9, 12)","[0.9444444444444444, 0.9722222222222222, 0.971...",0.972024,"(ash, magnesium, flavanoids, color_intensity, ...",0.0314903,0.0196449,0.011342
6,"(2, 4, 6, 8, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(ash, magnesium, flavanoids, proanthocyanins, ...",0.0194714,0.012147,0.00701308
7,"(0, 2, 4, 6, 8, 9, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, ash, magnesium, flavanoids, proantho...",0.0369201,0.0230321,0.0132976


In [39]:
# Forward selection again, but with a (min, max) range for k_features: the
# search runs up to 8 features and keeps the best-scoring subset size found
# within 1..8 instead of a fixed count.
sfs = SFS(
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    k_features=(1, 8),
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    1.6s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    3.1s finished

[2020-11-27 15:59:03] Features: 1/8 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    1.4s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.0s finished

[2020-11-27 15:59:06] Features: 2/8 -- score: 0.9718253968253968[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    1.4s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    1.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    2.7s finished

[2020-11-27 15:59:09] Features: 3/8 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend Lok

In [40]:
# Average cross-validation score of the best subset found in the 1..8 range.
sfs.k_score_

0.9859126984126985

In [42]:
# Names and column indices of the best subset found in the 1..8 range.
sfs.k_feature_names_ ,sfs.k_feature_idx_

(('magnesium', 'flavanoids', 'color_intensity'), (4, 6, 9))

## Step Backward Selection (SBS)

In [44]:
# Backward selection (forward=False): start from all features and drop one
# at a time, keeping the best-scoring subset whose size falls within 1..8.
sbs = SFS(
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    k_features=(1, 8),
    forward=False,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    1.5s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    3.0s finished

[2020-11-27 16:02:47] Features: 12/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    1.4s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.0s finished

[2020-11-27 16:02:50] Features: 11/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    1.5s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    1.6s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    2.7s finished

[2020-11-27 16:02:53] Features: 10/1 -- score: 0.9791666666666666[Parallel(n_jobs=-1)]: Using backend 

In [46]:
# Average cross-validation score of the subset kept by backward selection.
sbs.k_score_

0.9859126984126985

In [47]:
# Names and column indices of the subset kept by backward selection.
sbs.k_feature_names_ , sbs.k_feature_idx_

(('alcohol',
  'malic_acid',
  'ash',
  'alcalinity_of_ash',
  'magnesium',
  'flavanoids',
  'nonflavanoid_phenols',
  'color_intensity'),
 (0, 1, 2, 3, 4, 6, 7, 9))

## Exhaustive Feature Selection (EFS)

In [48]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [None]:
# Exhaustive search over every combination of 4 to 5 features.
# cv=4 matches the sequential selectors in this notebook. With cv=None mlxtend
# skips cross-validation and scores each subset on the same data it was fitted
# on, where a random forest scores near-perfectly for most subsets, so the
# "best" subset would be essentially arbitrary.
# NOTE: this cell takes a very long time to run (thousands of subsets, each
# cross-validated with a 100-tree forest).
efs = EFS(
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    min_features=4,
    max_features=5,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
).fit(x_train, y_train)

In [1]:
# Best score found by the exhaustive search. The fitted attribute carries a
# trailing underscore (best_score_); `best_score` does not exist.
efs.best_score_

NameError: name 'efs' is not defined

In [None]:
# Names and column indices of the best feature combination found by the
# exhaustive search.
efs.best_feature_names_ , efs.best_idx_