# Recursive Feature Elimination (RFE) by Using Random Forest and Gradient Boosting Algorithm

In [1]:
# Import important libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.datasets import load_breast_cancer

In [4]:
data = load_breast_cancer()

In [5]:
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [6]:
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [7]:
X = pd.DataFrame(data = data.data, columns=data.feature_names)
y = data.target

In [8]:
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [10]:
X_train.shape, X_test.shape

((455, 30), (114, 30))

### 1. Feature selection by feature importance of random_forest_classifier

In [12]:
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=1))
sel.fit(X_train, y_train)
sel.get_support()

array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])

In [13]:
X_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [14]:
features = X_train.columns[sel.get_support()]
features # Selected features

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [24]:
len(features) # length of selected features

10

In [15]:
np.mean(sel.estimator_.feature_importances_)

0.03333333333333334

In [16]:
sel.estimator_.feature_importances_

array([0.03699612, 0.01561296, 0.06016409, 0.0371452 , 0.0063401 ,
       0.00965994, 0.0798662 , 0.08669071, 0.00474992, 0.00417092,
       0.02407355, 0.00548033, 0.01254423, 0.03880038, 0.00379521,
       0.00435162, 0.00452503, 0.00556905, 0.00610635, 0.00528878,
       0.09556258, 0.01859305, 0.17205401, 0.05065305, 0.00943096,
       0.01565491, 0.02443166, 0.14202709, 0.00964898, 0.01001304])

In [17]:
# Perform regularization
X_train_rfc = sel.transform(X_train)
X_test_rfc = sel.transform(X_test)

In [19]:
X_train_rfc.shape, X_test_rfc.shape

((455, 10), (114, 10))

In [20]:
# Function to calculate random forest ML accuracy
def run_random_forest(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy:",accuracy_score(y_test, y_pred))

In [21]:
# Call random forest on cleaned selected features dataset

import time  
# store starting time
begin = time.time()
  
run_random_forest(X_train_rfc, X_test_rfc, y_train, y_test)

time.sleep(1)
# store end time
end = time.time()
# total time taken
print(f"Time: {end - begin}")


Accuracy: 0.9473684210526315
Time: 1.3661541938781738


In [22]:
# Call random forest on origional dataset

import time  
# store starting time
begin = time.time()
  
run_random_forest(X_train, X_test, y_train, y_test)

time.sleep(1)
# store end time
end = time.time()
# total time taken
print(f"Time: {end - begin}")

Accuracy: 0.9649122807017544
Time: 1.6114847660064697


### Observation: Accuracy increased by 2% with reduced features to 10

## 2. Recursive feature elimination (RFE)

In [23]:
from sklearn.feature_selection import RFE
sel = RFE(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=1))
sel.fit(X_train, y_train)
sel.get_support()

array([ True,  True,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True, False])

In [25]:
X_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [26]:
features = X_train.columns[sel.get_support()]
features # Selected features

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'area error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')

In [27]:
len(features) # length of selected features

15

In [28]:
np.mean(sel.estimator_.feature_importances_)

0.06666666666666667

In [30]:
# Perform regularization
X_train_rfe = sel.transform(X_train)
X_test_rfe = sel.transform(X_test)

In [31]:
X_train_rfe.shape, X_test_rfe.shape

((455, 15), (114, 15))

In [32]:
# Call random forest on cleaned selected features dataset

import time  
# store starting time
begin = time.time()
  
run_random_forest(X_train_rfe, X_test_rfe, y_train, y_test)

time.sleep(1)
# store end time
end = time.time()
# total time taken
print(f"Time: {end - begin}")


Accuracy: 0.9736842105263158
Time: 1.2978975772857666


In [33]:
# Call random forest on origional dataset

import time  
# store starting time
begin = time.time()
  
run_random_forest(X_train, X_test, y_train, y_test)

time.sleep(1)
# store end time
end = time.time()
# total time taken
print(f"Time: {end - begin}")

Accuracy: 0.9649122807017544
Time: 1.315138816833496


### Observation: Accuracy increased by 2% with reduced features to 15

## 3. GradientBoost Tree Importance

In [34]:
from sklearn.ensemble import GradientBoostingClassifier
sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select=12)
sel.fit(X_train, y_train)
sel.get_support()

array([False,  True, False, False,  True, False, False,  True,  True,
       False, False, False, False,  True, False, False,  True, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False])

In [35]:
X_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [36]:
features = X_train.columns[sel.get_support()]
features # Selected features

Index(['mean texture', 'mean smoothness', 'mean concave points',
       'mean symmetry', 'area error', 'concavity error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst concavity',
       'worst concave points'],
      dtype='object')

In [37]:
len(features) # length of selected features

12

In [38]:
np.mean(sel.estimator_.feature_importances_)

0.08333333333333333

In [39]:
# Perform regularization
X_train_rfe = sel.transform(X_train)
X_test_rfe = sel.transform(X_test)

In [40]:
X_train_rfe.shape, X_test_rfe.shape

((455, 12), (114, 12))

In [41]:
# Call random forest on cleaned selected features dataset

import time  
# store starting time
begin = time.time()
  
run_random_forest(X_train_rfe, X_test_rfe, y_train, y_test)

time.sleep(1)
# store end time
end = time.time()
# total time taken
print(f"Time: {end - begin}")


Accuracy: 0.9736842105263158
Time: 1.341994047164917


In [42]:
# Call random forest on origional dataset

import time  
# store starting time
begin = time.time()
  
run_random_forest(X_train, X_test, y_train, y_test)

time.sleep(1)
# store end time
end = time.time()
# total time taken
print(f"Time: {end - begin}")

Accuracy: 0.9649122807017544
Time: 1.3527276515960693


### Observation: Accuracy increased by 2% with reduced features to 12

In [47]:
# Check how many features out of 31 are performing well with RandomForestClassifier
for index in range(1,31):
    sel = RFE(RandomForestClassifier(n_estimators=100, random_state=0), n_features_to_select=index)
    sel.fit(X_train, y_train)
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)
    print("Selected Features: ", index)
    run_random_forest(X_train_rfe, X_test_rfe, y_train, y_test)


Selected Features:  1
Accuracy: 0.8947368421052632
Selected Features:  2
Accuracy: 0.9298245614035088
Selected Features:  3
Accuracy: 0.9473684210526315
Selected Features:  4
Accuracy: 0.9649122807017544
Selected Features:  5
Accuracy: 0.9649122807017544
Selected Features:  6
Accuracy: 0.956140350877193
Selected Features:  7
Accuracy: 0.956140350877193
Selected Features:  8
Accuracy: 0.9649122807017544
Selected Features:  9
Accuracy: 0.9736842105263158
Selected Features:  10
Accuracy: 0.9736842105263158
Selected Features:  11
Accuracy: 0.9649122807017544
Selected Features:  12
Accuracy: 0.9736842105263158
Selected Features:  13
Accuracy: 0.9649122807017544
Selected Features:  14
Accuracy: 0.9736842105263158
Selected Features:  15
Accuracy: 0.9736842105263158
Selected Features:  16
Accuracy: 0.9736842105263158
Selected Features:  17
Accuracy: 0.9824561403508771
Selected Features:  18
Accuracy: 0.9649122807017544
Selected Features:  19
Accuracy: 0.9649122807017544
Selected Features:  20


### Observation: On the basis of RandomForestClassifier only 17 features we are getting 98% accuracy

In [50]:
# Check only 17 features

sel = RFE(RandomForestClassifier(n_estimators=100, random_state=0), n_features_to_select=17)
sel.fit(X_train, y_train)
X_train_rfe = sel.transform(X_train)
X_test_rfe = sel.transform(X_test)
print("Selected Features: ", 17)
run_random_forest(X_train_rfe, X_test_rfe, y_train, y_test)


Selected Features:  17
Accuracy: 0.9824561403508771


In [51]:
features = X_train.columns[sel.get_support()]
features # Best 17 selected features

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'radius error', 'area error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry'],
      dtype='object')

In [43]:
# Check how many features out of 31 are performing well with GradientBoostingClassifier
for index in range(1,31):
    sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select=index)
    sel.fit(X_train, y_train)
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)
    print("Selected Features: ", index)
    run_random_forest(X_train_rfe, X_test_rfe, y_train, y_test)


Selected Features:  1
Accuracy: 0.8771929824561403
Selected Features:  2
Accuracy: 0.9035087719298246
Selected Features:  3
Accuracy: 0.9649122807017544
Selected Features:  4
Accuracy: 0.9736842105263158
Selected Features:  5
Accuracy: 0.9649122807017544
Selected Features:  6
Accuracy: 0.9912280701754386
Selected Features:  7
Accuracy: 0.9736842105263158
Selected Features:  8
Accuracy: 0.9649122807017544
Selected Features:  9
Accuracy: 0.9736842105263158
Selected Features:  10
Accuracy: 0.956140350877193
Selected Features:  11
Accuracy: 0.956140350877193
Selected Features:  12
Accuracy: 0.9736842105263158
Selected Features:  13
Accuracy: 0.956140350877193
Selected Features:  14
Accuracy: 0.956140350877193
Selected Features:  15
Accuracy: 0.9649122807017544
Selected Features:  16
Accuracy: 0.956140350877193
Selected Features:  17
Accuracy: 0.9649122807017544
Selected Features:  18
Accuracy: 0.9473684210526315
Selected Features:  19
Accuracy: 0.9649122807017544
Selected Features:  20
Acc

In [44]:
### Observation: On the basis of only 6 features we are getting 99% accuracy

In [45]:
# Check only 6 features

sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select=6)
sel.fit(X_train, y_train)
X_train_rfe = sel.transform(X_train)
X_test_rfe = sel.transform(X_test)
print("Selected Features: ", 6)
run_random_forest(X_train_rfe, X_test_rfe, y_train, y_test)


Selected Features:  6
Accuracy: 0.9912280701754386


In [46]:
features = X_train.columns[sel.get_support()]
features # Best 6 selected features

Index(['mean concave points', 'area error', 'worst texture', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')