In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score



In [3]:
from sklearn.datasets import load_breast_cancer

In [4]:
# Load scikit-learn's bundled breast-cancer dataset (a dict-like container).
data = load_breast_cancer()
# Show which fields the container exposes.
data.keys()


dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [5]:
# Print the dataset's built-in description text.
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

In [6]:
# Wrap the raw feature matrix in a DataFrame so every column carries its name.
X = pd.DataFrame(data.data, columns=data.feature_names)
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
# Class labels: the dataset's `target` array, one entry per row of X.
y = data.target

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train.shape, X_test.shape

((455, 30), (114, 30))

In [9]:
# Rank features with a random forest and let SelectFromModel keep the important ones.
# No threshold is passed, so SelectFromModel applies its default cut-off.
sel = SelectFromModel(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
).fit(X_train, y_train)
sel.get_support()


array([ True, False,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True, False,  True,  True, False, False, False,
        True, False, False])

In [10]:
# Column names of the training frame, for reading the boolean support mask against.
X_train.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [11]:
# Map the selector's boolean support mask back onto column names.
features = X_train.columns[sel.get_support()]
features

Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst concave points'],
      dtype='object')

In [12]:
# How many columns SelectFromModel kept.
len(features)

10

In [13]:
# Average importance across all features of the fitted forest.
sel.estimator_.feature_importances_.mean()

np.float64(0.03333333333333334)

In [14]:
# Importances from the forest fitted inside the selector, one value per input column.
sel.estimator_.feature_importances_


array([0.03699612, 0.01561296, 0.06016409, 0.0371452 , 0.0063401 ,
       0.00965994, 0.0798662 , 0.08669071, 0.00474992, 0.00417092,
       0.02407355, 0.00548033, 0.01254423, 0.03880038, 0.00379521,
       0.00435162, 0.00452503, 0.00556905, 0.00610635, 0.00528878,
       0.09556258, 0.01859305, 0.17205401, 0.05065305, 0.00943096,
       0.01565491, 0.02443166, 0.14202709, 0.00964898, 0.01001304])

In [15]:
# Reduce both splits to the columns chosen by the fitted selector.
X_train_rfc, X_test_rfc = (sel.transform(split) for split in (X_train, X_test))

In [16]:
def run_randomForest(X_train, X_test, y_train, y_test, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and report test accuracy.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices; both must contain the same columns.
    y_train, y_test : array-like of shape (n_samples,)
        Labels matching the rows of ``X_train`` / ``X_test``.
    n_estimators : int, default=100
        Number of trees in the forest.
    random_state : int, default=0
        Seed forwarded to the classifier so repeated runs are comparable.

    Returns
    -------
    float
        Accuracy on the test split. The same value is still printed, so
        cells that only relied on the printed line keep working.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, n_jobs=-1)
    clf.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    print('Accuracy: ', accuracy)
    # Hand the score back as well, so callers can collect and compare results
    # programmatically instead of reading them off the printed output.
    return accuracy

In [17]:
%%time
# Score a forest trained only on the columns kept by SelectFromModel.
run_randomForest(X_train_rfc, X_test_rfc, y_train, y_test)

Accuracy:  0.9473684210526315
CPU times: user 426 ms, sys: 17.1 ms, total: 443 ms
Wall time: 1.14 s


In [18]:
%%time
# Baseline for comparison: the same forest trained on all columns.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9649122807017544
CPU times: user 506 ms, sys: 21 ms, total: 527 ms
Wall time: 1.26 s


In [19]:
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by a random forest, keeping 15 columns.
# NOTE: this rebinds `sel`, replacing the SelectFromModel selector fitted earlier.
sel = RFE(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    n_features_to_select=15,
)
sel.fit(X_train, y_train)

In [20]:
# Boolean mask over the input columns: True where RFE kept the feature.
sel.get_support()


array([ True,  True,  True,  True, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True, False])

In [21]:
# Names of the columns kept by the random-forest RFE (overwrites the earlier `features`).
features = X_train.columns[sel.get_support()]
features

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean concavity', 'mean concave points', 'area error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')

In [22]:
# Sanity check: should match the n_features_to_select requested for this RFE (15).
len(features)

15

In [23]:
# Reduce both splits to the columns kept by the random-forest RFE.
X_train_rfe, X_test_rfe = (sel.transform(split) for split in (X_train, X_test))


In [24]:
%%time
# Score a forest trained only on the 15 columns kept by the random-forest RFE.
run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)

Accuracy:  0.9736842105263158
CPU times: user 412 ms, sys: 15.6 ms, total: 427 ms
Wall time: 763 ms


In [25]:
%%time
# Baseline repeated (all columns) so its timing sits next to the RFE run above.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9649122807017544
CPU times: user 491 ms, sys: 23.7 ms, total: 514 ms
Wall time: 960 ms


In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
# Recursive feature elimination driven by gradient boosting, keeping 12 columns.
# NOTE: this rebinds `sel`, replacing the random-forest RFE fitted earlier.
sel = RFE(
    estimator=GradientBoostingClassifier(n_estimators=100, random_state=0),
    n_features_to_select=12,
)
sel.fit(X_train, y_train)

In [28]:
# Boolean mask over the input columns: True where the gradient-boosting RFE kept the feature.
sel.get_support()

array([False,  True, False, False,  True, False, False,  True,  True,
       False, False, False, False,  True, False, False,  True, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False])

In [29]:
# Names of the columns kept by the gradient-boosting RFE (overwrites the earlier `features`).
features = X_train.columns[sel.get_support()]
features

Index(['mean texture', 'mean smoothness', 'mean concave points',
       'mean symmetry', 'area error', 'concavity error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area', 'worst concavity',
       'worst concave points'],
      dtype='object')

In [30]:
# Sanity check: should match the n_features_to_select requested for this RFE (12).
len(features)

12

In [31]:
# Reduce both splits to the columns kept by the gradient-boosting RFE
# (overwrites the arrays produced from the random-forest RFE).
X_train_rfe, X_test_rfe = (sel.transform(split) for split in (X_train, X_test))

In [32]:
%%time
# Score a forest trained only on the 12 columns kept by the gradient-boosting RFE.
run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)

Accuracy:  0.9736842105263158
CPU times: user 302 ms, sys: 31.4 ms, total: 334 ms
Wall time: 275 ms


In [33]:
%%time
# Baseline repeated (all columns) so its timing sits next to the RFE run above.
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy:  0.9649122807017544
CPU times: user 369 ms, sys: 25 ms, total: 394 ms
Wall time: 318 ms


In [34]:
# Sweep the number of features kept by RFE (gradient-boosting ranker) from a
# single feature up to all of them, scoring a random forest on each subset.
# The upper bound is taken from the data instead of a hard-coded 30, so the
# sweep stays correct if columns are added or dropped upstream.
# NOTE(review): scores are computed on the test split; choosing the "best"
# feature count from this sweep uses the test set for model selection.
n_total_features = X_train.shape[1]
for index in range(1, n_total_features + 1):
    sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0), n_features_to_select = index)
    sel.fit(X_train, y_train)
    X_train_rfe = sel.transform(X_train)
    X_test_rfe = sel.transform(X_test)
    print('Selected Feature: ', index)
    run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)
    print()

Selected Feature:  1
Accuracy:  0.8771929824561403

Selected Feature:  2
Accuracy:  0.9035087719298246

Selected Feature:  3
Accuracy:  0.9649122807017544

Selected Feature:  4
Accuracy:  0.9736842105263158

Selected Feature:  5
Accuracy:  0.9649122807017544

Selected Feature:  6
Accuracy:  0.9912280701754386

Selected Feature:  7
Accuracy:  0.9736842105263158

Selected Feature:  8
Accuracy:  0.9649122807017544

Selected Feature:  9
Accuracy:  0.9736842105263158

Selected Feature:  10
Accuracy:  0.956140350877193

Selected Feature:  11
Accuracy:  0.956140350877193

Selected Feature:  12
Accuracy:  0.9736842105263158

Selected Feature:  13
Accuracy:  0.956140350877193

Selected Feature:  14
Accuracy:  0.9649122807017544

Selected Feature:  15
Accuracy:  0.9649122807017544

Selected Feature:  16
Accuracy:  0.956140350877193

Selected Feature:  17
Accuracy:  0.956140350877193

Selected Feature:  18
Accuracy:  0.956140350877193

Selected Feature:  19
Accuracy:  0.956140350877193

Selected 

In [35]:
# Sweep RFE subset sizes from 1 feature to all of them and, for each size,
# report which columns were kept and how a random forest scores on them.
#
# RFE with its default step of 1 drops one feature per round, and the ranker
# is seeded, so the subset kept for size k is exactly the set of features
# whose final rank is <= k when elimination runs all the way down to a single
# feature. One fit therefore yields every subset, instead of repeating the
# whole elimination once per size.
sel = RFE(GradientBoostingClassifier(n_estimators=100, random_state=0),
          n_features_to_select=1)
sel.fit(X_train, y_train)

# Plain NumPy views of the splits; by default sel.transform() also yields
# plain arrays for DataFrame input, so the forest sees the same kind of data.
X_train_values = X_train.to_numpy()
X_test_values = X_test.to_numpy()

# Upper bound comes from the data rather than a hard-coded 30.
for index in range(1, X_train.shape[1] + 1):
    selected_mask = sel.ranking_ <= index

    X_train_rfe = X_train_values[:, selected_mask]
    X_test_rfe = X_test_values[:, selected_mask]

    selected_indices = np.flatnonzero(selected_mask)
    selected_columns = X_train.columns[selected_indices]

    print(f'Selected {index} Feature(s):')
    print('Indices:', selected_indices)
    print('Column Names:', list(selected_columns))

    run_randomForest(X_train_rfe, X_test_rfe, y_train, y_test)
    print()

Selected 1 Feature(s):
Indices: [27]
Column Names: ['worst concave points']
Accuracy:  0.8771929824561403

Selected 2 Feature(s):
Indices: [ 7 27]
Column Names: ['mean concave points', 'worst concave points']
Accuracy:  0.9035087719298246

Selected 3 Feature(s):
Indices: [ 7 23 27]
Column Names: ['mean concave points', 'worst area', 'worst concave points']
Accuracy:  0.9649122807017544

Selected 4 Feature(s):
Indices: [ 7 21 23 27]
Column Names: ['mean concave points', 'worst texture', 'worst area', 'worst concave points']
Accuracy:  0.9736842105263158

Selected 5 Feature(s):
Indices: [ 7 21 22 23 27]
Column Names: ['mean concave points', 'worst texture', 'worst perimeter', 'worst area', 'worst concave points']
Accuracy:  0.9649122807017544

Selected 6 Feature(s):
Indices: [ 7 13 21 22 23 27]
Column Names: ['mean concave points', 'area error', 'worst texture', 'worst perimeter', 'worst area', 'worst concave points']
Accuracy:  0.9912280701754386

Selected 7 Feature(s):
Indices: [ 7 13 