In [1]:
# inline plotting instead of popping out
%matplotlib inline

# python 3.7.3
import os

from IPython.display import display

# numpy  1.17.1
import numpy as np

# pandas  0.25.1
import pandas as pd 

# matplotlib  3.1.1
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# scikit-learn  0.21.3
from sklearn.datasets import load_wine, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
# Load the breast_cancer dataset once. The returned Bunch exposes the feature
# matrix and the labels as attributes, so a second
# load_breast_cancer(return_X_y=True) call is not needed.
# Column names, if wanted later, are available as init_data['feature_names'].
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

# Split X into training and testing sets: 30% held out, fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train a RandomForestClassifier on all features as the baseline model.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

# Score the held-out predictions once and reuse the value for both report lines.
forest_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % forest_accuracy)
# Test accuracy divided by the number of feature columns in X.
print('Accuracy per feature: %.2f' % (forest_accuracy / X.shape[1]))

Accuracy: 0.98
Accuracy per feature: 0.03


In [3]:
# --- Rank every feature by the importance the fitted forest assigned to it ---
print("\nall feature")
importances = forest.feature_importances_
# np.argsort sorts ascending; reversing it gives most-important-first order.
indices = np.argsort(importances)[::-1]
n_features = X_train.shape[1]
# The label column shows the feature's column index, left-justified to width 30.
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%2d) %-*s %f" % (rank, 30, feature_idx, importances[feature_idx]))

# --- Reduce X with the already-fitted forest (prefit=True) and a fixed threshold ---
sfm = SelectFromModel(forest, threshold=0.117, prefit=True)
Z_forest_alt = sfm.transform(X)
n_selected = Z_forest_alt.shape[1]
print("\nimportant feature")
# NOTE(review): this prints the first n_selected entries of the ranking above
# rather than asking sfm which columns it kept; presumably these coincide
# because selection is by importance threshold -- confirm.
for rank, feature_idx in enumerate(indices[:n_selected], start=1):
    print("%2d) %-*s %f" % (rank, 30, feature_idx, importances[feature_idx]))

# --- Train a second random forest on the reduced matrix Z_forest_alt ---
# Same test_size and random_state as the split used for the full feature set.
(Z_forest_train,
 Z_forest_test,
 y_forest_train,
 y_forest_test) = train_test_split(Z_forest_alt, y, test_size=0.3, random_state=0)

forest_forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=200,
    random_state=1,
    n_jobs=2,
)
forest_forest.fit(Z_forest_train, y_forest_train)

y_forest_pred = forest_forest.predict(Z_forest_test)
reduced_accuracy = accuracy_score(y_forest_test, y_forest_pred)
print('\nAccuracy (forest_forest): %.2f' % reduced_accuracy)
# Test accuracy divided by the number of columns kept by the selector.
print('Accuracy per feature: %.2f' % (reduced_accuracy / n_selected))


all feature
 1) 27                             0.141849
 2) 7                              0.117697
 3) 20                             0.110919
 4) 22                             0.105243
 5) 6                              0.077163
 6) 23                             0.073364
 7) 13                             0.041760
 8) 2                              0.036943
 9) 0                              0.035625
10) 26                             0.035205
11) 3                              0.032890
12) 21                             0.022736
13) 1                              0.018584
14) 25                             0.016786
15) 10                             0.015408
16) 28                             0.014770
17) 24                             0.014235
18) 5                              0.010085
19) 12                             0.010040
20) 29                             0.009015
21) 17                             0.008085
22) 15                             0.007420
23) 4              