# # First we import everything we want

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

%matplotlib

Using matplotlib backend: Qt5Agg


Now we load the data. For simplicity, I will use the breast cancer dataset that ships with sklearn.

In [2]:
# Load the breast cancer dataset bundled with scikit-learn.
cancer = datasets.load_breast_cancer()

In [3]:
# Inspect which fields the loaded dataset object exposes.
cancer.keys()

dict_keys(['data', 'DESCR', 'target', 'target_names', 'feature_names'])

Creating an array with the names of all the features

In [4]:
# Keep the feature names so importances can be labelled later on.
flabel = cancer.feature_names

In [5]:
# Display the feature names.
flabel

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'], 
      dtype='<U23')

In [6]:
# Split the data into training and test sets, holding out 40% for testing;
# the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.4,
    random_state=0,
)

# Random forest with 10,000 trees, seeded for reproducibility and using all
# available cores (n_jobs=-1).
rfc = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)

# Fit on the training split; as the last expression of the cell, the fitted
# estimator's repr is displayed.
rfc.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [8]:
# Print each feature name alongside its importance in the fitted forest.
for name, importance in zip(flabel, rfc.feature_importances_):
    print((name, importance))

('mean radius', 0.033519299298114778)
('mean texture', 0.011133822946841997)
('mean perimeter', 0.045196071613029826)
('mean area', 0.032948877351153219)
('mean smoothness', 0.0056899831101316313)
('mean compactness', 0.015269673488826593)
('mean concavity', 0.075572732341672541)
('mean concave points', 0.14384835749783342)
('mean symmetry', 0.0043408124779590234)
('mean fractal dimension', 0.0037638088880767775)
('radius error', 0.014930707151808281)
('texture error', 0.0047124860543599646)
('perimeter error', 0.01357063242520368)
('area error', 0.035520238576549096)
('smoothness error', 0.0040941258416534558)
('compactness error', 0.0047738756106799505)
('concavity error', 0.0063566142114096372)
('concave points error', 0.0063313867802081867)
('symmetry error', 0.0032979925004250057)
('fractal dimension error', 0.005155795406384495)
('worst radius', 0.084468577642723641)
('worst texture', 0.012876687224029225)
('worst perimeter', 0.11386756389229268)
('worst area', 0.0853177899566041

In [9]:
# Evaluate the forest trained on all features against the held-out test split.
predictions = rfc.predict(X_test)

# accuracy_score expects (y_true, y_pred): ground-truth labels first,
# predictions second.
accuracy_score(y_test, predictions)

0.94298245614035092

Now that we have seen the score with all the features, let's see how accurate the model is when it is trained on only the most important features.


In [10]:
from sklearn.feature_selection import SelectFromModel

In [14]:
# Wrap the random forest in a selector that keeps only the features whose
# importance is at or above the 0.035 threshold.
sfm = SelectFromModel(estimator=rfc, threshold=0.035)

# Fit the selector on the training split.
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        prefit=False, threshold=0.035)

In [15]:
# List the names of the features the selector kept.
for name in flabel[sfm.get_support(indices=True)]:
    print(name)

mean perimeter
mean concavity
mean concave points
area error
worst radius
worst perimeter
worst area
worst concavity
worst concave points


As we can see, only the features listed above have an importance of 0.035 or higher.

In [16]:
# Reduce both splits to the selected columns. The same fitted selector is
# applied to the training and the test features so their columns stay aligned.
X_important_train, X_important_test = (
    sfm.transform(split) for split in (X_train, X_test)
)

In [17]:
# Second forest with the same hyperparameters, to be trained only on the
# selected features.
rfcnew_important = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)

# Fit on the reduced training matrix.
rfcnew_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
# Predict on the reduced test matrix with the classifier that was trained on
# the selected features only.
imp_pred = rfcnew_important.predict(X_important_test)

# Accuracy of the reduced-feature model on the held-out test split.
accuracy_score(y_true=y_test, y_pred=imp_pred)

0.94298245614035092

In [20]:
# Shape of the reduced training matrix: (rows, selected features).
X_important_train.shape

(341, 9)

In [21]:
# Shape of the original training matrix, for comparison.
X_train.shape

(341, 30)

# As we can now see, with only the top 9 features (out of 30) the accuracy remains the same, while the model has far fewer features to process, which reduces the computational cost.