# Ensemble Learning using RF 
We will be using tree-based ensemble methods on the [Covertype dataset](https://www.openml.org/d/180).

In [14]:
%matplotlib inline
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import openml
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
import seaborn as sns

In [4]:
covertype = openml.datasets.get_dataset(180)
X, y, _, _ = covertype.get_data(target=covertype.default_target_attribute, dataset_format='array'); 
features = [f.name for i,f in covertype.features.items()][:-1]
X = pd.DataFrame(X, columns=features)
classes = covertype.retrieve_class_labels()

In [5]:
X.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39,soil_type_40
0,2754.0,146.0,5.0,150.0,2.0,1790.0,227.0,239.0,146.0,700.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3219.0,21.0,8.0,67.0,-1.0,2869.0,215.0,223.0,145.0,1825.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2965.0,337.0,16.0,42.0,7.0,4288.0,184.0,217.0,171.0,324.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2368.0,14.0,15.0,150.0,65.0,1006.0,205.0,208.0,137.0,812.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2366.0,165.0,3.0,390.0,156.0,1165.0,222.0,240.0,154.0,582.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
classes

['Aspen',
 'Cottonwood_Willow',
 'Douglas_fir',
 'Krummholz',
 'Lodgepole_Pine',
 'Ponderosa_Pine',
 'Spruce_Fir']

## Exercise 1: Random Forests

Implement a function `evaluate_rf` that measures the performance of a Random Forest Classifier, using trees of (max) depth 2,8,32, for any number of trees in the ensemble (`n_estimators`). For each model, store the cross validation score based on k=3.


In [12]:
def evaluate_RF(X, y, estimators, max_depth=[2,4,8,16,32], scoring='accuracy'):
    result = {}
    x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.25)
    for depth in max_depth:
        rf = RandomForestClassifier(max_depth=depth, n_estimators=estimators)
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_test)
        result[depth] = metrics.accuracy_score(y_test, y_pred)
    return result

In [13]:
evaluate_RF(X, y, 10)

{2: 0.5369759773904852,
 4: 0.6053842530526469,
 8: 0.6928149570636617,
 16: 0.7756440450740969,
 32: 0.8457552809884417}

## Exercise 2: Feature importance
Retrieve the feature importances according to the random forest model. Which feature are most important?

In [15]:
rf = RandomForestClassifier(max_depth=16, n_estimators=15)
rf.fit(X,y)
rf.feature_importances_

array([3.06618275e-01, 3.47547164e-02, 2.68458398e-02, 4.15298808e-02,
       3.69216190e-02, 8.18503453e-02, 3.26386395e-02, 3.55754978e-02,
       3.29989975e-02, 7.74014869e-02, 1.98226525e-02, 1.02701565e-02,
       1.10884716e-02, 3.44148222e-02, 1.70125261e-03, 1.59741656e-02,
       2.17083675e-03, 2.18563315e-02, 7.42264925e-04, 4.63874698e-03,
       1.90504276e-07, 4.24165769e-06, 2.80476479e-05, 1.97679738e-02,
       2.10129683e-03, 2.14030149e-02, 4.66535108e-03, 4.40988048e-04,
       0.00000000e+00, 6.32602056e-04, 1.13423206e-03, 6.32121611e-04,
       3.01052711e-04, 1.22576968e-03, 2.90813868e-04, 3.33159414e-02,
       1.58198822e-02, 3.16032860e-03, 3.35631903e-04, 4.34506471e-04,
       3.88653262e-04, 2.72461932e-04, 6.39505605e-03, 3.74580847e-03,
       2.39244211e-03, 4.42824304e-03, 2.03956951e-03, 4.37538382e-04,
       2.22067318e-03, 2.24942862e-05, 4.21431945e-04, 1.53028896e-02,
       1.83743702e-02, 8.04938227e-03])

Plot the results.

In [21]:
pd.options.display.float_format = '{:,.3f}'.format

In [30]:
feature_imp = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values("Importance", ascending=False)
top_features = feature_imp.head(10)["Feature"].values

## Exercise 3: Feature selection
Re-build your tuned random forest, but this time only using the first 10 features.
Return both the accuracy and training time. Interpret the results.

In [32]:
# Model Solution
start = time.time()
score = evaluate_RF(X,y,25,max_depth=[32])
print("Normal RF: {:.2f} balanced ACC, {:.2f} seconds".format(score[32], (time.time()-start)))
start = time.time()
score = evaluate_RF(X[top_features],y,25,max_depth=[32])
print("Feature Selection RF: {:.2f} balanced ACC, {:.2f} seconds".format(score[32], (time.time()-start)))

Normal RF: 0.86 balanced ACC, 3.44 seconds
Feature Selection RF: 0.85 balanced ACC, 3.73 seconds


## Exercise 4: Confusion matrix
Do a standard stratified holdout and generate the confusion matrix of the tuned random forest. Which classes are still often confused?

In [33]:
# Model Solution
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=1)
tuned_forest = RandomForestClassifier(random_state=0, n_estimators=25, max_depth=32).fit(X_train, y_train)

In [35]:
metrics.accuracy_score(y_test, tuned_forest.predict(X_test))

0.8607920576832494

In [36]:
# Model Solution
confusion_matrix(y_test, tuned_forest.predict(X_test))

array([[ 8475,  1059,    41,    20,    30,    23,    79],
       [  661, 12032,    73,    22,    35,    67,    31],
       [   83,   167,  1510,     8,    10,    55,    11],
       [   81,   114,    39,    81,     4,    12,     4],
       [  103,   250,    19,     3,   260,    11,     7],
       [   89,   173,   113,     5,     2,   600,    10],
       [  173,   121,    14,     2,    10,     8,   799]])

In [38]:
print(classification_report(y_test, tuned_forest.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      9727
           1       0.86      0.93      0.90     12921
           2       0.83      0.82      0.83      1844
           3       0.57      0.24      0.34       335
           4       0.74      0.40      0.52       653
           5       0.77      0.60      0.68       992
           6       0.85      0.71      0.77      1127

    accuracy                           0.86     27599
   macro avg       0.79      0.65      0.70     27599
weighted avg       0.86      0.86      0.86     27599

