# Ensembles Random Forest
A demonstration of two facilities of the `scikit-learn` Random Forest implementation: feature-importance scores and the out-of-bag (OOB) estimate of generalisation accuracy.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from statistics import mean

In [None]:
# Read the wine dataset from the CSV file next to the notebook and preview
# its first five rows.
wine_pd = pd.read_csv(filepath_or_buffer='Wine.csv')
wine_pd.head(5)

Random forest uses decision trees so there is no need to normalise the data.

In [None]:
# Split the frame into the label vector `y` and the feature matrix `X`.
# `pop` removes the 'class' column from wine_pd in place, so after this cell
# wine_pd holds only the remaining columns (later cells read
# wine_pd.columns to get the feature names).
# The membership test makes the cell safe to re-run: an unconditional pop
# raises KeyError on a second execution because the column is already gone.
if 'class' in wine_pd.columns:
    y = wine_pd.pop('class').values
elif 'y' not in globals():
    # First execution and no label column: fail here rather than later.
    raise KeyError("'class' column not found in wine_pd")
X = wine_pd.values
X.shape

The following code suppresses all warnings. We shouldn't really do this,  
but the experiment below starts by training an RF with too few trees,  
which produces some warnings.

In [None]:
import warnings

# Install a blanket "ignore" filter: every warning category, from every
# module, is silenced for the rest of the session.
warnings.simplefilter("ignore")

### Generalisation Accuracy
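Comparing the RF OOB estimate of generalisation accuracy with the estimate obtained using hold-out testing.  
The OOB estimate gets better as more ensemble members are used.  
Each ensemble size is evaluated 50 times on different random train/test splits and the accuracies are averaged. 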

In [None]:
# Compare the out-of-bag (OOB) accuracy estimate with a hold-out (HO)
# estimate for forests of increasing size.  For every ensemble size in
# `nest_range` the experiment is repeated `nreps` times on a fresh random
# train/test split and the two accuracies are averaged.
#
# Results: oob_res_dict / ho_res_dict map n_estimators -> mean accuracy.
nreps = 50
nest_range = range(10, 110, 10)

# A single seeded generator drives both the splits and the forests, so the
# whole experiment is reproducible under Restart & Run All while each
# repetition still draws a different split and a different forest.
rng = np.random.RandomState(42)

oob_res_dict = {}
ho_res_dict = {}
for nest in nest_range:
    oob_accs = []
    ho_accs = []
    for r in range(nreps):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=rng)
        RFoob = RandomForestClassifier(n_estimators=nest, oob_score=True,
                                       random_state=rng)
        RFoob.fit(X_train, y_train)

        # OOB estimate, computed by the forest from the training data alone.
        oob_est = RFoob.oob_score_
        oob_accs.append(oob_est)

        # Hold-out estimate on the 33% of the data the forest never saw.
        y_pred = RFoob.predict(X_test)
        # accuracy_score's documented signature is (y_true, y_pred).
        ho_est = accuracy_score(y_test, y_pred)
        ho_accs.append(ho_est)

    oob_mean = mean(oob_accs)
    ho_mean = mean(ho_accs)
    print(oob_mean, ho_mean)
    oob_res_dict[nest] = oob_mean
    ho_res_dict[nest] = ho_mean

In [None]:
# Plot the mean OOB and hold-out accuracies against ensemble size and save
# the figure as a PDF.
fig, ax = plt.subplots(figsize=(5,4))

# Look the means up by ensemble size so each point is paired with its x value.
oob_means = [oob_res_dict[n] for n in nest_range]
ho_means = [ho_res_dict[n] for n in nest_range]

ax.plot(nest_range, oob_means, lw = 2, color = 'r', label = 'OOB Estimate')
ax.plot(nest_range, ho_means, lw = 2, color = 'orange', label = 'HO Estimate')

ax.set_xlabel("Number of estimators")
ax.set_ylabel("Accuracy")
ax.set_ylim([0.9,1])
ax.legend(loc = 'upper left')
ax.grid(axis = 'y')
fig.savefig('OOB-acc.pdf')

### Feature Importance
Generating RF feature importance scores for the wine dataset. 

In [None]:
# Show the column labels left in wine_pd; the 'class' column was popped out
# earlier, so these are the feature names.
wine_pd.columns

In [None]:
# Train the forest used for the feature-importance scores.
# The training split is drawn here with a fixed seed instead of reusing the
# X_train / y_train left over from the last iteration of the experiment
# loop: that leftover split is random, so the importances would change on
# every run even though the forest itself is seeded.
X_fi_train, X_fi_test, y_fi_train, y_fi_test = train_test_split(
    X, y, test_size=0.33, random_state=0)
RFoob = RandomForestClassifier(n_estimators = 100, oob_score = True, random_state=0)
RFoob.fit(X_fi_train, y_fi_train)

In [None]:
# Display the fitted forest's importance scores; the bar chart below plots
# them against the feature names, one score per feature.
RFoob.feature_importances_

In [None]:
# Bar chart of the forest's feature-importance scores, saved as a PDF.
names = list(wine_pd.columns)
# Replace the label at position 11 with a short form for the x axis
# (presumably the original column name is longer -- verify against Wine.csv).
names[11] = 'OD280/OD315'
y_pos = np.arange(len(names))

f, ax = plt.subplots(figsize=(5,4))
ax.bar(
    y_pos,
    RFoob.feature_importances_,
    align='center',
    width = 0.3,
    alpha=0.5,
    label = "Feature",
)
# One tick per feature, labelled vertically so the names do not overlap.
ax.set_xticks(y_pos)
ax.set_xticklabels(names, rotation=90)
ax.set_ylabel('Feature Importance')
f.savefig('RF-FI.pdf',bbox_inches='tight')