
# Ensemble Learning using RF 


In [14]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

In [2]:
data = pd.read_csv("diabetes.csv")
X = data.drop(columns=["Outcome"])
y = data["Outcome"]

In [3]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [4]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

## Exercise 1: Random Forests

Implement a function `evaluate_rf` that measures the performance of a Random Forest Classifier, using trees of (max) depth 2,8,32, for any number of trees in the ensemble (`n_estimators`). For each model, store the cross validation score based on k=3.


In [6]:
def evaluate_RF(X, y, n_estimators, max_depth=[2**i for i in range(8)], scoring='accuracy'):
    result = {}
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    for depth in max_depth:
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=depth)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        result[depth] = accuracy_score(y_test, y_pred)
    return result

In [7]:
evaluate_RF(X, y, 100)

{1: 0.6458333333333334,
 2: 0.6875,
 4: 0.6927083333333334,
 8: 0.6979166666666666,
 16: 0.703125,
 32: 0.6875,
 64: 0.7083333333333334,
 128: 0.6979166666666666}

## Exercise 2: Feature importance
Retrieve the feature importances according to the (tuned) random forest model. Which feature are most important?

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
model = RandomForestClassifier(n_estimators=100, max_depth=16)
model.fit(X_train, y_train)
model.feature_importances_

array([0.08407051, 0.25635884, 0.09063843, 0.0769344 , 0.0702761 ,
       0.16512616, 0.13036338, 0.12623217])

Plot the results.

In [10]:
pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": model.feature_importances_
}).sort_values("Importance")

Unnamed: 0,Feature,Importance
4,Insulin,0.070276
3,SkinThickness,0.076934
0,Pregnancies,0.084071
2,BloodPressure,0.090638
7,Age,0.126232
6,DiabetesPedigreeFunction,0.130363
5,BMI,0.165126
1,Glucose,0.256359


## Exercise 3: Feature selection
Re-build your tuned random forest, but this time only using the first 5 features.
Return both the balanced accuracy and training time. Interpret the results.

In [13]:
cols = ["Glucose", "BMI", "DiabetesPedigreeFunction", "Age", "BloodPressure"]
X_new = X[cols].copy()

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25)
model = RandomForestClassifier(n_estimators=100, max_depth=16)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.765625

## Exercise 4: Confusion matrix
Do a standard stratified holdout and generate the confusion matrix of the tuned random forest. Which classes are still often confused?

In [15]:
confusion_matrix(y_test, y_pred)

array([[110,  20],
       [ 25,  37]])

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       130
           1       0.65      0.60      0.62        62

    accuracy                           0.77       192
   macro avg       0.73      0.72      0.73       192
weighted avg       0.76      0.77      0.76       192



<b> What are the conclusions based on the f1-score?</b>

## Exercise 5: Compare

Choose an additional supervised learning algorithm and compare the result. Which one is better?


In [19]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# gbrt = GradientBoostingClassifier(random_state=0)
# gbrt.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy_score(y_test, y_pred)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Insulin
- Pregnancies
- SkinThickness


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25)
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9166666666666666