# Ensemble Learning and Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [96]:
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [97]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

In [98]:
votting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("svm_clf", svm_clf)],
    voting="soft"
)

votting_clf.fit(X_train, y_train)

In [99]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, votting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 1.0
RandomForestClassifier 1.0
SVC 1.0
VotingClassifier 1.0


The votting classifier will have a higher accuracy than the individual models

In [100]:
import numpy as np

# Let's say our dataset is just samples 1 to 10
data = np.arange(1, 11)

def bagging_sample(data, sample_size):
    """Sampling WITH replacement (bagging)."""
    return np.random.choice(data, size=sample_size, replace=True)

def pasting_sample(data, sample_size):
    """Sampling WITHOUT replacement (pasting)."""
    return np.random.choice(data, size=sample_size, replace=False)

# Let's create 5 datasets of size 6
np.random.seed(42)  # for reproducibility
print("BAGGING (with replacement):")
for i in range(5):
    print(f"Set {i+1}: {bagging_sample(data, 6)}")

print("\nPASTING (without replacement):")
for i in range(5):
    print(f"Set {i+1}: {pasting_sample(data, 6)}")


BAGGING (with replacement):
Set 1: [ 7  4  8  5  7 10]
Set 2: [3 7 8 5 4 8]
Set 3: [8 3 6 5 2 8]
Set 4: [ 6  2  5  1 10  6]
Set 5: [ 9  1 10  3  7  4]

PASTING (without replacement):
Set 1: [6 4 2 7 1 8]
Set 2: [7 4 3 6 5 8]
Set 3: [5 9 2 4 1 6]
Set 4: [ 3  1  5 10  9  7]
Set 5: [5 3 8 1 7 4]


## Bagging with Sklearn

In [101]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

accuracy_score(y_test, y_pred)

1.0

By default the BaggingClassifier uses a soft decision, this is if the model has predict proba function.

### Out-Of-Bag Evaluation

In [102]:
bag_clf_oob = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1,
    oob_score=True
)

bag_clf_oob.fit(X_train, y_train)
bag_clf_oob.oob_score_

0.95

In [103]:
bag_clf_oob.oob_decision_function_

array([[1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.06103286, 0.93896714],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.03669725, 0.96330275],
       [0.        , 0.93534483, 0.06465517],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.70183486, 0.29816514],
       [0.        , 0.01086957, 0.98913043],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.

In [104]:
y_pred = bag_clf_oob.predict(X_test)

accuracy_score(y_test, y_pred)

1.0

## Random Forest

In [105]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

### Feature importance

In [106]:
rnd_clf_fi = RandomForestClassifier(n_estimators=500, n_jobs=-1)

rnd_clf_fi.fit(X, y)

for name, score in zip(iris["feature_names"], rnd_clf_fi.feature_importances_):
    print(name, score)

sepal length (cm) 0.10019819786691885
sepal width (cm) 0.02237086199760393
petal length (cm) 0.4280228790403335
petal width (cm) 0.4494080610951437


## Ada Boosting learning

In [107]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5
)

ada_clf.fit(X_train, y_train)

accuracy_score(y_test, ada_clf.predict(X_test))



0.9666666666666667

## Gradient Boosting

### Early Stopping

In [108]:
len(X_train), len(y_train)

(120, 120)

In [109]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_test, y_pred) for y_pred in gbrt.staged_predict(X_test)]
best_estimator = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_estimator)
gbrt_best.fit(X_train, y_train)

In [112]:
gbrt_es = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0

for n_estimators in range(1, 120):
    gbrt_es.n_estimators_ = n_estimators
    gbrt_es.fit(X_train, y_train)
    y_pred = gbrt_es.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)
    
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
        
    else:
        error_going_up += 1
        if error_going_up == 5:
            break
        
print(f"The optimal number of estimators was {gbrt_es.n_estimators_}")

The optimal number of estimators was 101
