In [2]:
import os
import pandas as pd

dataset_path = os.path.join(os.curdir, "datasets")
train_df = pd.read_csv(os.path.join(dataset_path, "BostonHouseTrain.csv"), index_col=0)
test_df = pd.read_csv(os.path.join(dataset_path, "BostonHouseTest.csv"), index_col=0)

X_train = train_df.values[:,:-1]
y_train = train_df.values[:,-1]
X_test = test_df.values[:,:-1]
y_test = test_df.values[:,-1]

train_df.head(5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,class
352,0.0795,60.0,1.69,0,0.411,6.579,35.9,10.7103,4,411,18.3,370.78,5.49,high
351,0.06211,40.0,1.25,0,0.429,6.49,44.4,8.7921,1,335,19.7,396.9,5.98,high
316,0.25356,0.0,9.9,0,0.544,5.705,77.7,3.945,4,304,18.4,396.42,11.5,low
285,0.00906,90.0,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85,high
393,11.5779,0.0,18.1,0,0.7,5.036,97.0,1.77,24,666,20.2,396.9,25.68,low


A simple weak learner (DecisionTreeClassifier)
---

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

wl = DecisionTreeClassifier(random_state=0, max_depth=2)
wl.fit(X_train, y_train)

y_train_pred = wl.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)

y_test_pred = wl.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)

print(f"The accuracy of the weak learner on the training set is {acc_test:.4f}")

The accuracy of the weak learner on the training set is 0.7830


Bootstrap Aggregating Ensemble learner
---

In [34]:
import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

class BootstrapAggLearner:
    def __init__(self, learner, n_bootstrap):
        self.learner = learner
        self.n_bootstrap = n_bootstrap 
        self.learners = [clone(self.learner) for _ in range(self.n_bootstrap)]
    
    def fit(self, X, y):
        assert len(X) == len(y), f"Not same size for X :: {X.shape} and y :: {y.shape}"
        indices = np.arange(len(X))
        for idx in range(self.n_bootstrap):
            bootstrap_indices = np.random.choice(indices, replace=True, size=len(X))
            Xt, yt = X[bootstrap_indices], y[bootstrap_indices] 
            self.learners[idx].fit(Xt, yt)
        return self
    
    def predict(self, X):
        y_hat_ensemble = list(zip(*[learner.predict(X) for learner in self.learners]))
        y_hat = np.array([max(set(y_ensemble), key=lambda x: y_ensemble.count(x)) for y_ensemble in y_hat_ensemble])
        return y_hat
    
wl = DecisionTreeClassifier(random_state=0, max_depth=2)
N_BOOTSTRAP = 100
N_RUNS = 10

test_accur = [.0]*N_RUNS
for test_id in range(len(test_accur)):
    clf = BootstrapAggLearner(wl, N_BOOTSTRAP)
    clf.fit(X_train, y_train)
    
    y_test_pred = clf.predict(X_test)
    test_accur[test_id] = accuracy_score(y_test, y_test_pred)
mean_test_accur = np.mean(test_accur)

Random Forest Ensemble learner
---

#### First, let's make a Randomized search


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in range(200,2000,200)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

y_test_pred = rf_random.best_estimator_.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)

print(f"The accuracy of the test set with the best estimator give us :: {acc_test:.4f}")
print(f"best params :: {rf_random.best_params_}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   23.8s finished


The accuracy of the test set with the best estimator give us :: 0.8585
best params :: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': True}


#### Let's try a grid search

In [43]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,15],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5,6],
    'min_samples_split': [3,4,5,6],
    'n_estimators': [1150, 1200, 1250, 1300,1350]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

y_test_pred = grid_search.best_estimator_.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)

print(f"The accuracy of the test set with the best estimator give us :: {acc_test:.4f}")
print(grid_search.best_params_)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 600 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  1.7min finished


The accuracy of the test set with the best estimator give us :: 0.8491
{'bootstrap': True, 'max_depth': 15, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 1300}
