# voting classifier


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

x, y = make_moons(n_samples=500, noise=0.30, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.ensemble import VotingClassifier



In [33]:
#hard voting

log_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = SVC(random_state=42)

voting_clf = VotingClassifier([
    ("lr", log_clf), 
    ("rf", rf_clf),
    ("svm", svm_clf)
], voting="hard")

In [34]:
voting_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svm', SVC(random_state=42))])

In [35]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rf_clf, svm_clf, voting_clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.888
SVC 0.896
VotingClassifier 0.896


In [45]:
#soft voting

log_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = SVC(gamma="scale", probability=True,)

voting_clf = VotingClassifier([
    ("lr", log_clf), 
    ("rf", rf_clf),
    ("svm", svm_clf)
], voting="soft")

In [46]:
voting_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svm', SVC(probability=True))],
                 voting='soft')

In [47]:
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


# bagging and pasting

In [55]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()

bagging_clf = BaggingClassifier(dt_clf, n_estimators=500, max_samples=100, 
                                bootstrap=True, oob_score=True, n_jobs=-1, random_state=42)

In [56]:
bagging_clf.fit(x_train, y_train)

y_pred = bagging_clf.predict(x_test)

In [57]:
print(bagging_clf.__class__.__name__, accuracy_score(y_test, y_pred))

BaggingClassifier 0.904


In [58]:
dt_clf.fit(x_train, y_train)
print(accuracy_score(y_test, dt_clf.predict(x_test)))

0.856


In [60]:
bagging_clf.oob_score_

0.9253333333333333

random patches and random subspace

RP - instances and features are sampled

RS - only features are sampled

# random forest

In [66]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_leaf_nodes=16, n_jobs=-1, oob_score=True)

In [67]:
rf_clf.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', max_leaf_nodes=16, n_estimators=500,
                       n_jobs=-1, oob_score=True)

In [68]:
y_pred_rf = rf_clf.predict(x_test)

In [71]:
rf_clf.oob_score_

0.92

In [72]:
accuracy_score(y_test, y_pred_rf)

0.912

# extra trees


In [89]:
from sklearn.ensemble import ExtraTreesClassifier
extra_tree = ExtraTreesClassifier(n_estimators=500, criterion='entropy',
                                  max_leaf_nodes=16, n_jobs=-1, oob_score=True, bootstrap=True)


In [90]:
extra_tree.fit(x_train, y_train)
extra_tree.predict(x_test)

accuracy_score(y_test, extra_tree.predict(x_test))

0.92

In [92]:
extra_tree.feature_importances_

array([0.43556968, 0.56443032])

# adaboost

In [98]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                            n_estimators=56, 
                            algorithm="SAMME.R", learning_rate=0.6)

In [99]:
ada_clf.fit(x_train, y_train)

accuracy_score(y_test, ada_clf.predict(x_test))

0.896

# gradient boosting

In [110]:
np.random.seed(42)
x = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [111]:
from sklearn.tree import DecisionTreeRegressor
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(x,y)

DecisionTreeRegressor(max_depth=2)

In [112]:
y2 = y - tree_reg1.predict(x)

tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(x,y2)

DecisionTreeRegressor(max_depth=2)

In [113]:
y3 = y2 - tree_reg1.predict(x)

tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(x,y3)

DecisionTreeRegressor(max_depth=2)

In [125]:
x_new = np.array([[0.8]])

y_pred = sum(tree.predict(x_new) for tree in [tree_reg1, tree_reg2, tree_reg3])

In [126]:
y_pred

array([0.03991297])

In [100]:
from sklearn.ensemble import GradientBoostingRegressor

In [128]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3)

In [129]:
gbrt.fit(x,y)

GradientBoostingRegressor(max_depth=2, n_estimators=3)

In [130]:
gbrt.predict(x_new)

array([0.33682158])

## early stopping

2 ways for early stopping

1) using staged_predict function

2) actually stopping the training

1st point trains all the predictors and then goes back to the best no. of estimators

2nd will stop it the moment error goes up a threshold set 

### method 1

In [132]:
import numpy as np
from sklearn.metrics import mean_squared_error


In [140]:
x_train, x_val, y_train, y_val = train_test_split(x,y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)

gbrt.fit(x_train, y_train)

errors = [ mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(x_val)]

best_n_estimator = np.argmin(errors) + 1 # +1 coz it startes with 0 and we need estimators to be from 1,2,3...

In [142]:
best_n_estimator

63

In [143]:
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimator, random_state=42)
gbrt_best.fit(x_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=63, random_state=42)

In [146]:
mean_squared_error(y_val, gbrt_best.predict(x_val))

0.0014849381043079435

### method 2

In [163]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42, subsample=0.35)
min_val_error = 99999999

for n_estimator in range(1,120):
    gbrt.fit(x_train, y_train)
    y_pred = gbrt.predict(x_val)
    val_error = mean_squared_error(y_val, y_pred)
    
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up +=1
        if error_going_up ==6:
            break #early stopping

In [164]:
mean_squared_error(y_val, gbrt.predict(x_val))

0.001567585343517858

# xgboost

eXtreme gradient boost

In [165]:
from xgboost import XGBRegressor
xgboost = XGBRegressor()
xgboost.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [166]:
y_pred = xgboost.predict(x_val)

mean_squared_error(y_val, y_pred)

0.002512477478675241

xgboost also supports early stopping by default

In [167]:
xgboost.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=2)
y_pred = xgboost.predict(x_val)
mean_squared_error(y_val, y_pred)

[0]	validation_0-rmse:0.24338
[1]	validation_0-rmse:0.17052
[2]	validation_0-rmse:0.12695
[3]	validation_0-rmse:0.09644
[4]	validation_0-rmse:0.07358
[5]	validation_0-rmse:0.05962
[6]	validation_0-rmse:0.05146
[7]	validation_0-rmse:0.04660
[8]	validation_0-rmse:0.04362
[9]	validation_0-rmse:0.04228
[10]	validation_0-rmse:0.04087
[11]	validation_0-rmse:0.04041
[12]	validation_0-rmse:0.04030
[13]	validation_0-rmse:0.04009
[14]	validation_0-rmse:0.04083


0.0016076228551608353

In [171]:
%timeit xgboost.fit(x_train, y_train)

85.6 ms ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [172]:
%timeit GradientBoostingRegressor().fit(x_train, y_train)


36 ms ± 5.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
