In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()

X_train = std.fit_transform(X_train)
X_test = std.fit_transform(X_test)

In [12]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                             

In [13]:
from sklearn.metrics import accuracy_score as acc_sc

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, acc_sc(y_test, y_pred))

LogisticRegression 0.9666666666666667
RandomForestClassifier 0.9666666666666667
SVC 0.9666666666666667
VotingClassifier 0.9666666666666667


In [14]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500, max_samples=100,
                            bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

acc_sc(y_pred, y_test)

0.9666666666666667

In [8]:
y_pred

array([0, 2, 1, 2, 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 1, 1, 2, 2, 2, 0, 1,
       1, 0, 0, 2, 2, 2, 1, 0])

In [15]:
bag_clf.oob_score_

0.9666666666666667

In [16]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

acc_sc(y_pred_rf, y_test)

0.9666666666666667

In [37]:
from sklearn.datasets import load_diabetes
import pandas as pd

diabetes_raw = load_diabetes()
diabetes = pd.DataFrame(diabetes_raw.data, columns=diabetes_raw.feature_names)

X, y = diabetes.iloc[:, :4], diabetes['s6']
X_train, X_test, y_train, y_test = train_test_split(X, y, 0.2)

TypeError: ignored

In [41]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [45]:
y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [47]:
y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [48]:
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

y_pred

array([-0.00220154,  1.9835114 ,  1.99937161,  1.10668193,  1.10668193,
       -0.00220154,  0.98627824, -0.00220154,  1.00213845, -0.00220154,
       -0.10674502, -0.00220154, -0.00220154,  1.9835114 ,  1.11871066,
        1.00213845, -0.00220154,  1.99937161,  1.9835114 , -0.00220154,
        0.98627824,  1.9835114 ,  1.00213845,  1.99937161, -0.00220154,
       -0.00220154,  2.00288158,  1.00213845,  2.00288158,  1.9835114 ])

In [51]:
from sklearn.metrics import mean_squared_error
import numpy as np

mean_squared_error(y_pred, y_test)

0.035194414981196305

In [52]:
mean_squared_error(tree_reg1.predict(X_test), y_test)

0.030489341397386332

In [53]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [54]:
mean_squared_error(gbrt.predict(X_test), y_test)

0.03519441498119628

In [63]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_test, y_pred) for y_pred in gbrt.staged_predict(X_test)]

import matplotlib.pyplot as plt

min(errors)

0.04095569257106221

In [65]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
  gbrt.n_estimators = n_estimators
  gbrt.fit(X_train, y_train)
  y_pred = gbrt.predict(X_test)
  error = mean_squared_error(y_pred, y_test)
  if error < min_val_error:
    min_val_error = error
    error_going_up = 0
  else:
    error_going_up += 1
    if error_going_up == 5: break

In [66]:
mean_squared_error(gbrt.predict(X_test), y_test)

0.04460414226008028

In [70]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
mean_squared_error(xgb_reg.predict(X_test), y_test)



0.05315018068716514