In [6]:
# train a voting classifier composed of three diverse classifiers
# on the moons dataset
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# prepare dataset: noisy two-moons data, 80% for training and 20% for testing
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# three classifiers from different model families
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# majority-vote ('hard') ensemble built from the three classifiers
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)

# fit every individual classifier and then the ensemble, printing each one's
# test accuracy; the ensemble is fitted exactly once, inside this loop, so no
# separate voting_clf.fit call is needed beforehand
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

('LogisticRegression', 0.84150000000000003)
('RandomForestClassifier', 0.85050000000000003)
('SVC', 0.87050000000000005)
('VotingClassifier', 0.86450000000000005)


In [7]:
# train a bagging classifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# ensemble of 500 decision trees; max_samples=100 with bootstrap=True means
# each tree is fitted on 100 training instances drawn with replacement
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
# fit() returns the fitted ensemble, so prediction can be chained onto it
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8735


In [8]:
# out-of-bag (oob) evaluation of a bagging ensemble: with oob_score=True the
# fitted ensemble exposes oob_score_, printed here before the test-set accuracy
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
)
bag_clf.fit(X_train, y_train)
print(bag_clf.oob_score_)

# accuracy on the held-out test set, for comparison with the oob estimate
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.83775
0.85


In [9]:
# show the out-of-bag decision function of the ensemble fitted with
# oob_score=True; presumably one row of per-class probabilities per training
# instance (the displayed rows have two columns summing to 1) — TODO confirm
bag_clf.oob_decision_function_

array([[ 0.90740741,  0.09259259],
       [ 0.99462366,  0.00537634],
       [ 0.28654971,  0.71345029],
       ..., 
       [ 0.        ,  1.        ],
       [ 0.        ,  1.        ],
       [ 0.68648649,  0.31351351]])

In [10]:
# train a Random Forest model
from sklearn.ensemble import RandomForestClassifier
# forest of 500 trees, each capped at 16 leaf nodes; n_jobs=-1 runs in parallel
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
# fit() returns the fitted forest, so prediction can be chained onto it
y_pred = rnd_clf.fit(X_train, y_train).predict(X_test)
# last expression of the cell: test-set accuracy
accuracy_score(y_test, y_pred)

0.871

In [11]:
# a BaggingClassifier that is roughly equivalent to a Random Forest:
# each tree is grown on a full-size bootstrap sample (max_samples=1.0) and
# searches for the best split among a random subset of sqrt(n_features)
# features at every node. (splitter="random" would instead pick random split
# thresholds over all features, which is Extra-Trees-like behaviour rather
# than what RandomForestClassifier does.)
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
# last expression of the cell: test-set accuracy
accuracy_score(y_test, y_pred)

0.87250000000000005

In [12]:
# train a RandomForestClassifier on the Iris data and output the feature importances
from sklearn.datasets import load_iris
iris = load_iris()
# fit a 500-tree forest on the full Iris dataset (no train/test split here)
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

# print each feature name next to the importance the fitted forest assigned it
for name, score in zip(iris["feature_names"],
                       rnd_clf.feature_importances_):
    print(name, score)

('sepal length (cm)', 0.10165709648146497)
('sepal width (cm)', 0.024862293533442877)
('petal length (cm)', 0.42774572457119925)
('petal width (cm)', 0.44573488541389289)


In [13]:
# train an AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# boost 200 depth-1 decision trees (stumps) with a learning rate of 0.5
# NOTE(review): recent scikit-learn releases deprecate algorithm="SAMME.R" —
# confirm against the installed version
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
    algorithm="SAMME.R",
)
# fit() returns the fitted ensemble, so prediction can be chained onto it
y_pred = ada_clf.fit(X_train, y_train).predict(X_test)
# last expression of the cell: test-set accuracy
accuracy_score(y_test, y_pred)

0.86099999999999999

In [14]:
# train a Gradient Boosted Regression Tree (GBRT) ensemble by hand
from sklearn.tree import DecisionTreeRegressor
# stage 1: a shallow regression tree fitted directly on the training targets
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

# stage 2: a second tree fitted on the residuals left by the first one
y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)

# stage 3: a third tree fitted on the residuals left by the second one
y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3)

# the ensemble prediction is the sum of the three trees' predictions
y_pred = sum(
    stage.predict(X_test)
    for stage in (tree_reg1, tree_reg2, tree_reg3)
)
y_pred

array([ 0.93411043,  0.63350556, -0.10387394, ...,  0.04410701,
        0.04410701,  0.19794765])

In [17]:
# train a GBRT model using scikit-learn's GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# three depth-2 trees with learning_rate=1.0 (same tree count and depth as
# the hand-built three-tree ensemble)
gbrt = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=2,
    learning_rate=1.0,
)
# fit() returns the fitted model, so prediction can be chained onto it
y_pred = gbrt.fit(X_train, y_train).predict(X_test)
y_pred

array([ 0.94642182,  0.77056455,  0.15042952, ...,  0.03873568,
        0.03873568,  0.31215112])

In [18]:
# find the optimal number of trees for a GBRT using staged_predict
import numpy as np
from sklearn.metrics import mean_squared_error

# NOTE: this re-splits the full moons dataset and rebinds X_train / y_train,
# adding a validation set (X_val, y_val)
X_train, X_val, y_train, y_val = train_test_split(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

# staged_predict yields the validation predictions after 1, 2, ..., 120 trees,
# so errors[i] is the validation MSE of an ensemble made of i + 1 trees
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
# argmin is a 0-based position; add 1 to turn it into a tree count (this also
# guarantees n_estimators >= 1 when the very first stage is the best one)
bst_n_estimators = int(np.argmin(errors)) + 1

# refit a fresh model with exactly the best number of trees
gbrt_best = GradientBoostingRegressor(max_depth=2,
                                      n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)
print(bst_n_estimators)

105


In [19]:
# finding the optimal number of trees in GBRT using early stopping
# warm_start=True lets each fit() call keep the trees already trained and only
# add the new ones requested through n_estimators
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf")
best_n_estimators = 0  # tree count that produced min_val_error
error_going_up = 0     # consecutive iterations without improvement
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        best_n_estimators = n_estimators
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping: 5 iterations in a row without improvement
# report the tree count with the lowest validation error, not the loop counter
# at the break (which is 5 trees past the optimum); gbrt itself still holds
# the trees from the last fit
print(best_n_estimators)

75


In [4]:
# exercise 8
from sklearn.datasets import fetch_mldata
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
# NOTE(review): fetch_mldata is gone from recent scikit-learn releases
# (fetch_openml replaces it) — confirm against the installed version
mnist = fetch_mldata('MNIST original')
X, y = mnist["data"], mnist["target"]

# first 50,000 rows -> training, next 10,000 -> validation, remainder -> test
X_train, y_train = X[:50000], y[:50000]
X_val, y_val = X[50000:60000], y[50000:60000]
X_test, y_test = X[60000:], y[60000:]

# boolean target vectors for the binary task: True where the label equals 3
y_train_3 = (y_train == 3)
y_test_3 = (y_test == 3)

# two tree ensembles plus a linear SVM that scales its inputs first
rnd_clf = RandomForestClassifier(n_estimators=200, max_leaf_nodes=16,
                                 n_jobs=-1)
ext_clf = ExtraTreesClassifier(n_estimators=200, max_leaf_nodes=16,
                               n_jobs=-1)
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=1, loss="hinge")),
])

# majority-vote ('hard') ensemble of the three models
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[('rnd', rnd_clf), ('ext', ext_clf), ('svm', svm_clf)],
)

# fit each model on the training set and print its accuracy on the test set
for clf in (rnd_clf, ext_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train_3).predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test_3, y_pred))


('RandomForestClassifier', 0.95620000000000005)
('ExtraTreesClassifier', 0.9496)
('Pipeline', 0.9758)
('VotingClassifier', 0.95809999999999995)
