In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [2]:
X, y = make_moons(n_samples = 150, noise = 0.2, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [3]:
# Voting Classifiers
log_clf = LogisticRegression(random_state = 42)
rnd_clf = RandomForestClassifier(random_state = 42)
svm_clf = SVC(random_state = 42)

voting_clf = VotingClassifier(estimators = [("lr", log_clf),
                                            ("rf", rnd_clf),
                                            ("svc", svm_clf)],
                              voting = "hard")

voting_clf.fit(X_train, y_train)



In [4]:
from sklearn.metrics import accuracy_score


for clf in [log_clf, rnd_clf, svm_clf, voting_clf]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))


LogisticRegression 0.8333333333333334
RandomForestClassifier 0.9
SVC 0.9
VotingClassifier 0.9


In [5]:
svm_clf_proba = SVC(probability = True, random_state = 42)
voting_clf_soft = VotingClassifier(estimators = [("lr", log_clf),
                                                 ("rf", rnd_clf),
                                                 ("svc", svm_clf_proba)],
                                   voting = "soft")


In [6]:
for clf in [log_clf, rnd_clf, svm_clf, svm_clf_proba, voting_clf_soft]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8333333333333334
RandomForestClassifier 0.9
SVC 0.9
SVC 0.9
VotingClassifier 0.9


In [7]:
# Bagging and Pasting
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state = 42), n_estimators = 500, max_samples = 100, bootstrap = True, n_jobs = -1)
dec_tree_clf = DecisionTreeClassifier(random_state = 42)

bag_clf.fit(X_train, y_train)
dec_tree_clf.fit(X_train, y_train)

In [8]:
y_pred = bag_clf.predict(X_test)
y_tree_pred = dec_tree_clf.predict(X_test)

In [9]:
score_ensemble = accuracy_score(y_test, y_pred)
score_tree = accuracy_score(y_test, y_tree_pred)
print("Ensemble: ", score_ensemble)
print("Tree: ", score_tree)

Ensemble:  0.9
Tree:  0.9


In [10]:
# Out of Bag Evaluation
new_bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state = 42),
                                n_estimators = 500,
                                max_samples = 100,
                                bootstrap = True, 
                                n_jobs = -1, 
                                oob_score = True)


new_bag_clf.fit(X_train, y_train)

In [11]:
new_bag_clf.oob_score_

0.9083333333333333

In [12]:
y_pred = new_bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8666666666666667

In [13]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier

rnd_forest = RandomForestClassifier(n_estimators = 500,
                                    max_leaf_nodes = 16,
                                    n_jobs = -1)

rnd_forest.fit(X_train, y_train)

In [14]:
# Feature Importance
from sklearn.datasets import load_iris

iris = load_iris()

rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])



for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, ":", score)

sepal length (cm) : 0.10204937140932638
sepal width (cm) : 0.02457171450977865
petal length (cm) : 0.42660459793186856
petal width (cm) : 0.4467743161490264


In [15]:
# Boosting
# AdaBoost

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), 
                            n_estimators=200, 
                            algorithm="SAMME.R", 
                            learning_rate=0.5)




In [16]:
ada_clf.fit(X_train, y_train)
y_pred_ada = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred_ada)

0.9333333333333333

In [17]:
# Gradient Boosting

import numpy as np
np.random.seed(42)
# Creating a quadratic data set
X = np.random.rand(100, 1) - 0.5
y = 3 * X[ : , 0] ** 2 + 0.05 * np.random.randn(100)

In [18]:
# Gradient Boosting

from sklearn.tree import DecisionTreeRegressor


tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X)

tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=43)
tree_reg2.fit(X, y2)

y3 = y2 - tree_reg2.predict(X)

tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=44)
tree_reg3.fit(X, y3)

X_new = np.array([[-0.4], [0.], [0.5]])
y_pred = sum(tree.predict(X_new) for tree in [tree_reg1, tree_reg2, tree_reg3])
y_pred





array([0.49484029, 0.04021166, 0.75026781])

In [19]:
# Finding the Optimal Number of Trees

# We will use early stopping to find optimal number
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_test, y_pred) for y_pred in gbrt.staged_predict(X_test)]

best_n_estimator = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimator)
gbrt_best.fit(X_train, y_train)

print(best_n_estimator)

93


In [23]:
# Stacking

from sklearn.ensemble import StackingClassifier

X, y = make_moons(n_samples=150, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [25]:
stacking_clf = StackingClassifier(estimators=[("lr", LogisticRegression(random_state=42)),
                                             ("rf", RandomForestClassifier(random_state=42),
                                             ("svc", SVC(probability=True, random_state=42)))],
                                 final_estimator=RandomForestClassifier(random_state=43),
                                 cv = 5)
stacking_clf.fit(X_train, y_train)

In [26]:
stacking_clf.score(X_test, y_test)

0.9