In [32]:
import numpy as np
import xgboost

from sklearn.datasets import load_iris
from sklearn.datasets import make_moons
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Synthetic two-moons dataset (100 samples) with an 80/20 train/test split.
# Both calls are seeded so that every rerun of the notebook works on the same
# samples and the same partition, keeping the reported scores comparable.
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [2]:
# VOTING CLASSIFIERS

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# Hard voting: the ensemble predicts the class chosen by most of its members.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

# Fit every individual model and the ensemble exactly once, then compare
# their accuracy on the test split. voting_clf is fitted inside this loop,
# so a separate fit call beforehand would only repeat the training.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [24]:
# BAGGING AND PASTING

# Ensemble of 500 decision trees, each trained on 50 instances sampled with
# replacement (bootstrap=True); n_jobs=-1 uses all available CPUs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=50,
    bootstrap=True,
    n_jobs=-1,
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

In [25]:
# OUT-OF-BAG EVALUATION

# With oob_score=True the fitted ensemble exposes oob_score_, an accuracy
# estimate obtained without a separate validation set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
).fit(X_train, y_train)
bag_clf.oob_score_

0.925

# RANDOM FOREST

In [26]:
# Random forest of 500 trees, each capped at 16 leaf nodes, trained using all
# CPUs. RandomForestClassifier is already imported in the notebook's import
# cell, so it is not imported again here.
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [29]:
# FEATURE IMPORTANCE

# Train a forest on the iris dataset and print the importance score it
# assigns to each input feature.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

# Random forests are useful for understanding which features actually matter

sepal length (cm) 0.07882503067494859
sepal width (cm) 0.021573175271866203
petal length (cm) 0.46402009210202627
petal width (cm) 0.43558170195115886


# BOOSTING

In [32]:
#### AdaBoost (Adaptive Boosting) ####

# Trains predictors sequentially, each one paying more attention to the
# observations that its predecessor fitted poorly.

# Base learner: decision stumps (max_depth=1), 200 boosting rounds.
# The `algorithm` argument is intentionally left at the library default:
# the estimator repr printed by this cell omits it, i.e. "SAMME.R" was the
# default when the notebook was run, and recent scikit-learn releases reject
# that value, so hard-coding it only breaks portability.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    learning_rate=0.5)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [26]:
#### GRADIENT BOOSTING ####

# Each new predictor is fitted to the residual error left by the previous one

# Manual version with decision trees: three depth-2 regressors, the second
# and third trained on the residuals of the stage before them.
tree_reg1 = DecisionTreeRegressor(max_depth=2).fit(X, y)

y2 = y - tree_reg1.predict(X)  # residual after the first tree
tree_reg2 = DecisionTreeRegressor(max_depth=2).fit(X, y2)

y3 = y2 - tree_reg2.predict(X)  # residual after the second tree
tree_reg3 = DecisionTreeRegressor(max_depth=2).fit(X, y3)

# Build a single new sample (the first row shifted by the per-feature means)
# and predict it by adding up the predictions of the three trees.
media0 = X[:, 0].mean()
media1 = X[:, 1].mean()
X_new = [[X[0, 0] + media0, X[0, 1] + media1]]

y_pred = sum([reg.predict(X_new) for reg in (tree_reg1, tree_reg2, tree_reg3)])

# USING SCIKIT-LEARN
# Same configuration as the manual ensemble: 3 trees of depth 2, learning rate 1.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [28]:
# FINDING THE OPTIMAL NUMBER OF TREES

# Seeded split so that the selected tree count is reproducible across reruns.
# Note: this rebinds X_train / y_train, replacing the earlier train/test split.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

# staged_predict yields the validation predictions after each boosting stage,
# so errors[i] is the validation MSE of the ensemble made of i + 1 trees.
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1  # +1: argmin is 0-based

# Retrain from scratch using only the best number of trees.
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=115)

In [29]:
# SAME GOAL, BUT STOPPING EARLY INSTEAD OF TRAINING EVERY TREE AND PICKING THE BEST ONE AFTERWARDS

MAX_TREES = 120  # same cap as the exhaustive search (n_estimators=120)
PATIENCE = 5     # consecutive non-improving rounds tolerated before stopping

# warm_start=True makes each fit() call keep the trees already trained and
# only add the ones needed to reach the new n_estimators.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf")
error_going_up = 0
best_n_estimators = 0  # tree count that produced min_val_error
for n_estimators in range(1, MAX_TREES + 1):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        best_n_estimators = n_estimators
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == PATIENCE:
            # Early stopping. At this point gbrt still holds the PATIENCE
            # trees added after the best round; best_n_estimators records
            # where the validation error was lowest.
            break

In [33]:
#### XGBOOST ####  (EXTREME GRADIENT BOOSTING)

# Baseline: regressor with default settings, fitted on the training split
# and used to predict the validation split.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

# EARLY STOPPING

# Refit the same regressor while monitoring the validation split.
# NOTE(review): newer xgboost releases appear to expect early_stopping_rounds
# on the XGBRegressor constructor rather than on fit() - confirm against the
# installed version before upgrading.
xgb_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=2,
)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.38974
[1]	validation_0-rmse:0.32492
[2]	validation_0-rmse:0.29121
[3]	validation_0-rmse:0.27635
[4]	validation_0-rmse:0.27144
[5]	validation_0-rmse:0.26412
[6]	validation_0-rmse:0.26017
[7]	validation_0-rmse:0.25776
[8]	validation_0-rmse:0.25655
[9]	validation_0-rmse:0.25574
[10]	validation_0-rmse:0.25528
[11]	validation_0-rmse:0.25501
[12]	validation_0-rmse:0.25484
[13]	validation_0-rmse:0.25477
[14]	validation_0-rmse:0.25472
[15]	validation_0-rmse:0.25470
[16]	validation_0-rmse:0.25468
[17]	validation_0-rmse:0.25463
[18]	validation_0-rmse:0.25473
[19]	validation_0-rmse:0.25476
