1,2

In [2]:
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
from mlxtend.classifier import EnsembleVoteClassifier

# Load the breast-cancer dataset that ships with scikit-learn.
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

# Two-stage split: hold out 25% of the data as the test set, then take 25%
# of what remains as a validation set. Note that the second call rebinds
# X_train / y_train to the smaller training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

print('Train/Valid/Test sizes:', len(y_train), len(y_val), len(y_test))

# Three decision trees that differ only in depth (1, 2 and 3), combined
# with equal weights in mlxtend's EnsembleVoteClassifier.
clf1, clf2, clf3 = (
    DecisionTreeClassifier(max_depth=depth, random_state=42)
    for depth in (1, 2, 3)
)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[1, 1, 1])

# Fit every model on the training portion and report its validation accuracy.
labels = ['Classifier 1', 'Classifier 2', 'Classifier 3', 'Ensemble']
for clf, label in zip((clf1, clf2, clf3, eclf), labels):
    clf.fit(X_train, y_train)
    val_accuracy = clf.score(X_val, y_val)
    print("Validation Accuracy: %0.2f [%s]" % (val_accuracy, label))

# Only the ensemble is evaluated on the held-out test set.
print("Test Accuracy: %0.2f" % eclf.score(X_test, y_test))

Train/Valid/Test sizes: 319 107 143
Validation Accuracy: 0.94 [Classifier 1]
Validation Accuracy: 0.95 [Classifier 2]
Validation Accuracy: 0.95 [Classifier 3]
Validation Accuracy: 0.95 [Ensemble]
Test Accuracy: 0.90


3

In [3]:
from sklearn.ensemble import BaggingClassifier

# Base learner: an entropy-criterion decision tree with no depth limit.
tree = DecisionTreeClassifier(max_depth=None, criterion='entropy', random_state=42)

# Bag 500 copies of the base tree. Samples are bootstrapped, features are
# not; oob_score=True makes the fitted model expose `oob_score_`.
bag = BaggingClassifier(
    estimator=tree,
    n_estimators=500,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=True,
    n_jobs=-1,  # -1 uses all processors (jobs run in parallel)
    random_state=42,
)
bag.fit(X_train, y_train)

# Report the out-of-bag estimate next to the accuracy on the test split.
print("OOB Accuracy: %0.2f" % bag.oob_score_)
print("Test Accuracy: %0.2f" % bag.score(X_test, y_test))

OOB Accuracy: 0.95
Test Accuracy: 0.96


Básicamente, estos algoritmos (bootstrap aggregating o «bagging», AdaBoost) generan varias «bolsas» de datos a partir del training set, cada uno de una manera distinta.

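AdaBoost entrena los modelos de forma secuencial: en cada iteración aumenta el peso de los elementos mal clasificados por el modelo anterior, de modo que el siguiente modelo se concentre en ellos. La predicción final no usa solo el último modelo (ni el que mejor se adaptó), sino una votación ponderada de todos los modelos, en la que cada uno pesa según su precisión.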

4

In [4]:
from sklearn.ensemble import AdaBoostClassifier

# Open question: why does accuracy drop when max_depth is raised to 10?
# Possibly overfitting -- TODO confirm.
# Base learner: a depth-1 tree using the entropy criterion.
tree = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=42)

# Boost 500 copies of the base tree with the SAMME algorithm; the model is
# fitted on the training split as part of the same expression.
boost = AdaBoostClassifier(
    estimator=tree,
    algorithm='SAMME',
    n_estimators=500,
    random_state=42,
).fit(X_train, y_train)

print("Test Accuracy: %0.2f" % boost.score(X_test, y_test))

Test Accuracy: 0.97


5

In [5]:
from sklearn.ensemble import GradientBoostingClassifier


# Gradient boosting with 100 stages of depth-8 trees and a 0.1 learning rate.
# NOTE: this rebinds `boost`, replacing whatever model the name held before.
boost = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=8, random_state=1
).fit(X_train, y_train)

print("Test Accuracy: %0.2f" % boost.score(X_test, y_test))

Test Accuracy: 0.94


In [10]:
from sklearn.ensemble import RandomForestClassifier


# Random forest of 100 trees with no depth limit.
# max_features=0.3 is a float; presumably scikit-learn treats it as the
# fraction of features considered at each split -- verify against the docs.
forest = RandomForestClassifier(
    n_estimators=100,
    max_features=0.3,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)

print("Test Accuracy: %0.2f" % forest.score(X_test, y_test))

Test Accuracy: 0.95
