In [1]:
# Step 1: Imports and data loading

import pandas as pd


from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
import warnings
warnings.filterwarnings('ignore')

# Pull the breast-cancer dataset that ships with scikit-learn into memory
data = load_breast_cancer()

# One row per sample: the feature columns plus a trailing 'target' label column
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Split the frame into labels / features, then hold out a test set
# (fixed seed so the split is the same on every run)
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


### Fitting the Ensemble

Define an `AdaBoostClassifier` estimator with default parameters and fit it to the training data `X_train` and `y_train`.

In [2]:
# Baseline ensemble: every hyperparameter is left at its scikit-learn default.
# Only random_state is pinned (the same seed the grid-searched ensembles in
# this notebook use) so the baseline score is reproducible: the default
# decision-tree weak learner permutes features at each split, so ties between
# equally good splits could otherwise be broken differently from run to run.
model_1 = AdaBoostClassifier(random_state=42).fit(X_train, y_train)

# Mean accuracy on the held-out test split
model_1_acc = model_1.score(X_test, y_test)

model_1_acc

0.958041958041958

### Grid Searching the Ensemble

The main parameters to search are the number of estimators and the complexity of the base estimator (here, the maximum depth of the decision tree).

In [3]:
# Candidate values: ensemble size, and depth of the tree used as weak learner.
# The 'estimator__' prefix routes max_depth to the wrapped DecisionTreeClassifier.
params = {
    'n_estimators': [100, 200],
    'estimator__max_depth': [1, 2, 3],
}

# Seeded AdaBoost ensemble whose weak learner is an explicit decision tree
boosted_trees = AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=42)

# Cross-validated search over the grid, fitted on the training split only
tree_grid = GridSearchCV(boosted_trees, param_grid=params)
tree_grid.fit(X_train, y_train)

# Test-set score of the search's refitted best estimator
grid_acc = tree_grid.score(X_test, y_test)

grid_acc

0.9790209790209791

### A Different Base Estimator

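AdaBoost can also be built on a different base estimator, such as `LogisticRegression`. Here the ensemble is placed in a pipeline after feature scaling, and the regularization strength `C` of the base estimator is grid searched.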

In [4]:
# Candidate regularisation strengths for the logistic-regression weak learner.
# The key walks pipeline step 'mod' -> its 'estimator' -> that estimator's 'C'.
params = {'mod__estimator__C': [.001, 0.01, 0.1, 1.0, 10.0]}

# Standardise the features, then boost logistic-regression learners (seeded)
p = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('mod', AdaBoostClassifier(estimator=LogisticRegression(), random_state=42)),
])

# Cross-validated search over C, fitted on the training split only
g = GridSearchCV(p, param_grid=params).fit(X_train, y_train)

# Test-set score of the search's refitted best pipeline
score2 = g.score(X_test, y_test)

score2

0.9790209790209791

The grid-searched tree model and the grid-searched logistic model both performed better on the test set (accuracy ≈ 0.979 each) than the base `AdaBoostClassifier` (accuracy ≈ 0.958).