In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve

In [4]:
from sklearn.base import clone

In [5]:
from sklearn.metrics import accuracy_score
#from sklearn.metrics import confusion_matrix

In [6]:
from sklearn import tree

In [7]:
from sklearn.neural_network import MLPClassifier

In [8]:
from sklearn.ensemble import AdaBoostClassifier

In [9]:
from sklearn.svm import NuSVC

In [10]:
from sklearn.neighbors import KNeighborsClassifier

### Read Cleaned Data

In [11]:
a = pd.read_csv('clean/adult_clean_norm_onehot.csv',
                index_col='idx')
t = pd.read_csv('clean/titanic_clean_norm_onehot.csv',
                index_col='idx')

### Define Utility Functions

In [12]:
def subset_data(which='a', size=None):
    dataset = a.copy()
    if which != 'a':
        dataset = t.copy()
    if size is not None:
        dataset = dataset[:size].copy()
    last_col = len(dataset.columns)-1
    
    X = dataset.drop(columns=dataset.columns[last_col])
    y = dataset[[dataset.columns[last_col]]]
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=1)
    return (X_train, X_test, y_train, y_test)

In [13]:
def print_max_scores(test_scores_mean, train_scores_mean, param_name, param_range):
    if param_name == 'Training Examples':
        max_test_score = test_scores_mean[-1]
        max_train_score = train_scores_mean[-1]
        max_n = len(param_range)-1
        
    else:
        max_n = 0
        max_test_score = test_scores_mean[0]

        for n, score in enumerate(test_scores_mean):
            if score > max_test_score:
                max_test_score = score
                max_n = n
    
    max_train_score = train_scores_mean[max_n]
    delta = round(max_train_score*100,1) - round(max_test_score*100,1) 
    delta = round(delta,1)
    
    delta_state =     'Unacceptable'
    if delta < 1.0:
        delta_state = '  Acceptable'
    elif delta <= 3.0:
        delta_state = '    Marginal'
    
    print('        C.V. Score =', round(max_test_score*100,1), '@', param_name, '=', param_range[max_n])
    print('    Training Score =', round(max_train_score*100,1))
    print(delta_state, 'Delta =', delta)

In [14]:
def plot_validation_curve(estimator, title, modifier,
                          X_train, y_train, cv,
                          ylim, param_name, param_range):
    plt.figure(figsize=[8,9/2])
    plt.title('Validation Curve' + ' - ' + title + '\n' + modifier)
    plt.ylim(*ylim)
    plt.xlabel(param_name)
    plt.ylabel('Score')

    train_scores, test_scores = validation_curve(
        estimator, X_train, y_train, param_name=param_name, param_range=param_range,
        cv=cv, scoring="accuracy", n_jobs=-1)
        
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    
    print_max_scores(test_scores_mean,
                     train_scores_mean,
                     param_name,
                     param_range)
    
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='darkorange')
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='navy')
    plt.plot(param_range, train_scores_mean, 'o-', color='darkorange',
             label="Training score")
    plt.plot(param_range, test_scores_mean, 'o-', color='navy',
             label="Cross-validation score")

    plt.legend(loc="best")

    return plt

In [15]:
def plot_learning_curve(estimator, title, modifier,
                        X_train, y_train, cv, 
                        ylim=None, train_sizes=None):
    plt.figure(figsize=[8,9/2])
    plt.title('Learning Curve' + ' - ' + title + '\n' + modifier)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training Examples')
    plt.ylabel('Score')
        
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_train, y_train, cv=cv, n_jobs=-1, train_sizes=train_sizes)
        
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    
    print_max_scores(test_scores_mean,
                     train_scores_mean,
                     'Training Examples',
                     train_sizes)
    
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [16]:
def save_fig(section_num, fig_num, fig):
    path = 'report/figs/' + str(section_num) + '-' + str(fig_num) + '.png'
    fig.savefig(path,
                bbox_inches='tight',#)#,
                pad_inches=0.375)

### Subset and Split the Data

In [17]:
cv = ShuffleSplit(n_splits=10, 
                  test_size=0.2, 
                  random_state=1)
lc_ylim=(0.7, 1.0)
vc_ylim=(0.7, 1.0)
train_sizes = np.linspace(0.1, 1.0, 10)

## Decision Tree

In [None]:
learner = 'Decision Tree'

In [None]:
# default criterion = 'gini'

#### Titanic

In [None]:
X_train, X_test, y_train, y_test = subset_data('t')

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(#criterion='entropy',
                                       random_state=1)
fig = plot_learning_curve(tree_est, learner, 'Default Hyperparameters',#"criterion='entropy'",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
section_num = 1
fig_num = 1
save_fig(section_num, fig_num, fig)
fig_num += 1

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='entropy',
                                       random_state=1)
fig = plot_learning_curve(tree_est, learner, "criterion='entropy'",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
save_fig(section_num, fig_num, fig)
fig_num += 1

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='entropy',
                                       random_state=1)
fig = plot_validation_curve(tree_est, learner, "criterion='entropy'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='max_leaf_nodes', 
                            param_range=np.linspace(5, 50, 10, dtype=int))
save_fig(section_num, fig_num, fig)
fig_num += 1

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='entropy',
                                       random_state=1)
fig = plot_validation_curve(tree_est, learner, "criterion='entropy'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='max_leaf_nodes', 
                            param_range=np.linspace(5, 14, 10, dtype=int))
save_fig(section_num, fig_num, fig)
fig_num += 1

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='entropy',
                                       max_leaf_nodes=8,
                                       random_state=1)
fig = plot_learning_curve(tree_est, learner, "criterion='entropy', max_leaf_nodes=8",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
save_fig(section_num, fig_num, fig)
fig_num += 1

#### Adult

In [None]:
X_train, X_test, y_train, y_test = subset_data('a')

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='entropy',
                                       random_state=1)
fig = plot_learning_curve(tree_est, learner, "criterion='entropy'",#
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
section_num, fig_num = 2, 1
save_fig(section_num, fig_num, fig)

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='entropy',
                                       random_state=1)
fig = plot_validation_curve(tree_est, learner, "criterion='entropy'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='max_leaf_nodes', 
                            param_range=np.linspace(20, 200, 10, dtype=int))
section_num, fig_num = 2, 2
save_fig(section_num, fig_num, fig)

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='entropy',
                                       max_leaf_nodes=100,
                                       random_state=1)
fig = plot_learning_curve(tree_est, learner, "criterion='entropy', max_leaf_nodes=100",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
section_num, fig_num = 2, 3
save_fig(section_num, fig_num, fig)

In [None]:
# %%timeit -r 1 -n 1
tree_est = tree.DecisionTreeClassifier(criterion='gini',
                                       max_leaf_nodes=100,
                                       random_state=1)
fig = plot_learning_curve(tree_est, learner, "criterion='gini', max_leaf_nodes=100",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
#section_num, fig_num = 2, 4
#save_fig(section_num, fig_num, fig)

## Neural Network

In [None]:
learner = 'Multilayer Perceptron'

In [None]:
# default hidden_layer_sizes = (100,)
# default activation = 'relu'
# default solver = 'adam'
# default max_iter = 200

#### Titanic

In [None]:
X_train, X_test, y_train, y_test = subset_data('t')

In [None]:
#%%timeit -r 1 -n 1
mlp_est = MLPClassifier(random_state=1)
fig = plot_learning_curve(mlp_est, learner, 
                          "Default Hyperparameters",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
section_num, fig_num = 3, 1
save_fig(section_num, fig_num, fig)

With the default max_iter of 200, the classifer exhibits high bias, with a score of only roughly 65% on both training and cross-validation data.

max_iter is the number of times a given data point can be used to train the model

In [None]:
# %%timeit -r 1 -n 1
mlp_est = MLPClassifier(activation='logistic',
                        solver='sgd',
                        max_iter=2000,
                        random_state=1)
fig = plot_learning_curve(mlp_est, learner, 
                          "activation='logistic', solver='sgd', max_iter=2000",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
section_num, fig_num = 3, 2
save_fig(section_num, fig_num, fig)

In [None]:
# %%timeit -r 1 -n 1
mlp_est = MLPClassifier(activation='logistic',
                        solver='adam',
                        max_iter=2000,
                        random_state=1)
fig =  plot_validation_curve(mlp_est, learner, 
                             "activation='logistic', solver='adam', max_iter=2000",
                             X_train, y_train, cv=cv, ylim=vc_ylim, 
                             param_name='hidden_layer_sizes', 
                             param_range=np.linspace(2, 20, 10, dtype=int))
section_num, fig_num = 3, 3
save_fig(section_num, fig_num, fig)

The spread in the following chart can be reduced by increasing the max_iter parameter, but the final values remain unchanged.

In [None]:
# %%timeit -r 1 -n 1
mlp_est = MLPClassifier(activation='logistic',
                        solver='adam',
                        max_iter=2000,
                        hidden_layer_sizes=(12),
                        random_state=1)
fig = plot_learning_curve(mlp_est, learner, 
                          "activation='logistic', solver='adam', max_iter=2000, hidden_layer_sizes=(12)",
                          X_train, y_train, cv=cv, ylim=lc_ylim, 
                          train_sizes=train_sizes)
section_num, fig_num = 3, 4
save_fig(section_num, fig_num, fig)

#### Adult

In [None]:
X_train, X_test, y_train, y_test = subset_data('a')

3min 27s

In [None]:
#%%timeit -r 1 -n 1
mlp_est = MLPClassifier(random_state=1)
fig = plot_learning_curve(mlp_est, learner, 
                          "Default Hyperparameters",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 4, 1
save_fig(section_num, fig_num, fig)

~~4min 30s~~

In [None]:
#%%timeit -r 1 -n 1
mlp_est = MLPClassifier(activation='logistic',
                        solver='adam',
                        random_state=1)
fig = plot_learning_curve(mlp_est, learner, 
                          "activation='logistic', solver='adam'",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 4, 2
save_fig(section_num, fig_num, fig)

In [None]:
#%%timeit -r 1 -n 1
mlp_est = MLPClassifier(activation='logistic',
                        solver='adam',
                        random_state=1)
fig =  plot_validation_curve(mlp_est, learner, 
                             "activation='logistic', solver='adam'",
                             X_train, y_train, cv=cv, ylim=vc_ylim, 
                             param_name='hidden_layer_sizes', 
                             param_range=np.linspace(1, 20, 20, dtype=int))
section_num, fig_num = 4, 3
save_fig(section_num, fig_num, fig)

In [None]:
#%%timeit -r 1 -n 1
mlp_est = MLPClassifier(activation='logistic',
                        solver='adam',
                        hidden_layer_sizes=17,
                        random_state=1)
fig = plot_learning_curve(mlp_est, learner, 
                          "activation='logistic', solver='adam', hidden_layer_sizes=(17)",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 4, 4
save_fig(section_num, fig_num, fig)

## Boosting

In [None]:
# base_estimator = DecisionTreeClassifier(max_depth=1)
# n_estimators = 50

#### Titanic

In [None]:
X_train, X_test, y_train, y_test = subset_data('t')

In [None]:
learner = 'Single-Layer Decision Tree'

In [None]:
base_est = tree.DecisionTreeClassifier(max_depth=1,
                                       random_state=1)
fig = plot_learning_curve(base_est, learner,
                          "Default Hyperparameters",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 5, 1
save_fig(section_num, fig_num, fig)

In [None]:
learner = 'Boosted Single-Layer Decision Tree'

In [None]:
boost_est = AdaBoostClassifier(base_estimator=base_est,
                               random_state=1)
fig = plot_learning_curve(boost_est, learner, 
                          "Default Hyperparameters",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 5, 2
save_fig(section_num, fig_num, fig)

In [None]:
boost_est = AdaBoostClassifier(base_estimator=base_est,
                               random_state=1)
fig = plot_validation_curve(boost_est, learner, 
                            "Default Hyperparameters",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='n_estimators', 
                            param_range=np.linspace(1, 15, 15, dtype=int))
section_num, fig_num = 5, 3
save_fig(section_num, fig_num, fig)

In [None]:
boost_est = AdaBoostClassifier(base_estimator=base_est,
                               n_estimators=6,
                               random_state=1)
fig = plot_learning_curve(boost_est, learner, 
                          "n_estimators=6",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 5, 4
save_fig(section_num, fig_num, fig)

#### Adult

In [None]:
X_train, X_test, y_train, y_test = subset_data('a')

In [None]:
learner = 'Two-Layer Decision Tree'

In [None]:
base_est = tree.DecisionTreeClassifier(max_depth=2,
                                       random_state=1)
fig = plot_learning_curve(base_est, learner, 
                          "Default Hyperparameters",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 6, 1
save_fig(section_num, fig_num, fig)

In [None]:
learner = 'Boosted Two-Layer Decision Tree'

In [None]:
boost_est = AdaBoostClassifier(base_estimator=base_est,
                               random_state=1)
fig = plot_validation_curve(boost_est, learner, 
                            "Default Hyperparameters",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='n_estimators', 
                            param_range=np.linspace(5, 150, 30, dtype=int))
section_num, fig_num = 6, 2
save_fig(section_num, fig_num, fig)

In [None]:
boost_est = AdaBoostClassifier(base_estimator=base_est,
                               n_estimators=55,
                               random_state=1)
fig = plot_learning_curve(boost_est, learner, 
                          "n_estimators=55",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 6, 3
save_fig(section_num, fig_num, fig)

## Support Vector Machine

In [None]:
learner = 'Support Vector Machine'

In [None]:
# nu = 0.5
# kernel = 'rbf' other options = 'linear', 'poly', 'sigmoid'

#### Titanic

In [None]:
X_train, X_test, y_train, y_test = subset_data('t')

In [None]:
svc_est = NuSVC(kernel='rbf',
                random_state=1)
fig = plot_learning_curve(svc_est, learner, 
                          "kernel='rbf'",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 7, 1
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='linear',
                random_state=1)
fig = plot_learning_curve(svc_est, learner, 
                          "kernel='linear'",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 7, 2
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='poly',
                random_state=1)
fig = plot_learning_curve(svc_est, learner, 
                          "kernel='poly'",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 7, 3
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='rbf',
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='rbf'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='nu', 
                            param_range=np.linspace(.05, .7, 14))
section_num, fig_num = 7, 4
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='linear',
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='linear'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='nu', 
                            param_range=np.linspace(.05, .7, 14))
section_num, fig_num = 7, 5
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='poly',
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='poly'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='degree', 
                            param_range=np.linspace(1,6,6,dtype=int))
section_num, fig_num = 7, 7
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='poly',
                degree=2,
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='poly', degree=2",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='nu', 
                            param_range=np.linspace(.05, .7, 14))

In [None]:
svc_est = NuSVC(kernel='poly',
                degree=2,
                nu=.4,
                random_state=1)
fig = plot_learning_curve(svc_est, learner, 
                          "kernel='poly', degree=2, nu=0.4",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 7, 8
save_fig(section_num, fig_num, fig)

#### Adult

In [None]:
X_train, X_test, y_train, y_test = subset_data('a',
                                               size=15625)

In [None]:
svc_est = NuSVC(kernel='rbf',
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='rbf'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='nu', 
                            param_range=np.linspace(.05, .45, 9))
section_num, fig_num = 8, 1
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='linear',
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='linear'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='nu', 
                            param_range=np.linspace(.05, .45, 9))
section_num, fig_num = 8, 2
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='poly',
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='poly'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='nu', 
                            param_range=np.linspace(.05, .45, 9))
section_num, fig_num = 8, 3
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='poly',
                nu=0.45,
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='poly'",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='degree', 
                            param_range=np.linspace(1,6,6,dtype=int))
section_num, fig_num = 8, 4
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='rbf',
                nu=0.45,
                random_state=1)
fig = plot_learning_curve(svc_est, learner, 
                          "kernel='rbf', nu=0.45",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 8, 5
save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='poly',
                degree=2,
                random_state=1)
fig = plot_validation_curve(svc_est, learner, 
                            "kernel='poly', degree=2",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='nu', 
                            param_range=np.linspace(.05, .45, 9))
#section_num, fig_num = 8, 5
#save_fig(section_num, fig_num, fig)

In [None]:
svc_est = NuSVC(kernel='poly',
                nu=0.45,
                degree=2,
                random_state=1)
fig = plot_learning_curve(svc_est, learner, 
                          "kernel='poly', degree=2, nu=0.45",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
#section_num, fig_num = 8, 6
#save_fig(section_num, fig_num, fig)

## K-Nearest Neighbors

In [None]:
learner = 'K-Nearest Neighbors'

In [None]:
# n_neighbors = 5
# p = 2

#### Titanic

In [None]:
X_train, X_test, y_train, y_test = subset_data('t')

In [None]:
neigh_est = KNeighborsClassifier()
fig = plot_learning_curve(neigh_est, learner, 
                          "Default Hyperparameters",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 9, 1
save_fig(section_num, fig_num, fig)

In [None]:
neigh_est = KNeighborsClassifier()
fig = plot_validation_curve(neigh_est, learner, 
                            "Default Hyperparameters",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='p', 
                            param_range=np.linspace(1, 3, 3, dtype=int))
section_num, fig_num = 9, 2
save_fig(section_num, fig_num, fig)

In [None]:
neigh_est = KNeighborsClassifier(p=1)
fig = plot_validation_curve(neigh_est, learner, 
                            "p=1",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='n_neighbors', 
                            param_range=np.linspace(2, 30, 15, dtype=int))
section_num, fig_num = 9, 3
save_fig(section_num, fig_num, fig)

In [None]:
neigh_est = KNeighborsClassifier(n_neighbors=28,
                                 p=1)
fig = plot_learning_curve(neigh_est, learner, 
                          "n_neighbors=28, p=1",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 9, 4
save_fig(section_num, fig_num, fig)

#### Adult

In [None]:
X_train, X_test, y_train, y_test = subset_data('a')

In [None]:
# %%timeit -r 1 -n 1
neigh_est = KNeighborsClassifier()
fig = plot_learning_curve(neigh_est, learner, 
                          "Default Hyperparameters",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 10, 1
save_fig(section_num, fig_num, fig)

In [None]:
#%%timeit -r 1 -n 1
neigh_est = KNeighborsClassifier()
fig = plot_validation_curve(neigh_est, learner, 
                            "Default Hyperparameters",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='p', 
                            param_range=np.linspace(1, 3, 3, dtype=int))
section_num, fig_num = 10, 2
save_fig(section_num, fig_num, fig)

In [None]:
#%%timeit -r 1 -n 1
neigh_est = KNeighborsClassifier(p=1)
fig = plot_validation_curve(neigh_est, learner, 
                            "p=1",
                            X_train, y_train, cv=cv, ylim=vc_ylim, 
                            param_name='n_neighbors', 
                            param_range=np.linspace(10, 100, 10, dtype=int))
section_num, fig_num = 10, 3
save_fig(section_num, fig_num, fig)

In [None]:
neigh_est = KNeighborsClassifier(p=1,n_neighbors=60)
fig = plot_learning_curve(neigh_est, learner, 
                          "p=1, n_neighbors=60",
                          X_train, y_train, cv=cv, 
                          ylim=lc_ylim, train_sizes=train_sizes)
section_num, fig_num = 10, 4
save_fig(section_num, fig_num, fig)

## Training Accuracy

In [None]:
def print_results(clf):
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print("Training: ", round(train_score*100.,1))
    print(" Testing: ", round(test_score*100.,1))

#### Titanic

In [None]:
X_train, X_test, y_train, y_test = subset_data('t')

In [None]:
clf_1 = tree.DecisionTreeClassifier(criterion='entropy',
                                  max_leaf_nodes=8,
                                  random_state=1)
clf = clone(clf_1)
print_results(clf)

In [None]:
clf_2 = MLPClassifier(activation='logistic',
                    solver='adam',
                    max_iter=2000,
                    hidden_layer_sizes=12,
                    random_state=1)
clf = clone(clf_2)
print_results(clf)

In [None]:
base_est = tree.DecisionTreeClassifier(max_depth=1,
                                       random_state=1)
clf_3 = AdaBoostClassifier(base_estimator=base_est,
                           n_estimators=6,
                           random_state=1)
clf = clone(clf_3)
print_results(clf)

In [None]:
clf_4 = NuSVC(kernel='poly',
              nu=.4,
              degree=2,
              random_state=1)
clf = clone(clf_4)
print_results(clf)

In [None]:
clf_5 = KNeighborsClassifier(n_neighbors=28,
                             p=1)
clf = clone(clf_5)
print_results(clf)

#### Adult

In [None]:
X_train, X_test, y_train, y_test = subset_data('a')

In [None]:
clf_6 = tree.DecisionTreeClassifier(criterion='entropy',
                                    max_leaf_nodes=100,
                                    random_state=1)
clf = clone(clf_6)
print_results(clf)

In [None]:
clf_7 = MLPClassifier(activation='logistic',
                      solver='adam',
                      hidden_layer_sizes=17,
                      random_state=1)
clf = clone(clf_7)
print_results(clf)

In [None]:
base_est = tree.DecisionTreeClassifier(max_depth=2,
                                       random_state=1)
clf_8 = AdaBoostClassifier(base_estimator=base_est,
                           n_estimators=55,
                           random_state=1)
clf = clone(clf_8)
print_results(clf)

In [None]:
clf_9 = NuSVC(kernel='poly',
              nu=0.45,
              degree=2,
              random_state=1)
clf = clone(clf_9)
print_results(clf)

In [None]:
clf_10 = KNeighborsClassifier(n_neighbors=60,
                              p=1)
clf = clone(clf_10)
print_results(clf)

## Training Times

Also do training iterations?

#### Titanic

In [None]:
X_train, X_test, y_train, y_test = subset_data('t')

In [None]:
clf = clone(clf_1)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_2)

In [None]:
#%%timeit
#clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_3)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_4)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_5)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
X_train, X_test, y_train, y_test = subset_data('a')

In [None]:
clf = clone(clf_6)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_7)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_8)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_9)

In [None]:
%%timeit
clf.fit(X_train, y_train)

In [None]:
clf = clone(clf_10)

In [None]:
%%timeit
clf.fit(X_train, y_train)

## Iterations

In [18]:
X_train, X_test, y_train, y_test = subset_data('t')

In [19]:
clf_2 = MLPClassifier(activation='logistic',
                    solver='adam',
                    max_iter=2000,
                    hidden_layer_sizes=12,
                    random_state=1)
clf_2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=12, learning_rate='constant',
              learning_rate_init=0.001, max_iter=2000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [22]:
clf_2.n_iter_

458

In [20]:
X_train, X_test, y_train, y_test = subset_data('a')

In [21]:
clf_7 = MLPClassifier(activation='logistic',
                      solver='adam',
                      hidden_layer_sizes=17,
                      random_state=1)
clf_7.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=17, learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [23]:
clf_7.n_iter_

185