# Section Five - Predicting the future (30 Minutes)
Your model performs remarkably well on your dataset, but fails miserably when it sees new data. How can we build models that accurately predict the future, rather than just memorizing the quirks of the dataset they were trained on?

## Build a robust model with cross validation

In [7]:
# Imports for the cross-validation walkthrough, then load the iris dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import linear_model
# The loaded object exposes the feature matrix as `.data` and the class labels as `.target`
dataset = datasets.load_iris()

In [4]:
# Feature matrix dimensions: (n_samples, n_features)
dataset.data.shape

(150, 4)

In [5]:
# One class label per sample
dataset.target.shape

(150,)

In [6]:
# Fraction (not a percentage, despite the name) of samples reserved for testing
hold_out_percent = 0.4

# Evaluating on the same rows we train on would hide overfitting, so keep a
# held-out test split; the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    random_state=42,
    test_size=hold_out_percent,
)

# Shapes of the four pieces, in the order: X_train, X_test, y_train, y_test
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((90, 4), (60, 4), (90,), (60,))

In [27]:
# Baseline classifier: L2-regularised logistic regression fitted on the training split only.
model = linear_model.LogisticRegression(
    C=1.0,               # inverse of regularisation strength; must be a positive float
    penalty='l2',
    fit_intercept=True,
    dual=False,          # primal rather than dual formulation
    tol=0.0001,          # tolerance for the stopping criterion
).fit(X_train, y_train)

# Caveat: if we keep tweaking parameters until the test score looks optimal,
# we end up overfitting the test set as well.
# A three-way (train, validation, test) split avoids that, but it is messy.

In [11]:
# The remedy: k-fold cross-validation
from sklearn.model_selection import cross_val_score

model = linear_model.LogisticRegression(dual=False, penalty='l2')

# Five rounds of split / fit / score, each round holding out a different fold
scores = cross_val_score(estimator=model, X=dataset.data, y=dataset.target, cv=5)
scores

array([1.        , 0.96666667, 0.93333333, 0.9       , 1.        ])

In [12]:
# Mean accuracy across the folds and twice the standard deviation of the fold scores.
# Mean +/- 2*std is a rough "95%" spread of the per-fold accuracy; it is a rule of
# thumb, not a formal confidence interval for the mean.
scores.mean(), scores.std() * 2

(0.9600000000000002, 0.07774602526460399)

In [13]:
# Replace the estimator's default score() with macro-averaged F1 via `scoring`
cross_val_score(estimator=model, X=dataset.data, y=dataset.target, scoring='f1_macro', cv=5)

array([1.        , 0.96658312, 0.93333333, 0.89769821, 1.        ])

## Create complex models with scikit-learn pipelines

In [None]:
# Reference signature only (not meant to be executed): `steps` is a placeholder,
# not a variable defined in this notebook.
sklearn.pipeline.Pipeline(
    steps,  # list of (name, transform) pairs; the transforms implement fit/transform
    memory=None) # optional caching of the fitted transformers: folder name for the cache, None to disable

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [17]:
# Build (but do not fit) a pipeline with explicitly named steps: PCA feeding an SVC
Pipeline([('pca', PCA()), ('classifier', SVC())])

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [19]:
# make_pipeline is the shorthand: step names are generated from the lowercased
# class names ('pca', 'svc') instead of being supplied by hand
from sklearn.pipeline import make_pipeline
make_pipeline(PCA(), SVC()) 

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [20]:
# Keep a handle on the pipeline; `steps` is a list of (name, estimator) tuples,
# so index 1 is the second step (the SVC)
pipe = make_pipeline(PCA(), SVC()) 
pipe.steps[1]

('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False))

In [21]:
# The same step can be looked up by its generated name
pipe.named_steps["svc"]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
# Hyperparameters of an individual step are plain attributes on the estimator
pipe.named_steps["svc"].decision_function_shape

'ovr'

In [23]:
# Fit every step of the pipeline on the training split; the fitted pipeline is displayed
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [24]:
# Score on the data the pipeline was trained on (optimistic by construction)
pipe.score(X_train, y_train)

0.9777777777777777

In [25]:
# Score on the held-out split, which the pipeline never saw during fitting
pipe.score(X_test, y_test)

1.0

In [28]:
# Held-out accuracy of the logistic-regression baseline, for comparison with the
# PCA + SVC pipeline scored above.
#
# The name `model` is rebound to a fresh, unfitted LogisticRegression in the
# cross-validation cell (cross_val_score only fits clones), so on a top-to-bottom
# run `model.score(...)` would raise NotFittedError. Refit the baseline on the
# training split here so this cell does not depend on execution order.
baseline = linear_model.LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
).fit(X_train, y_train)
baseline.score(X_test, y_test)

0.9666666666666667

## Find the best model with hyperparameter search


In [None]:
# Reference signature only (not meant to be executed): `estimator` and `param_grid`
# are placeholders for the model to tune and the grid of values to try.
sklearn.model_selection.GridSearchCV(
    estimator, 
    param_grid, 
    scoring=None, # how to evaluate the predictions? str or callable
    n_jobs=None,  # jobs in parallel. None means 1. -1 uses all processors
)

In [None]:
# Reference signature: the SVC hyperparameters tuned in the grid search below.
# String values use plain ASCII quotes; typographic quotes are a SyntaxError in Python.
sklearn.svm.SVC(
    C=1.0,  # Penalty parameter C of the error term.
    kernel='rbf',
    degree=3,  # degree of the poly kernel
    gamma='auto_deprecated'  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
)

In [29]:
# Two independent sub-grids: a linear kernel varying only C, and an RBF kernel
# varying C together with gamma.
param_grid = [
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['linear'],
    },
    {
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf'],
    },
]

In [38]:
from sklearn.model_selection import GridSearchCV

# Try every combination in param_grid with 5-fold cross-validation on the
# training split, keeping the per-fold training scores as well.
model = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    return_train_score=True,
    cv=5,
)
model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [39]:
import pandas as pd

# One row per parameter combination: fit/score timings, per-fold and mean
# test/train scores, and the rank by mean test score
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.000999,2e-06,0.0,0.0,1,linear,,"{'C': 1, 'kernel': 'linear'}",1.0,0.894737,...,0.955556,0.040879,1,0.971429,1.0,0.986301,0.972603,0.958904,0.977847,0.01407
1,0.000801,0.000401,0.000599,0.000489,10,linear,,"{'C': 10, 'kernel': 'linear'}",0.95,0.842105,...,0.933333,0.051783,7,0.971429,1.0,0.986301,0.986301,0.958904,0.980587,0.014115
2,0.00139,0.000802,0.001197,0.001466,100,linear,,"{'C': 100, 'kernel': 'linear'}",0.95,0.789474,...,0.944444,0.082619,2,0.985714,1.0,0.972603,0.986301,0.986301,0.986184,0.008667
3,0.0004,0.00049,0.0002,0.000399,1000,linear,,"{'C': 1000, 'kernel': 'linear'}",0.95,0.789474,...,0.944444,0.082619,2,0.985714,1.0,0.986301,0.986301,0.986301,0.988924,0.005543
4,0.000407,0.000498,0.000192,0.000383,1,rbf,0.001,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}",0.4,0.578947,...,0.555556,0.093728,11,0.557143,0.591549,0.575342,0.424658,0.630137,0.555766,0.069837
5,0.000999,0.001097,0.000395,0.000484,1,rbf,0.0001,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.35,0.578947,...,0.4,0.092578,12,0.357143,0.591549,0.356164,0.356164,0.356164,0.403437,0.094057
6,0.000398,0.000488,0.000598,0.000488,10,rbf,0.001,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}",1.0,0.842105,...,0.911111,0.076542,8,0.857143,0.943662,0.917808,0.890411,0.90411,0.902627,0.028756
7,0.000997,0.00063,0.0002,0.000399,10,rbf,0.0001,"{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}",0.5,0.578947,...,0.577778,0.059998,10,0.571429,0.591549,0.589041,0.479452,0.657534,0.577801,0.057247
8,0.000399,0.000488,0.000397,0.000487,100,rbf,0.001,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}",1.0,0.842105,...,0.944444,0.059025,2,0.928571,0.985915,0.958904,0.945205,0.958904,0.9555,0.018865
9,0.000798,0.000399,0.000201,0.000402,100,rbf,0.0001,"{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}",1.0,0.842105,...,0.911111,0.076542,8,0.857143,0.943662,0.917808,0.890411,0.90411,0.902627,0.028756


In [43]:
# Recap of the step names; grid keys for a pipeline take the form '<step name>__<parameter>'
pipe.steps

[('pca',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [41]:
# Tune the SVC step inside the pipeline: the 'svc__C' key addresses parameter C
# of the step named 'svc'. 5-fold CV on the training split, train scores kept.
model = GridSearchCV(
    estimator=pipe,
    param_grid={'svc__C': [0.1, 10, 100]},
    return_train_score=True,
    cv=5,
)
model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svc__C': [0.1, 10, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)