In [115]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

# Cross Validation in Detail

* Train | Test Split
* Train | Validation | Test Split
* Scikit-Learn cross_val_score
* Scikit-Learn cross_validate

Cross Validation Train | Test Split

In [116]:
# Load the advertising dataset; the path is relative to the current working directory.
df = pd.read_csv('notebook/DATA/Advertising.csv')
df

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [117]:
# Feature matrix: every column of df except the target column 'sales'.
X = df.drop(columns='sales')
X

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


In [118]:
# Target vector: the 'sales' column.
y = df['sales']
y

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: sales, Length: 200, dtype: float64

In [119]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; random_state fixes the split so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [120]:
from sklearn.preprocessing import StandardScaler


In [121]:
scaler = StandardScaler()

In [122]:
scaler.fit(X_train) # fit on the training data only, so the test set does not influence the scaling statistics

In [123]:
# Apply the training-set scaling to both splits.
# NOTE: this overwrites X_train / X_test, so re-running this cell would scale already-scaled data.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Use Ridge regression to practise adjusting hyperparameters

In [124]:
from sklearn.linear_model import Ridge # we import ridge since we will implement Cross Validation ourselves

In [125]:
# First hyperparameter guess: a large alpha (100).
model = Ridge(alpha=100)

In [126]:
model.fit(X_train, y_train)

In [127]:
y_pred = model.predict(X_test)

In [128]:
from sklearn.metrics import mean_squared_error

In [129]:
mean_squared_error(y_test, y_pred)

7.34177578903413

Adjusting the hyperparameters based on the test results (alpha = 1)

In [130]:
model_two = Ridge(alpha=1)

In [131]:
model_two.fit(X_train, y_train)

In [132]:
y_pred_two = model_two.predict(X_test)

In [133]:
mean_squared_error(y_test, y_pred_two)

2.319021579428752

This isn't the first time the model has been evaluated on X_test: we used the test error to choose alpha, so this score is slightly biased (too optimistic)

To fix this we need an additional split, so that hyperparameters are tuned on a separate validation set

# Train | Validation Strategy


The dataset is separated into Train, Validation and Test sets

We want our test set to have enough data to evaluate the model

In general: <br>
* Train : 70%
* Validation and Test : 30%

We set aside the test set to calculate the final metrics

# We don't adjust the hyperparameters based on the final Test set

We will perform train_test_split() twice

First on the full dataset to get the training data, then on the remainder to get the validation and test sets

We name the held-out part `other` since it will be split later into validation and test sets

In [134]:
# First split: 70% training data, 30% "other" (split again below into validation and test).
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size= 0.3, random_state=101)

Now split the other set

In [135]:
# The ratio applies to the remaining 30% from the first split:
# test_size=0.5 -> 50% of the remaining 30% -> the test set is 15% of all data
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=101)
# The validation/eval sets are unpacked before the test sets

In [136]:
len(X_train)

140

In [137]:
len(X_eval)

30

In [138]:
len(X_test)


30

In [139]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

In [140]:
# Scale every split with the scaler that was fit on the training set only.
# NOTE: this overwrites the unscaled splits, so re-running this cell would scale them twice.
X_train = scaler.transform(X_train)
X_eval = scaler.transform(X_eval)
X_test = scaler.transform(X_test)

In [141]:
from sklearn.linear_model import Ridge

In [142]:
model_one = Ridge(alpha=100) # alpha=100 turns out to be a poor choice (see the validation MSE below)

In [143]:
model_one.fit(X_train, y_train)

In [144]:
# Now evaluate on the validation data, using the model fitted in this section
# (model_one) rather than the `model` variable left over from the earlier section.
y_eval_pred = model_one.predict(X_eval)

In [145]:
from sklearn.metrics import mean_squared_error

In [146]:
mean_squared_error(y_eval, y_eval_pred) # very close to the MSE from the earlier train|test run with alpha=100

7.320101458823871

In [147]:
# Try a smaller alpha (1) and compare it on the same validation set.
model_two = Ridge(alpha=1)

In [148]:
model_two.fit(X_train, y_train)

In [149]:
y_eval_pred_two = model_two.predict(X_eval)

In [150]:
mean_squared_error(y_eval, y_eval_pred_two)

2.383783075056986

For the final performance we keep the second model and predict on the final test set

After this there is no adjusting of the hyper-params

In [151]:
y_final_test_pred = model_two.predict(X_test)

In [152]:
mean_squared_error(y_test, y_final_test_pred) # slightly lower MSE than on the validation set

2.2542600838005176

In the real world we deploy a model that is fit to the entire dataset

# 2. Cross Validation Score

To perform K-fold cross-validation we start by splitting the entire dataset

We split the data into training and test

The test set is held out for the final evaluation

We now split the training set based on a k variable

A common value is K = 5

We train on K-1 folds <br>
With K = 5, 4 folds are used for training and 1 is used for validation

We repeat this so that each fold is used once for validation, which gives us one error per fold

We then take the average of the errors over all the folds

In [153]:
# Rebuild the features and the target from the original frame for this section.
X = df.drop(columns='sales')
y = df.loc[:, 'sales']

In [154]:
from sklearn.model_selection import train_test_split

In [155]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [156]:
from sklearn.preprocessing import StandardScaler

In [157]:
scaler = StandardScaler()

In [158]:
scaler.fit(X_train)

In [159]:
X_train = scaler.transform(X_train)

In [160]:
X_test = scaler.transform(X_test)

In [161]:
model = Ridge(alpha=100) # high alpha: expected to give poor results

In [162]:
from sklearn.model_selection import cross_val_score 

In [163]:
# cross_val_score arguments:
#   estimator - the machine learning model to evaluate
#   X, y      - training features and target (X_train, y_train); the test set is not used here
#   scoring   - metric name; 'neg_mean_squared_error' is the negated MSE, so higher is better
#   cv        - number of folds (k)
# NOTE(review): X_train was already scaled with a scaler fit on the whole training set, so every
# validation fold contributed to the scaling statistics; a Pipeline(scaler, model) would avoid that.
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv = 5) # 5-fold cross validation

This runs the model and scores it 5 times

In [164]:
scores # negated MSE per fold: higher (closer to 0) means better

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

We take the mean of all the errors

In [165]:
abs(scores.mean()) # drop the sign to get the average (positive) mean squared error across folds

8.215396464543607

In [166]:
model = Ridge(alpha=1) # new, unfitted model with a lower alpha to cross-validate

Now that we have adjusted the model we compute the scores again

In [167]:
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
scores

array([-3.15513238, -1.58086982, -5.40455562, -2.21654481, -4.36709384])

In [168]:
abs(scores.mean()) # averaged over 5 different validation folds, so it differs from a single-split MSE

3.344839296530695

This is a better evaluation since it tests the model on multiple validation folds

In [170]:
model.fit(X_train, y_train) # cross_val_score works on clones, so fit the chosen model on the full training set before the final test

In [171]:
y_final_test_pred = model.predict(X_test)

In [172]:
mean_squared_error(y_test, y_final_test_pred)

2.319021579428752

# The cross_validate function allows us to view multiple performance metrics from cross validation on a model and explore how much time fitting and testing took.

In [173]:
X = df.drop('sales', axis=1)
y = df['sales']

In [174]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [175]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [176]:
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [177]:
from sklearn.model_selection import cross_validate

In [178]:
model = Ridge(alpha=100)

We will pass a list for the scoring attribute

In [179]:
# 10-fold cross validation that reports two metrics at once (negated MSE and negated MAE).
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [181]:
scores # a dict that also holds the per-fold fit and score times, next to the requested metrics

{'fit_time': array([0.00144911, 0.0021708 , 0.00178552, 0.00123096, 0.00113559,
        0.00111485, 0.00112772, 0.00111556, 0.00114179, 0.00122762]),
 'score_time': array([0.00114751, 0.00112462, 0.00118041, 0.00093031, 0.00089216,
        0.00089073, 0.00088191, 0.00089622, 0.0008769 , 0.00115299]),
 'test_neg_mean_squared_error': array([ -6.06067062, -10.62703078,  -3.99342608,  -5.00949402,
         -9.14179955, -13.08625636,  -3.83940454,  -9.05878567,
         -9.05545685,  -5.77888211]),
 'test_neg_mean_absolute_error': array([-1.8102116 , -2.54195751, -1.46959386, -1.86276886, -2.52069737,
        -2.45999491, -1.45197069, -2.37739501, -2.44334397, -1.89979708])}

We pass the dictionary to a DataFrame so it is organized better

In [183]:
# One row per fold, one column per entry of the scores dict.
scoresDataFrame = pd.DataFrame(scores)

In [186]:
scoresDataFrame

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001449,0.001148,-6.060671,-1.810212
1,0.002171,0.001125,-10.627031,-2.541958
2,0.001786,0.00118,-3.993426,-1.469594
3,0.001231,0.00093,-5.009494,-1.862769
4,0.001136,0.000892,-9.1418,-2.520697
5,0.001115,0.000891,-13.086256,-2.459995
6,0.001128,0.000882,-3.839405,-1.451971
7,0.001116,0.000896,-9.058786,-2.377395
8,0.001142,0.000877,-9.055457,-2.443344
9,0.001228,0.001153,-5.778882,-1.899797


In [185]:
scoresDataFrame.mean()

fit_time                        0.001350
score_time                      0.000997
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [187]:
model = Ridge(alpha=1)

In [188]:
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [189]:
scores

{'fit_time': array([0.00164938, 0.00193405, 0.00197172, 0.00123572, 0.00178146,
        0.00112724, 0.0009582 , 0.00106382, 0.00080776, 0.00073028]),
 'score_time': array([0.00103736, 0.00169539, 0.0011127 , 0.00111365, 0.00155377,
        0.00085378, 0.00090337, 0.00071812, 0.00059891, 0.00057888]),
 'test_neg_mean_squared_error': array([-2.96250773, -3.05737833, -2.1737403 , -0.83303438, -3.46401792,
        -8.2326467 , -1.90586431, -2.76504844, -4.98950515, -2.84643818]),
 'test_neg_mean_absolute_error': array([-1.45717399, -1.5553078 , -1.23877012, -0.76893775, -1.43448944,
        -1.4943158 , -1.08136203, -1.25001123, -1.58097132, -1.22332553])}

In [191]:
scoresDataFrame = pd.DataFrame(scores)
scoresDataFrame

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001649,0.001037,-2.962508,-1.457174
1,0.001934,0.001695,-3.057378,-1.555308
2,0.001972,0.001113,-2.17374,-1.23877
3,0.001236,0.001114,-0.833034,-0.768938
4,0.001781,0.001554,-3.464018,-1.434489
5,0.001127,0.000854,-8.232647,-1.494316
6,0.000958,0.000903,-1.905864,-1.081362
7,0.001064,0.000718,-2.765048,-1.250011
8,0.000808,0.000599,-4.989505,-1.580971
9,0.00073,0.000579,-2.846438,-1.223326


In [192]:
scoresDataFrame.mean() # both error metrics are closer to zero than with alpha=100

fit_time                        0.001326
score_time                      0.001017
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [193]:
model.fit(X_train, y_train)

In [194]:
y_final_pred = model.predict(X_test)

In [195]:
mean_squared_error(y_test, y_final_pred)

2.319021579428752