# Resampling
This notebook explores resampling methods for estimating how well a model generalises. It uses the Boston house-price dataset built into scikit-learn.

## Imports

In [2]:
# Core libraries
import pandas as pd

# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# Sklearn regression algorithms
from sklearn.linear_model import LinearRegression

# Sklearn regression model evaluation functions
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

## Load data, split into X and y and scale data

In [3]:
# Fetch the Boston housing sample data bundled with scikit-learn
# NOTE(review): load_boston may be unavailable in recent scikit-learn
# releases — confirm the installed version still provides it.
from sklearn.datasets import load_boston
boston = load_boston()

# Separate the input features (X) from the target values (y)
X, y = boston.data, boston.target

# Rescale every input feature into the [0, 1] interval
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

## Resample using train / test split method

In [4]:
# Hold out 33% of the samples for testing; the fixed seed makes the split repeatable
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

# Build the regressor and fit it on the training portion in one step
model = LinearRegression().fit(X_train, y_train)

# Score the held-out predictions with R^2
predictions = model.predict(X_test)
r2_score(y_true=y_test, y_pred=predictions)

0.6663089606572572

## Resample using k-fold cross-validation method

In [5]:
# Shuffled 5-fold splitter; the fixed seed keeps fold membership repeatable
seed = 7
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)

# Fresh, unfitted estimator handed to the cross-validation routine
model = LinearRegression()

# Collect one R^2 score per fold, then summarise them
results = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=kfold)
print(results)
print("Mean:", results.mean())
print("Std:", results.std())

[0.57854155 0.7778757  0.65202483 0.72660245 0.81452587]
Mean: 0.7099140812635653
Std: 0.08538245536223632


### Finalise model

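Note that the model above is still unfitted after cross_val_score() returns, because cross_val_score() fits a separate clone of the estimator for each fold rather than the object passed in.  Inspecting the model therefore produces an error: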

In [6]:
# Reading a fitted attribute from the still-unfitted estimator fails.
# The error is the point of this cell, so catch it and display it instead of
# letting it abort a "Restart & Run All" of the notebook.
try:
    coefficients = model.coef_
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the estimator was fitted beforehand (e.g. cells run out of order)
    print(coefficients)

AttributeError: 'LinearRegression' object has no attribute 'coef_'

If we want to proceed to build our final model we can fit it using all the data:

In [7]:
# Train the final estimator on the complete dataset (no hold-out portion)
model.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

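Get the cross-validation predictions, i.e. for each sample the prediction made by a model trained on the folds that do not contain that sample: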

In [8]:
# Out-of-fold prediction for every sample, using the existing kfold splitter
cross_val_predict(estimator=model, X=X, y=y, cv=kfold)

array([29.78598216, 25.1101617 , 31.25366363, 28.57082895, 27.61096828,
       25.67980513, 22.96665196, 18.95836224, 11.24148715, 18.31703683,
       18.85408884, 20.97975714, 20.77125804, 19.19222285, 19.3503863 ,
       19.29929357, 20.34544005, 17.02296496, 16.75073234, 18.4097657 ,
       12.34945917, 17.64296287, 16.65787755, 13.73879305, 15.25287301,
       13.47218997, 15.43116443, 14.44253264, 19.45105871, 20.78411284,
       12.17069386, 17.93815075, 10.54355651, 14.28059453, 13.59803026,
       24.11760088, 22.53456569, 23.37391825, 22.73044931, 31.68958811,
       34.4939606 , 27.59319317, 25.12100859, 24.30023082, 22.90209276,
       22.1668571 , 20.35159727, 17.9758763 ,  8.88262872, 17.2333787 ,
       21.21235318, 23.6486951 , 27.84572867, 23.96330998, 15.19682414,
       30.73437068, 24.28935509, 33.4659491 , 21.68413942, 21.19225355,
       17.48971422, 17.49219924, 24.10484188, 22.79549347, 22.22201393,
       29.15015547, 26.16144135, 21.18431783, 17.50796887, 20.61

### Alternative evaluation metrics

Get a list of alternative evaluation metrics that can be used in the call to cross_val_score():

In [9]:
# List every scoring name accepted by the `scoring=` argument.
# Newer scikit-learn exposes these through get_scorer_names(); the SCORERS
# dictionary is the older spelling and is not available in recent releases,
# so try the new API first and fall back for older installations.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = sorted(get_scorer_names())
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = sorted(SCORERS.keys())

scorer_names

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']