<a href="https://colab.research.google.com/github/ThierrryScotto/cross-validation/blob/main/cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
# pandas takes its conventional `pd` alias; aliasing it to `np` would shadow numpy.
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

## Rolling Forecast Origin

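The following code and output give a simplified view of how a rolling forecast origin works in practice: the training set grows by one observation per fold, and the validation window moves forward with it.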

In [None]:
def rolling_forecast_origin(train, min_train_size, horizon):
  '''
  Rolling forecast origin generator.

  Every fold trains on all observations before the forecast origin and
  validates on the `horizon` observations that start at the origin. The
  origin begins at `min_train_size` and moves forward one step per fold,
  so the training slice grows by one element each time.

  Parameters:
    train: sliceable sequence of observations in time order.
    min_train_size: number of observations in the first training slice.
    horizon: number of observations in each validation slice.

  Yields:
    (split_train, split_val) tuples of slices of `train`.
  '''
  # The last usable origin still leaves `horizon` observations after it.
  stop = len(train) - horizon + 1
  for origin in range(min_train_size, stop):
    yield train[:origin], train[origin:origin + horizon]

In [None]:
# Toy series in time order; the final two observations are held out as the test set.
full_series = [2502, 2414, 2800, 2143, 2708, 1900, 2333, 2222, 1234, 3456]

train, test = full_series[:-2], full_series[-2:]

print("Full training set: {0}".format(train))
print("Full test set: {0}".format(test))

Full training set: [2502, 2414, 2800, 2143, 2708, 1900, 2333, 2222]
Full test set: [1234, 3456]


In [None]:
# Expanding-window folds: start from 4 training points and validate on the next 1.
cv_rolling = rolling_forecast_origin(train, min_train_size=4, horizon=1)

# enumerate(..., start=1) numbers the folds, so no hand-maintained counter is needed.
for i, (cv_train, cv_val) in enumerate(cv_rolling, start=1):
  print(f'CV[{i}]')
  print(f'Train:\t{cv_train}')
  print(f'Val:\t{cv_val}')
  print('----------')

CV[1]
Train:	[2502, 2414, 2800, 2143]
Val:	[2708]
----------
CV[2]
Train:	[2502, 2414, 2800, 2143, 2708]
Val:	[1900]
----------
CV[3]
Train:	[2502, 2414, 2800, 2143, 2708, 1900]
Val:	[2333]
----------
CV[4]
Train:	[2502, 2414, 2800, 2143, 2708, 1900, 2333]
Val:	[2222]
----------


## Sliding Window Cross Validation

In [None]:
def sliding_window(train, window_size, horizon):
  '''
  Sliding window generator.

  Every fold trains on `window_size` consecutive observations and validates
  on the `horizon` observations that immediately follow them. The window
  start moves forward one step per fold, so the training slice keeps a
  constant length.

  Parameters:
    train: sliceable sequence of observations in time order.
    window_size: number of observations in each training slice.
    horizon: number of observations in each validation slice.

  Yields:
    (split_train, split_val) tuples of slices of `train`.
  '''
  n_windows = len(train) - window_size - horizon + 1
  for start in range(n_windows):
    # The validation slice begins exactly where the training window ends.
    window_end = start + window_size
    yield train[start:window_end], train[window_end:window_end + horizon]

In [None]:
# Fixed-length folds: 4 training points each, validated on the next 1.
cv_sliding = sliding_window(train, window_size=4, horizon=1)

print('full training set {0}\n'.format(train))

# Fold numbers come from enumerate, counted from 1 to match the printed labels.
for i, (cv_train, cv_val) in enumerate(cv_sliding, start=1):
    print(f'CV[{i}]')
    print(f'Train:\t{cv_train}')
    print(f'Val:\t{cv_val}')
    print('----------')

full training set [2502, 2414, 2800, 2143, 2708, 1900, 2333, 2222]

CV[1]
Train:	[2502, 2414, 2800, 2143]
Val:	[2708]
----------
CV[2]
Train:	[2414, 2800, 2143, 2708]
Val:	[1900]
----------
CV[3]
Train:	[2800, 2143, 2708, 1900]
Val:	[2333]
----------
CV[4]
Train:	[2143, 2708, 1900, 2333]
Val:	[2222]
----------


## TimeSeriesSplit

In [1]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Six samples with two features each; the rows alternate between [1, 2] and [3, 4].
X = np.array([[1, 2], [3, 4]] * 3)
# One label per sample: 1 through 6.
y = np.arange(1, 7)

# Built with no arguments; printing the splitter shows the parameters in effect.
tscv = TimeSeriesSplit()
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)


In [3]:
X

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [4]:
# Print the indices of every fold; once the loop ends, the *_train / *_test
# variables hold the data of the last fold.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]


In [5]:
X_train

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4],
       [1, 2]])

In [6]:
X_test

array([[3, 4]])

In [8]:
# 12 random samples (2 features each) with binary labels, used by the
# fixed test_size=2 splits that follow.
# Seed the global NumPy RNG so a fresh kernel reproduces the same X and y.
np.random.seed(42)
X = np.random.randn(12, 2)
y = np.random.randint(0, 2, 12)

In [9]:
X

array([[ 0.38054118,  1.58639799],
       [-0.18590741, -0.69542485],
       [-0.60448261,  1.37777955],
       [-1.20524204,  0.16336728],
       [ 0.74271769, -0.6220279 ],
       [-1.77683981, -0.253243  ],
       [ 0.94565524,  0.42989277],
       [-0.67322479, -0.14418229],
       [ 0.28660759,  1.31462322],
       [ 0.13478586, -0.0146635 ],
       [ 0.43824536, -1.89303326],
       [-1.40537923,  1.20809026]])

In [10]:
y

array([0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1])

In [11]:
# Three splits, each with a fixed test size of 2 samples.
tscv = TimeSeriesSplit(n_splits=3, test_size=2)

# After the loop, the *_train / *_test variables hold the last fold.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

TRAIN: [0 1 2 3 4 5] TEST: [6 7]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]


In [12]:
# Add in a 2 period gap
tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2)

for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [0 1 2 3] TEST: [6 7]
TRAIN: [0 1 2 3 4 5] TEST: [8 9]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [10 11]
