In [84]:
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression



# Dataset

In [85]:
np.random.seed(42)
dates = pd.date_range(start='2020-01-01', end='2022-12-31', freq='D')
ts = pd.Series(np.cumsum(np.random.randn(len(dates))), index=dates)

X = pd.DataFrame(index=ts.index)

X['lag1'] = ts.shift(1)

y = ts.values


In [86]:
X = X.dropna()
y = y[1:]



In [87]:
def expanding_window_cv(X, y, min_train_size=365, step=30):
    n_samples = len(y)
    for train_end in range(min_train_size, n_samples, step):
        yield (np.arange(train_end), np.arange(train_end, min(train_end + step, n_samples)))


In [88]:

model = LinearRegression()
mse_scores_expanding = []



In [89]:
for train_index, test_index in expanding_window_cv(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores_expanding.append(mse)
    print(f"Train size: {len(train_index)}, Test size: {len(test_index)}, MSE: {mse:.4f}")

print(f"Average MSE (Expanding Window): {np.mean(mse_scores_expanding):.4f}\n")



Train size: 365, Test size: 30, MSE: 1.1989
Train size: 395, Test size: 30, MSE: 1.1166
Train size: 425, Test size: 30, MSE: 1.1142
Train size: 455, Test size: 30, MSE: 1.2669
Train size: 485, Test size: 30, MSE: 0.8449
Train size: 515, Test size: 30, MSE: 1.0600
Train size: 545, Test size: 30, MSE: 0.9159
Train size: 575, Test size: 30, MSE: 0.7631
Train size: 605, Test size: 30, MSE: 1.2057
Train size: 635, Test size: 30, MSE: 1.0344
Train size: 665, Test size: 30, MSE: 1.1476
Train size: 695, Test size: 30, MSE: 0.9542
Train size: 725, Test size: 30, MSE: 1.1205
Train size: 755, Test size: 30, MSE: 0.8632
Train size: 785, Test size: 30, MSE: 0.6038
Train size: 815, Test size: 30, MSE: 0.8527
Train size: 845, Test size: 30, MSE: 1.0954
Train size: 875, Test size: 30, MSE: 1.0324
Train size: 905, Test size: 30, MSE: 1.0761
Train size: 935, Test size: 30, MSE: 0.7883
Train size: 965, Test size: 30, MSE: 1.0758
Train size: 995, Test size: 30, MSE: 1.0684
Train size: 1025, Test size: 30,

In [90]:
tscv = TimeSeriesSplit(n_splits=5, test_size=30, gap=0)
mse_scores_sliding = []



In [91]:
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores_sliding.append(mse)
    print(f"Train size: {len(train_index)}, Test size: {len(test_index)}, MSE: {mse:.4f}")

print(f"Average MSE (Sliding Window): {np.mean(mse_scores_sliding):.4f}")

Train size: 945, Test size: 30, MSE: 0.8334
Train size: 975, Test size: 30, MSE: 0.9734
Train size: 1005, Test size: 30, MSE: 1.0004
Train size: 1035, Test size: 30, MSE: 1.1569
Train size: 1065, Test size: 30, MSE: 0.6404
Average MSE (Sliding Window): 0.9209
