In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed


In [7]:
# Load the train and test CSVs as a pair; both paths are relative to the
# notebook's working directory.
daily_train, daily_test = map(
    pd.read_csv,
    (
        '../src/data/m4_forecasting/Daily-train.csv',
        '../src/data/m4_forecasting/Daily-test.csv',
    ),
)

In [8]:
def clean_daily_series(row):
    """Turn one wide-format row into a float series with a daily index.

    Parameters
    ----------
    row : pandas.Series
        A single row of the wide table. The first element is skipped
        (the original author notes it is the series ID in column V1).

    Returns
    -------
    pandas.Series
        The remaining values cast to float, with every missing value removed
        and a synthetic daily DatetimeIndex starting at 2000-01-01.

    Notes
    -----
    `dropna()` removes *all* missing values, not only trailing padding, so an
    interior gap would silently shorten the series.
    """
    # Skip the leading ID cell, discard missing entries, and coerce to float.
    values = row.iloc[1:].dropna().astype(float)

    # The dates are placeholders: consistent spacing, not real calendar dates.
    values.index = pd.date_range(start="2000-01-01", periods=len(values), freq="D")
    return values


In [None]:
def build_windows(ts, L, H):
    """
    Slice a series into overlapping (lookback, horizon) training pairs.

    ts: 1D numpy array (any array-like is accepted and converted)
    L: lookback window
    H: forecast horizon
    Returns: X [N,L], Y [N,H] where N = len(ts) - L - H + 1.
             When the series is too short for a single window, N is 0 and
             the arrays are still 2D, with shapes (0, L) and (0, H).
    Raises: ValueError if L or H is negative.
    """
    if L < 0 or H < 0:
        raise ValueError("L and H must be non-negative")

    series = np.asarray(ts)
    n_windows = len(series) - L - H + 1
    if n_windows <= 0:
        # Keep the documented 2D shapes so callers can rely on X.shape[1] == L.
        return (
            np.empty((0, L), dtype=series.dtype),
            np.empty((0, H), dtype=series.dtype),
        )

    # Row i of each index matrix lists the positions of window i; broadcasting
    # a column of start offsets against a row of in-window offsets builds all
    # windows at once instead of appending slices in a Python loop.
    starts = np.arange(n_windows)[:, None]
    X = series[starts + np.arange(L)]
    Y = series[starts + L + np.arange(H)]
    return X, Y

In [None]:

def forecast_linear_regression(train_row, L=30, H=7):
    """Forecast the next H values of a series with a direct linear model.

    Parameters
    ----------
    train_row : pandas.Series
        Training observations; missing values are dropped before fitting.
    L : int, default 30
        Number of most recent observations used as regression features.
    H : int, default 7
        Number of future steps to predict.

    Returns
    -------
    numpy.ndarray of shape (H,), or None
        One prediction per horizon step, or None when the cleaned series has
        fewer than L + H observations and no training window can be built.
    """
    ts = train_row.dropna().astype(float).values
    if len(ts) < L + H:
        return None
    if H == 0:
        # Nothing to predict; preserve the empty-array result.
        return np.zeros(0)

    # Build supervised dataset
    X, Y = build_windows(ts, L=L, H=H)

    # LinearRegression accepts a 2D target, so a single fit estimates one
    # independent set of coefficients per horizon step. This replaces H
    # separate fits on the identical design matrix.
    model = LinearRegression()
    model.fit(X, Y)

    # Predict from the most recent L observations; predict() returns (1, H).
    last_window = ts[-L:].reshape(1, -1)
    return model.predict(last_window)[0]


In [11]:
def rmse(actual, predicted):
    """Return the root-mean-squared error between `actual` and `predicted`.

    Both arguments are passed straight to `mean_squared_error`; the square
    root of its result is returned.
    """
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

def evaluate_forecast(train_row, test_row, L=30, H=7):
    """Fit on one training row, forecast, and score against its test row.

    Parameters
    ----------
    train_row, test_row : pandas.Series
        Matching rows of the wide train/test tables (ID cell first).
    L : int, default 30
        Lookback window passed to the forecaster.
    H : int, default 7
        Forecast horizon passed to the forecaster.

    Returns
    -------
    float
        RMSE over the first min(H, len(test)) steps. NaN is returned when the
        series cannot be scored: the training series is too short for the
        forecaster (it returns None), or there are no test observations.
        Aggregate with NaN-aware statistics if such series may be present.
    """
    train_ts = clean_daily_series(train_row)
    test_ts = clean_daily_series(test_row)

    y_pred = forecast_linear_regression(train_ts, L=L, H=H)
    if y_pred is None:
        # Too little history to build a single training window.
        return np.nan

    # Score only the steps that exist on both sides; a test series shorter
    # than H would otherwise make the metric fail on mismatched lengths.
    n_eval = min(H, len(test_ts), len(y_pred))
    if n_eval == 0:
        return np.nan

    y_true = test_ts.to_numpy()[:n_eval]
    return rmse(y_true, y_pred[:n_eval])


In [None]:

# Experiment settings: lookback length, forecast horizon, and how many
# series (rows) to evaluate.
L = 30
H = 7
NUM_SERIES = 50

# Work on copies so the frames loaded from CSV are left untouched.
daily_test_copy = daily_test.copy()
daily_train_copy = daily_train.copy()

# One task per series. Row k of the train frame is scored against row k of
# the test frame (assumes both files list the series in the same order;
# TODO confirm).
results = Parallel(n_jobs=-1, backend="loky", verbose=10)(
    delayed(evaluate_forecast)(
        daily_train_copy.iloc[row_idx],
        daily_test_copy.iloc[row_idx],
        L=L,
        H=H,
    )
    for row_idx in range(NUM_SERIES)
)

# Per-series scores followed by two summary statistics.
print("Linear Regression RMSE per series:", results)
print(f"mean rmse: {np.mean(results)}")
print(f"median rmse: {np.median(results)}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.11001992225646973s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.2s


Linear Regression RMSE per series: [22.96848995321024, 20.115484308032418, 65.37137745238108, 304.3491118726209, 41.6784504461823, 62.57857200561561, 2599.1228317704335, 7.4679556529014945, 7.302353589431489, 44.93125277023629, 25.423959564685727, 15.839756256140037, 140.31763526841286, 114.47298720742813, 233.56917560747257, 66.87428418365555, 43.77472467735104, 200.76589525445928, 98.5829061010088, 74.86224434653404, 110.24702442703659, 59.240487376487295, 30.152439375830276, 50.83214166543085, 21.795766325293844, 38.56327137197271, 26.689095852675038, 12.835127210034539, 264.60757073276113, 13.040148781919282, 77.4008489911567, 68.77013300302698, 89.90322381914528, 16.20874995125817, 86.930319417938, 42.692892710628385, 113.13680403236852, 62.06380851489135, 261.9832568542338, 77.80546998012937, 506.6935296350251, 178.48981996502727, 8.073216472442743, 41.252039540553916, 39.62028036824387, 182.39755349778164, 51.68411611426886, 157.3503942865973, 210.94243812934687, 163.96506471826

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.9s finished
