In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed


In [7]:
# Load the daily train and test tables; both paths are relative to the
# notebook's working directory.
daily_train = pd.read_csv(
    filepath_or_buffer='../src/data/m4_forecasting/Daily-train.csv',
)
daily_test = pd.read_csv(
    filepath_or_buffer='../src/data/m4_forecasting/Daily-test.csv',
)

In [8]:
def clean_daily_series(row):
    """
    Convert one raw row into a float series on a synthetic daily index.

    row: pandas Series whose first element (the ID in column V1) is skipped
    Returns: Series holding the remaining non-NaN values cast to float,
        indexed by consecutive calendar days starting at 2000-01-01.
    """
    # Skip the leading ID, then remove NaN entries (series have uneven
    # lengths). Note that dropna() removes every NaN, wherever it occurs.
    values = row.iloc[1:].dropna().astype(float)

    # The dates are placeholders: they only give each series a consistent
    # daily index and carry no real calendar information.
    n_obs = len(values)
    values.index = pd.date_range(start="2000-01-01", periods=n_obs, freq="D")

    return values


In [None]:
def build_windows(ts, L, H):
    """
    Slice a series into overlapping (lookback, horizon) training pairs.

    ts: 1D numpy array (any 1D sequence is accepted and converted)
    L: lookback window
    H: forecast horizon
    Returns: X [N,L], Y [N,H] with N = max(len(ts) - L - H + 1, 0).
        Row i of X is ts[i:i+L] and row i of Y is ts[i+L:i+L+H].
        When the series is too short for a single window, the arrays are
        empty but still two-dimensional: shapes (0, L) and (0, H).
    """
    ts = np.asarray(ts)

    # Clamp at zero so a too-short series yields empty, correctly shaped output
    n_windows = max(len(ts) - L - H + 1, 0)

    # Column vector of window start positions; broadcasting it against the
    # within-window offsets produces the full index grid in one step.
    starts = np.arange(n_windows)[:, None]
    X = ts[starts + np.arange(L)]
    Y = ts[starts + L + np.arange(H)]
    return X, Y

In [None]:

def forecast_linear_regression(train_row, L=30, H=7):
    """
    Forecast the next H values of a series with a direct multi-step linear model.

    train_row: pandas Series of observations; NaNs are dropped and the rest
        cast to float before fitting
    L: lookback window (number of lagged values used as features)
    H: forecast horizon (number of future steps predicted)
    Returns: numpy array of H predictions, or None when the cleaned series
        has fewer than L + H points (not enough for one training window).
    """
    ts = train_row.dropna().astype(float).values
    if len(ts) < L + H:
        return None

    # Build supervised dataset: X is [N, L], Y is [N, H]
    X, Y = build_windows(ts, L=L, H=H)

    # LinearRegression accepts a 2D target and solves an independent
    # least-squares problem per column, so one fit replaces H separate fits
    # on the same design matrix.
    model = LinearRegression()
    model.fit(X, Y)

    # The most recent L observations are the features for the forecast
    last_window = ts[-L:].reshape(1, -1)
    return model.predict(last_window)[0]


In [14]:
def rmse(actual, predicted):
    """Return the root-mean-squared error between `actual` and `predicted`."""
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

def evaluate_forecast(train_row, test_row, L=30, H=7):
    """
    Score the linear-regression forecast for one series with RMSE.

    train_row, test_row: raw rows; each is passed through clean_daily_series
        before use
    L: lookback window forwarded to forecast_linear_regression
    H: forecast horizon; at most H steps are compared
    Returns: RMSE over the compared steps, or NaN when the series cannot be
        scored (no forecast was produced, or there are no test values).
        Aggregate with NaN-aware functions if such series may occur.
    """
    train_ts = clean_daily_series(train_row)
    test_ts = clean_daily_series(test_row)

    y_pred = forecast_linear_regression(train_ts, L=L, H=H)
    if y_pred is None:
        # The forecaster returns None for a too-short series. Without this
        # guard, np.atleast_1d(None) yields an object array and rmse fails.
        return np.nan

    y_pred = np.atleast_1d(y_pred)
    y_true = np.atleast_1d(test_ts)

    # Compare only the steps present in both arrays, so a test series shorter
    # than H is scored on what exists instead of raising a length mismatch.
    n_steps = min(H, len(y_true), len(y_pred))
    if n_steps == 0:
        return np.nan
    return rmse(y_true[:n_steps], y_pred[:n_steps])


In [17]:

# Experiment settings: forecast horizon, lookback window, and how many
# series (taken from the top of the tables) to evaluate.
H = 14
L = 30
NUM_SERIES = 50

# Work on copies so the loaded frames are left untouched.
daily_train_copy, daily_test_copy = daily_train.copy(), daily_test.copy()

# Evaluate each series in a separate joblib task; train and test rows are
# paired by position.
results = Parallel(n_jobs=-1, backend="loky", verbose=10)(
    delayed(evaluate_forecast)(
        daily_train_copy.iloc[row_idx],
        daily_test_copy.iloc[row_idx],
        L=L,
        H=H,
    )
    for row_idx in range(NUM_SERIES)
)

print("Linear Regression RMSE per series:", results)
print(f"mean rmse: {np.mean(results)}")
print(f"median rmse: {np.median(results)}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.08603405952453613s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.6s


Linear Regression RMSE per series: [25.729277490224458, 39.66512662714831, 108.50972035627173, 338.40328757386027, 106.8589009819357, 105.9686222910848, 2142.776640186857, 8.501928461076279, 8.420310296011662, 59.15204281267744, 26.182180311636554, 26.915583417493554, 293.9105906306403, 201.6261216621385, 1134.520484526862, 75.99066467265075, 35.860780044622246, 210.60383509284355, 132.89697291042128, 150.6443563975968, 151.85655067423386, 55.791492926873175, 30.383966427839788, 52.402141605972396, 25.057643256639384, 35.664569570777736, 23.708266731040485, 15.038589361748189, 260.5733103287645, 13.402385738140893, 115.5042429499441, 101.86087886898687, 127.04803160000992, 37.76371366823188, 131.00821222752552, 80.56467569750606, 97.6744135544985, 48.25294560904691, 613.2940772158629, 94.51311328957037, 374.16374839887993, 57.07359492418262, 31.10743487143791, 96.73124321345067, 51.222398420289586, 1038.8963183473752, 78.51198905323785, 159.04319393688152, 166.38116462611748, 303.39550

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.5s finished
