In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler


In [26]:
# Load the M4 "Daily" train and test splits. Downstream cells treat each row
# of these frames as one series (ID first, observations after).
daily_train, daily_test = (
    pd.read_csv('../src/data/m4_forecasting/Daily-train.csv'),
    pd.read_csv('../src/data/m4_forecasting/Daily-test.csv'),
)

In [27]:
def clean_daily_series(row):
    """Turn one wide-format row into a float series on a daily index.

    Parameters
    ----------
    row : pd.Series
        A single row whose first entry is the series ID (column V1) and whose
        remaining entries are the observations, with NaN where the series is
        shorter than the frame is wide.

    Returns
    -------
    pd.Series
        The non-missing observations as floats, indexed by consecutive
        calendar days starting at 2000-01-01. The dates are placeholders
        (consistent across series), not real observation dates.
    """
    # Position 0 holds the ID; everything after it is data.
    observations = row.iloc[1:]

    # Note: dropna removes every NaN, not just the trailing padding.
    observations = observations.dropna().astype(float)

    # Synthetic daily calendar, one stamp per remaining observation.
    observations.index = pd.date_range(
        start="2000-01-01",
        periods=len(observations),
        freq="D",
    )
    return observations


In [28]:
def build_windows(ts, L, H):
    """
    Slice a series into overlapping (lookback, horizon) training pairs.

    ts: 1D array-like of observations (converted with np.asarray)
    L: lookback window (number of past values used as features)
    H: forecast horizon (number of future values used as targets)
    Returns: X [N,L], Y [N,H] with N = max(len(ts) - L - H + 1, 0)

    Row i of X is ts[i : i+L] and row i of Y is ts[i+L : i+L+H].
    When the series is too short to form a single pair, the results are
    still 2D, with shapes (0, L) and (0, H), so callers can rely on the
    documented shapes and on column indexing such as Y[:, h].
    """
    ts = np.asarray(ts)
    n_windows = max(len(ts) - L - H + 1, 0)

    # Column vector of window start offsets; broadcasting it against the
    # in-window offsets yields one row of positions per window.
    starts = np.arange(n_windows)[:, None]
    X = ts[starts + np.arange(L)]
    Y = ts[starts + L + np.arange(H)]
    return X, Y

In [None]:

def forecast_linear_regression(train_row, L=30, H=7):
    """Forecast the next ``H`` values of a series from its last ``L`` values.

    A direct multi-step strategy: every horizon step gets its own linear
    map from the same ``L`` lagged inputs. The steps are fitted together in
    one multi-target ``LinearRegression``; ordinary least squares treats
    each target column independently, so this matches fitting one model
    per step while doing the work once.

    Parameters
    ----------
    train_row : 1D array-like of float
        The training series (despite the name, the values themselves, not
        a raw DataFrame row).
    L : int
        Lookback window length.
    H : int
        Forecast horizon.

    Returns
    -------
    np.ndarray of shape (H,)
        Predictions for the ``H`` steps following the end of ``train_row``.

    Raises
    ------
    ValueError
        If the series is too short to build a single (L, H) training pair.
    """
    if H == 0:
        # Nothing to forecast; mirrors the empty result of a zero horizon.
        return np.zeros(0)

    series = np.asarray(train_row, dtype=float)

    # Build supervised dataset
    X, Y = build_windows(series, L=L, H=H)
    if len(X) == 0:
        raise ValueError(
            f"Series of length {len(series)} is too short for L={L}, H={H}: "
            f"at least {L + H} observations are required."
        )

    # One fit covers all horizon steps (Y has one column per step).
    lr = LinearRegression()
    lr.fit(X, Y)

    # Features for the forecast are the most recent L observations.
    last_window = series[-L:].reshape(1, -1)
    return np.asarray(lr.predict(last_window)[0], dtype=float)


In [None]:


def rmse(actual, predicted):
    """Return the root-mean-squared error between ``actual`` and ``predicted``."""
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

def evaluate_forecast(train_row, test_row, L=30, H=7):
    """Score a linear-regression forecast for one series with RMSE.

    Parameters
    ----------
    train_row, test_row : pd.Series
        Raw rows (ID followed by observations) for the same series from the
        train and test frames.
    L : int
        Lookback window passed to the forecaster.
    H : int
        Requested horizon; capped at the number of available test points.

    Returns
    -------
    float
        RMSE between the held-out values and the forecast, on the original
        (unscaled) value scale.
    """
    history = clean_daily_series(train_row)
    holdout = clean_daily_series(test_row)

    # Never forecast further than the test data reaches (M4 daily test
    # series have length 14).
    horizon = min(H, len(holdout))

    # Standardise using training statistics only; sklearn wants a 2D column.
    scaler = StandardScaler()
    history_column = history.to_numpy().reshape(-1, 1)
    history_scaled = scaler.fit_transform(history_column).ravel()

    forecast_scaled = np.atleast_1d(
        forecast_linear_regression(history_scaled, L=L, H=horizon)
    )

    # Map predictions back to the original units before scoring.
    forecast = scaler.inverse_transform(forecast_scaled.reshape(-1, 1)).ravel()
    observed = np.atleast_1d(holdout)

    return rmse(observed[:horizon], forecast[:horizon])


In [31]:

# Experiment settings.
L = 30           # lookback window (days of history used as features)
H = 14           # forecast horizon
NUM_SERIES = 50  # evaluate only the first NUM_SERIES rows

# Work on copies of the loaded frames (presumably to keep the originals
# untouched; nothing in this cell mutates them).
daily_train_copy, daily_test_copy = daily_train.copy(), daily_test.copy()

# One task per series; train and test rows are paired by position.
results = Parallel(n_jobs=-1, backend="loky", verbose=10)(
    delayed(evaluate_forecast)(
        daily_train_copy.iloc[series_idx],
        daily_test_copy.iloc[series_idx],
        L=L,
        H=H,
    )
    for series_idx in range(NUM_SERIES)
)

print("Linear Regression RMSE per series:", results)
print(f"mean rmse: {np.mean(results)}")
print(f"median rmse: {np.median(results)}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
python(7948) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7949) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7950) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7951) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7952) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7953) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7954) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7955) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: D

Linear Regression RMSE per series: [25.729277490223705, 39.66512662714902, 108.5097203562716, 338.40328757386027, 106.85890098193548, 105.9686222910864, 2142.7766401868585, 8.501928461077984, 8.420310296012614, 59.152042812678175, 26.182180311637946, 26.91558341749536, 293.910590630646, 201.62612166214254, 1134.5204845268627, 75.99066467265659, 35.86078004462152, 210.60383509284932, 132.89697291042395, 150.64435639759145, 151.85655067423474, 55.79149292687228, 30.383966427839677, 52.402141605970776, 25.057643256639448, 35.66456957077644, 23.70826673104035, 15.038589361747146, 260.5733103287635, 13.402385738139602, 115.50424294994562, 101.86087886898774, 127.04803160000935, 37.76371366823252, 131.00821222752293, 80.56467569750819, 97.67441355449897, 48.25294560904689, 613.2940772158627, 94.51311328957034, 374.16374839888124, 57.07359492418178, 31.107434871437718, 96.73124321345028, 51.22239842028944, 1038.896318347371, 78.51198905323753, 159.04319393688218, 166.38116462611774, 303.39550

[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:    7.4s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.5s finished
