In [98]:
import torch
from torch import nn
import numpy as np
import pandas as pd 

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed

from pathlib import Path
import importlib.util

# Load the NNKNN class straight from its source file instead of importing an
# installed package. The path is built relative to the current working
# directory, so this cell assumes the kernel was started one level below the
# directory that contains "src/" — TODO confirm against the repo layout.
module_path = Path.cwd().resolve() / ".." /"src" / "nnknn" / "nnknn.py"
# Build an import spec for that file, create the (still empty) module object,
# then execute the file's code inside it.
spec = importlib.util.spec_from_file_location("nnknn", str(module_path))
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
# Expose the model class under a short name for the rest of the notebook.
NNKNN = mod.NNKNN


In [99]:
# Read the daily train/test CSV files into DataFrames. Paths are relative to
# the current working directory. Downstream code treats each row as one series
# whose first column is an identifier and whose remaining columns are the
# observations (see clean_daily_series).
daily_train = pd.read_csv('../src/data/m4_forecasting/Daily-train.csv')
daily_test = pd.read_csv('../src/data/m4_forecasting/Daily-test.csv')

In [100]:

def rmse(actual, predicted):
    """Return the root-mean-squared error between two sets of values.

    Parameters
    ----------
    actual : array-like
        Ground-truth values, passed unchanged to ``mean_squared_error``.
    predicted : array-like
        Predicted values, passed unchanged to ``mean_squared_error``.

    Returns
    -------
    The square root of ``mean_squared_error(actual, predicted)``.
    """
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

In [101]:
def clean_daily_series(row):
    """Turn one raw dataset row into a float series with a daily index.

    Parameters
    ----------
    row : pandas.Series
        One row of the raw table; its first element is skipped (the original
        notebook treats it as the series ID in column V1).

    Returns
    -------
    pandas.Series
        The remaining non-missing values cast to float, indexed by consecutive
        calendar days starting at 2000-01-01. The dates are placeholders that
        only give the series a regular index.
    """
    # Skip the identifier, discard every missing entry (rows are padded with
    # NaN because the series have uneven lengths), and force a float dtype.
    values = row.iloc[1:].dropna().astype(float)

    # Synthetic but consistent daily timestamps, one per remaining value.
    values.index = pd.date_range(start="2000-01-01", periods=len(values), freq="D")
    return values


In [102]:
def build_windows(ts, L, H):
    """Slice a series into overlapping (input window, target window) pairs.

    Window ``i`` uses ``ts[i : i + L]`` as input and the following
    ``ts[i + L : i + L + H]`` as target, for every ``i`` where both slices fit
    entirely inside the series.

    Parameters
    ----------
    ts : array-like
        The series values (anything ``np.asarray`` can convert to float32).
    L : int
        Length of each input window.
    H : int
        Length of each target window (forecast horizon).

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        ``X`` with shape ``(n, L)`` and ``Y`` with shape ``(n, H)``, both
        float32, where ``n = len(ts) - L - H + 1``. When the series is too
        short for even one pair, correctly shaped empty tensors ``(0, L)`` and
        ``(0, H)`` are returned so callers can still rely on ``X.shape``.
    """
    values = np.asarray(ts, dtype=np.float32)
    n_windows = len(values) - L - H + 1

    if n_windows <= 0:
        # Series shorter than L + H: no complete window exists.
        return (
            torch.empty((0, L), dtype=torch.float32),
            torch.empty((0, H), dtype=torch.float32),
        )

    # One vectorised gather instead of a Python loop plus torch.tensor() on a
    # list of array slices (which PyTorch converts element by element).
    # starts has shape (n, 1); adding an offset row broadcasts to (n, L)/(n, H).
    starts = np.arange(n_windows)[:, None]
    X = values[starts + np.arange(L)]
    Y = values[starts + L + np.arange(H)]

    # Fancy indexing produced fresh arrays, so the tensors do not alias `ts`.
    return torch.from_numpy(X), torch.from_numpy(Y)

In [109]:
def train_nnknn(X_cases, Y_targets, L, epochs=100, lr=0.01, shared_weights=False):
    """Fit an NNKNN model on a case base using full-batch Adam and MSE loss.

    Parameters
    ----------
    X_cases : torch.Tensor
        Case inputs; ``X_cases.shape[0]`` is used as the number of cases.
    Y_targets : torch.Tensor
        Targets associated with the cases.
    L : int
        Passed to ``NNKNN`` as ``num_features``.
    epochs : int
        Number of optimisation steps (one per pass over the whole case base).
    lr : float
        Adam learning rate.
    shared_weights : bool
        Forwarded unchanged to the ``NNKNN`` constructor.

    Returns
    -------
    The trained ``NNKNN`` instance.
    """
    net = NNKNN(num_features=L, num_cases=X_cases.shape[0], shared_weights=shared_weights)
    loss_fn = nn.MSELoss()
    adam = torch.optim.Adam(net.parameters(), lr=lr)

    for _ in range(epochs):
        adam.zero_grad()
        # Every stored case also serves as a training query. Only the first
        # of the three values returned by the model is used here.
        predictions, _, _ = net(X_cases, X_cases, Y_targets)
        loss_fn(predictions, Y_targets).backward()
        adam.step()

    return net

def forecast_nnknn(train_row, L=30, H=7, epochs=100, shared_weights=False):
    """Train an NNKNN on one series and forecast from its most recent window.

    Parameters
    ----------
    train_row : sequence of float
        The training series values; must support slicing with ``[-L:]``.
    L : int
        Input window length.
    H : int
        Target window length used when building the training cases.
    epochs : int
        Number of training steps passed to ``train_nnknn``.
    shared_weights : bool
        Forwarded unchanged to ``train_nnknn``.

    Returns
    -------
    numpy.ndarray
        The model's prediction for the last ``L`` observations, with the
        leading batch dimension squeezed out (the original notebook annotates
        this as shape ``[H]``).
    """
    # Case base: every (window, following values) pair in the series.
    case_windows, case_targets = build_windows(train_row, L=L, H=H)

    fitted = train_nnknn(
        case_windows,
        case_targets,
        L=L,
        epochs=epochs,
        shared_weights=shared_weights,
    )

    # The query is the final L observations, shaped as a batch of one.
    latest = torch.tensor(train_row[-L:], dtype=torch.float32)
    query = latest.unsqueeze(0)

    # Inference only: gradients are not needed.
    with torch.no_grad():
        forecast, _, _ = fitted(query, case_windows, case_targets)

    return forecast.squeeze(0).cpu().numpy()


In [110]:
def evaluate_nnknn(train_row, test_row, L=30, H=7):
    """Forecast one series with NNKNN and score it against its test values.

    Parameters
    ----------
    train_row : pandas.Series
        Raw training row (identifier followed by observations).
    test_row : pandas.Series
        Raw test row for the same series, in the same layout.
    L : int
        Input window length.
    H : int
        Requested forecast horizon; capped at the number of test observations.

    Returns
    -------
    RMSE between the first ``H`` test values and the first ``H`` forecast
    values, computed on the original (unscaled) data.
    """
    history = clean_daily_series(train_row)
    actual = clean_daily_series(test_row)

    # Never forecast further than the test series reaches (the original
    # notebook notes that daily test series have length 14).
    horizon = min(H, len(actual))

    # Standardise using statistics from the training series only.
    scaler = StandardScaler()
    history_column = history.to_numpy().reshape(-1, 1)
    history_scaled = scaler.fit_transform(history_column).flatten()

    forecast_scaled = np.atleast_1d(forecast_nnknn(history_scaled, L=L, H=horizon))

    # Map the forecast back to the original units before scoring.
    forecast = scaler.inverse_transform(forecast_scaled.reshape(-1, 1)).flatten()

    observed = np.atleast_1d(actual)
    return rmse(observed[:H], forecast[:H])


In [111]:

# Experiment configuration.
L = 30 # input window length: number of past observations per case
HORIZON = 7 # forecast horizon: number of future steps predicted per window
NUM_SERIES = 50 # rows 0 .. NUM_SERIES-1 of the train/test tables are evaluated

# Work on copies of the frames loaded earlier in the notebook.
daily_train_copy = daily_train.copy()
daily_test_copy = daily_test.copy()

# Evaluate each series as an independent joblib task (n_jobs=-1 requests all
# available workers). Row i of the training table is paired with row i of the
# test table — assumes both files list the series in the same order; TODO confirm.
# Each task returns one RMSE, so `results` is a list of NUM_SERIES floats.
results = Parallel(n_jobs=-1, backend="loky", verbose=10)(
        delayed(evaluate_nnknn)(daily_train_copy.iloc[i], daily_test_copy.iloc[i], L=L, H=HORIZON)
        for i in range(NUM_SERIES)
    )
# Report the per-series scores and two summary statistics across series.
print("RMSE for each series:", results)
print(f"mean rmse: {np.mean(results)}")
print(f"median rmse: {np.median(results)}")



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
python(2809) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2810) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2811) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2812) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  7.9min
python(4569) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(4596) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(4631) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.9min

RMSE for each series: [179.24913367699904, 25.737294936522726, 43.08965779396816, 338.36090011877695, 243.31954989882482, 677.4823387413791, 3299.9217942391574, 206.99113955110872, 176.24047598142118, 226.39961618328667, 361.9621793896055, 150.60506899444587, 862.64424959999, 1030.7051717659206, 3637.330806320032, 540.1971446847008, 227.59266164555692, 161.0396977191821, 611.9564082030961, 59.386376813806045, 354.12085844841204, 272.86921265292915, 23.124320303289917, 241.19963591163292, 20.533938496610425, 288.35957580844035, 38.730946847217126, 249.70870526800263, 1590.9459039835447, 266.6457338883834, 144.03429193946755, 184.10436833043667, 114.06477154669855, 562.1095184420647, 259.43247983360243, 9.254382384109153, 68.54135123964458, 207.34870670641, 200.10850619918415, 118.47793576494553, 158.86493933934807, 79.72646321539472, 11.920604528188703, 51.10587281165379, 12.534860192598961, 115.75153780830493, 49.29723682611074, 265.9350296781802, 407.07902176208916, 413.64068054116177]

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 11.5min finished
