In [None]:
import pandas as pd
import numpy as np
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed

In [21]:
# Load the daily train/test CSVs (the directory name suggests the M4 forecasting
# dataset -- confirm provenance). The paths are relative, so they resolve against
# the notebook's current working directory.
daily_train = pd.read_csv('../src/data/m4_forecasting/Daily-train.csv')
daily_test = pd.read_csv('../src/data/m4_forecasting/Daily-test.csv')

In [51]:
def clean_daily_series(row):
    """Turn one raw data-frame row into a float-valued daily time series.

    The first entry of ``row`` (the series ID) is discarded, every missing
    value in the remainder is removed, and what is left is cast to float.
    The result is re-indexed with consecutive calendar days beginning at
    2000-01-01; these dates are placeholders that only give the series a
    regular daily frequency.

    Parameters
    ----------
    row : pd.Series
        A single row whose first position holds the ID and whose remaining
        positions hold the observations (rows of different lengths are
        padded with NaN).

    Returns
    -------
    pd.Series
        The observations as floats, indexed by a daily ``DatetimeIndex``.
    """
    # Skip the ID, discard the NaN padding, and make the values numeric.
    observations = row.iloc[1:].dropna().astype(float)

    # Placeholder calendar: one day per remaining observation.
    observations.index = pd.date_range(
        start="2000-01-01", periods=len(observations), freq="D"
    )
    return observations


In [None]:
def forecast_daily_series(row, M=7, H=7):
    """Fit a seasonal auto-ARIMA model to one series and forecast ahead.

    Parameters
    ----------
    row :
        The training series, passed unchanged to ``auto_arima``.
    M : int, default 7
        Seasonal period handed to ``auto_arima`` as ``m``; 7 models a weekly
        cycle in daily data.
    H : int, default 7
        Forecast horizon: the number of future periods to predict
        (7 means seven days ahead).

    Returns
    -------
    list
        The predicted values for the next ``H`` periods.
    """
    # Search settings: seasonal model with period M; errors raised while
    # fitting and warnings are not surfaced.
    search_options = {
        "seasonal": True,
        "m": M,
        "error_action": 'ignore',
        "suppress_warnings": True,
    }
    fitted = auto_arima(row, **search_options)

    return fitted.predict(n_periods=H).tolist()


In [None]:

# functions for evaluation

def rmse(actual, predicted):
    """Return the root-mean-squared error between ``actual`` and ``predicted``.

    Both arguments are forwarded as-is to scikit-learn's
    ``mean_squared_error``; the square root of its result is returned.
    """
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

def evaluate_daily_series(train_row, test_row, horizon):
    """Score one series: fit on its training row and return the forecast RMSE.

    Parameters
    ----------
    train_row : pd.Series
        Raw row from the training frame (ID first, then the observations).
    test_row : pd.Series
        Raw row from the test frame with the same layout.
    horizon : int
        Number of steps to forecast; the forecast is compared against the
        first ``horizon`` observations of the cleaned test series.

    Returns
    -------
    float
        RMSE between the first ``horizon`` test observations and the forecast.

    Raises
    ------
    ValueError
        If ``horizon`` is not positive or exceeds the number of observations
        available in the cleaned test series.
    """
    # Clean data
    train_ts = clean_daily_series(train_row)
    test_ts = clean_daily_series(test_row)

    # Fail fast: an out-of-range horizon would otherwise only surface as a
    # length-mismatch / empty-input error from the metric, after the
    # expensive ARIMA search has already run.
    if not 1 <= horizon <= len(test_ts):
        raise ValueError(
            f"horizon must be between 1 and {len(test_ts)} "
            f"(the number of test observations), got {horizon}"
        )

    # Forecast
    y_pred = forecast_daily_series(train_ts, H=horizon)
    y_true = test_ts.iloc[:horizon]

    return rmse(y_true, y_pred)

In [None]:
# run model in parallel

# define configuration
HORIZON = 5  # forecast horizon: number of future steps for the model to predict
NUM_SERIES = 50  # number of series to evaluate (limits run time); None or 0 = all

daily_train_copy = daily_train.copy()
daily_test_copy = daily_test.copy()

# Rows are paired by position, so never index past the end of either frame:
# NUM_SERIES may exceed the rows available, and the two frames may differ in length.
n_available = min(len(daily_train_copy), len(daily_test_copy))
n_series = min(NUM_SERIES or n_available, n_available)

# One task per series; each worker cleans, fits and scores its own row pair.
results = Parallel(n_jobs=-1, backend="loky", verbose=10)(
    delayed(evaluate_daily_series)(daily_train_copy.iloc[i], daily_test_copy.iloc[i], horizon=HORIZON)
    for i in range(n_series)
)


# NOTE: RMSE is scale-dependent, so the mean across series is dominated by
# high-magnitude series; the median is reported alongside it for that reason.
print(f"rmse per series: {results}")
print(f"mean rmse: {np.mean(results)}")
print(f"median rmse: {np.median(results)}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:  4.5min remaining:   59.0s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:  5.2min remaining:   19.8s


rmse per series: [20.307782989515967, 21.778980692401504, 9.286549412995134, 31.163892693444236, 16.8837606724112, 143.48200827180872, 2975.182214311859, 7.690758006000417, 6.3716066857516855, 23.353971824937982, 13.599387723417045, 25.24949279388023, 52.89181003904858, 67.55969257110793, 460.5436598393897, 23.317936813740154, 46.40425546428569, 136.93821964667129, 69.64523260755159, 114.66286233999223, 71.11488053436543, 56.204782419509456, 28.07059767568111, 50.45700861701919, 22.81462764304004, 42.48574278595761, 25.58568114736649, 3.935927338459238, 222.72552557692134, 3.2570259826664523, 35.25249671679479, 36.067333521224725, 150.8244302750469, 13.942740046346701, 45.61578674099571, 42.99142356331086, 128.25131578272405, 62.218930231261, 174.89354176103143, 98.0183656260397, 118.34168197920276, 91.71539874065252, 16.224795838469035, 25.0645794709324, 27.487742722893692, 160.382043882724, 29.186263892454583, 128.60482106048747, 147.1692902748396, 110.39384040787783]
mean rmse: 128.

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  6.5min finished
