In [2]:
import pandas as pd
import numpy as np
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed

In [3]:
# Load the daily train and test CSVs (train first, then test) in a single pass.
# NOTE(review): presumably one series per row with the series ID in the first
# column -- confirm against the data files.
daily_train, daily_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../src/data/m4_forecasting/Daily-train.csv',
        '../src/data/m4_forecasting/Daily-test.csv',
    )
)

In [4]:
def clean_daily_series(row):
    """Turn one raw data-frame row into a clean, date-indexed float series.

    Parameters
    ----------
    row : pandas.Series
        A single row whose first entry is the series ID and whose remaining
        entries are observations, with NaN wherever there is no value.

    Returns
    -------
    pandas.Series
        The non-missing observations as floats, indexed by consecutive
        calendar days starting at 2000-01-01.
    """
    # Skip the leading ID entry, discard the NaN padding that evens out series
    # lengths, and make sure the remaining values are numeric.
    observations = row.iloc[1:].dropna().astype(float)

    # The real dates are not available; a synthetic daily calendar gives every
    # series a consistent, regularly spaced index.
    synthetic_days = pd.date_range(
        start="2000-01-01",
        periods=len(observations),
        freq="D",
    )
    observations.index = synthetic_days

    return observations


In [5]:
def forecast_daily_series(row, M=7, H=7):
    """Fit a seasonal auto-ARIMA model to a series and forecast ahead.

    Parameters
    ----------
    row : array-like
        The observed series handed to ``auto_arima``.
    M : int, default 7
        Seasonal period (7 for a daily series that repeats weekly).
    H : int, default 7
        Forecast horizon (if 7, predicts 7 days into the future).

    Returns
    -------
    list
        The ``H`` predicted values, converted with ``.tolist()``.
    """
    # Search settings: seasonal model with period M; fitting errors are ignored
    # and warnings suppressed so the search carries on.
    search_options = {
        "seasonal": True,
        "m": M,
        "error_action": 'ignore',
        "suppress_warnings": True,
    }
    fitted = auto_arima(row, **search_options)

    return fitted.predict(n_periods=H).tolist()


In [None]:

# functions for evaluation

def rmse(actual, predicted):
    """Return the root-mean-squared error between ``actual`` and ``predicted``.

    Both arguments are passed unchanged to ``mean_squared_error``; the result
    is the square root of the value it returns.
    """
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

def evaluate_daily_series(train_row, test_row, H=14):
    """Forecast one series from its training row and score it against its test row.

    Parameters
    ----------
    train_row : pandas.Series
        Raw training row (ID followed by observations); cleaned with
        ``clean_daily_series`` before fitting.
    test_row : pandas.Series
        Raw test row in the same layout, holding the values to compare against.
    H : int, default 14
        Number of future steps to forecast and score.

    Returns
    -------
    float
        RMSE between the forecast and the held-out values. When the test row
        has fewer than ``H`` observations, only the overlapping leading steps
        are scored instead of failing on a length mismatch.

    Raises
    ------
    ValueError
        If the test row contains no observations at all.
    """
    train_ts = clean_daily_series(train_row)
    test_ts = clean_daily_series(test_row)

    y_true = np.atleast_1d(test_ts)
    # Fail fast, before the expensive model fit, when there is nothing to score.
    if y_true.size == 0:
        raise ValueError("test_row contains no observations to evaluate against")

    y_pred = np.atleast_1d(forecast_daily_series(train_ts, H=H))

    # Score only the steps both arrays cover, so a horizon longer than the
    # available test data cannot produce mismatched lengths in the metric.
    steps = min(H, y_true.size, y_pred.size)
    return rmse(y_true[:steps], y_pred[:steps])

In [13]:
# run model in parallel

# define configuration
HORIZON = 1 # forecast horizon: number of future steps for the model to predict
NUM_SERIES = 50 # number of series to evaluate (keeps run time manageable); None or 0 = all series

daily_train_copy = daily_train.copy()
daily_test_copy = daily_test.copy()

# Clamp the number of evaluated series to what both frames actually contain, so a
# NUM_SERIES larger than the data cannot trigger an out-of-bounds .iloc in a worker.
n_series = min(
    NUM_SERIES or len(daily_train_copy),
    len(daily_train_copy),
    len(daily_test_copy),
)

# Each task fits and scores one series; rows are paired by position.
# NOTE(review): assumes row i of the train and test frames is the same series -- confirm.
results = Parallel(n_jobs=-1, backend="loky", verbose=10)(
    delayed(evaluate_daily_series)(daily_train_copy.iloc[i], daily_test_copy.iloc[i], H=HORIZON)
    for i in range(n_series)
)


print(f"rmse per series: {results}")
print(f"mean rmse: {np.mean(results)}")
print(f"median rmse: {np.median(results)}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:  3.0min remaining:   39.9s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:  3.6min remaining:   13.8s


rmse per series: [8.492437810945376, 17.5, 2.7999999999999545, 5.606616348444504, 17.217290322580993, 101.20540221356032, 1221.8949355485893, 0.7396433287442505, 0.1806702626072365, 2.099999999999909, 5.623084577114696, 1.2566724013986459, 28.692073196607453, 81.32527769807348, 190.94965449310075, 12.876338237919299, 22.474181245511772, 64.79999999999927, 2.246572816417938, 52.70000000000073, 5.376206783743328, 10.294385079639142, 21.5755171568253, 9.348297319754238, 18.62145437447475, 8.129648421099773, 19.51828556688656, 0.032724740337528146, 93.78965168825289, 1.6498420906251567, 4.384335698981431, 2.491083596880344, 32.00249002277087, 0.0, 0.0, 12.75, 66.0, 6.4248913792130224, 24.650895060665334, 8.0, 202.0232683814711, 112.55200740285727, 6.600000000000136, 36.7271606953218, 16.899999999999864, 31.0, 15.899999999999977, 57.0, 63.0, 34.0]
mean rmse: 55.228459919228314
median rmse: 16.39999999999992


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.4min finished
