## VAR model for benchmark performance

### import statements

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller

## initialising data

In [None]:
# Synthetic data generation
# Seed the global NumPy RNG so the draws below are reproducible run to run.
np.random.seed(0)
n_periods = 40

# Month-end dates starting with January 2020.  An offset object is used instead
# of the "M" string alias because pandas 2.2 deprecated that alias in favour of
# "ME"; the object form produces the same index on old and new pandas alike.
date_range = pd.date_range("2020-01-01", periods=n_periods, freq=pd.offsets.MonthEnd())

# (mean, std) of the per-period increments of each series.  Cumulatively
# summing the increments gives random-walk-with-drift style level series.
# The insertion order matters: it fixes the order of the random draws.
increment_params = {
    "Export": (100, 10),
    "Import": (80, 8),
    "Feature1": (50, 5),
    "Feature2": (30, 3),
    "Feature3": (60, 6),
}

df = pd.DataFrame(
    {
        name: np.random.normal(mean, std, n_periods).cumsum()
        for name, (mean, std) in increment_params.items()
    },
    index=date_range,
)

df.head()

In [None]:
def adf_test(series, title=''):
    """Run an Augmented Dickey-Fuller test on ``series`` and print a report.

    Missing values are dropped before testing and the lag length is chosen
    by AIC.  The report lists the test statistic, the p-value, the number of
    lags used, the number of observations used and the critical values.

    Args:
        series: pandas Series to test.
        title: Label printed in the report header.

    Returns:
        None.  The results are only printed.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    # Only the first five entries of the adfuller() result are reported;
    # anything after the critical-value dict is ignored.
    statistic, p_value, lags_used, n_obs, critical_values, *_ = adfuller(
        series.dropna(), autolag='AIC'
    )
    report = (
        ('ADF Statistic', statistic),
        ('p-value', p_value),
        ('# Lags Used', lags_used),
        ('# Observations Used', n_obs),
    )
    for label, value in report:
        print(f"   {label}: {value}")
    for level, threshold in critical_values.items():
        print(f"   Critical Value {level}: {threshold}")
    print("")

# Stationarity check on the raw level series, one column at a time
for col, level_series in df.items():
    adf_test(level_series, title=col)

# First-difference every series as the remedy for non-stationarity.
# diff() leaves NaN in the first row, which dropna() then removes.
df_diff = df.diff().dropna()

# Run the same ADF report on the differenced series
for col, diff_series in df_diff.items():
    adf_test(diff_series, title=col)

In [None]:
# Chronological split: the first 80% of the differenced rows are used for
# training, the remaining rows are held out for testing (no shuffling).
train_fraction = 0.8
train_size = int(train_fraction * len(df_diff))
train_data, test_data = df_diff.iloc[:train_size], df_diff.iloc[train_size:]

In [None]:
# 5a. Determine optimal lag order
model = VAR(endog=train_data)

# Each equation of a VAR(p) with a constant estimates (n_series * p + 1)
# coefficients from (n_obs - p) usable rows, so the largest order tried must
# leave enough observations.  With a short training sample a fixed maxlags=5
# can ask for a model that cannot be estimated, so the search is capped.
# NOTE(review): the bound below mirrors the estimability check statsmodels is
# understood to apply inside select_order() for trend='c' -- confirm against
# the installed version.
requested_maxlags = 5  # test up to 5 lags when the sample allows it
n_obs, n_series = train_data.shape
max_estimable = (n_obs - n_series - 1) // (1 + n_series)
if max_estimable < 1:
    raise ValueError(
        f"Training sample too short for a VAR: {n_obs} observations of "
        f"{n_series} series cannot support even one lag."
    )
maxlags = min(requested_maxlags, max_estimable)

results = model.select_order(maxlags=maxlags)
selected_lag = results.selected_orders['aic']  # or 'bic', 'hqic', 'fpe'
print("Selected Lag Order by AIC:", selected_lag)

# The search also considers order 0 (constant only).  A lag of 0 leaves no
# lagged observations to seed a forecast from, so fall back to a VAR(1).
if selected_lag < 1:
    selected_lag = 1
    print("AIC preferred lag 0; using lag 1 so the model can be forecast from.")

# 5b. Fit the final model
var_model = VAR(endog=train_data)
var_results = var_model.fit(selected_lag)
print(var_results.summary())

In [None]:
# Forecast length is the size of your test set
forecast_steps = len(test_data)

# Obtain forecast.  The model is seeded with the last `k_ar` training rows,
# where k_ar is the lag order of the fitted model.
lag_order = var_results.k_ar
forecast_values = var_results.forecast(
    y=train_data.values[-lag_order:],
    steps=forecast_steps
)

# Wrap forecast into a DataFrame (these are forecasts of the *differences*)
forecast_index = test_data.index
forecast_df = pd.DataFrame(forecast_values,
                           index=forecast_index,
                           columns=[f"{col}_forecast" for col in df_diff.columns])

# Because we used differenced data, the forecasts must be integrated back:
#   level[t] = last observed training level + sum of forecast differences up to t.
#
# Anchor: the level at the last training date.  Exactly one row is needed,
# whatever the lag order; taking more rows would pull test-period actuals
# into the forecast.
anchor = df.loc[[train_data.index[-1]], df_diff.columns]
last_train_level = anchor.iloc[0]

# pandas aligns on labels when adding, so the "_forecast" suffix has to be
# removed before the anchor level is added; otherwise nothing lines up and
# every result is NaN.
level_forecast = forecast_df.cumsum()
level_forecast.columns = df_diff.columns
level_forecast = level_forecast + last_train_level

# recovered_forecast holds the anchor row followed by the level forecasts
recovered_forecast = pd.concat([anchor, level_forecast])

# recovered_forecast now holds the "levels" forecasts from differenced data
final_forecast = recovered_forecast.iloc[1:]  # skip the single anchor row

# Align column order with the original level data
final_forecast = final_forecast[df.columns]

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score the level forecasts against the observed levels on the test dates
eval_index = test_data.index
actual = df.loc[eval_index]  # the actual values in the original scale
preds = final_forecast.loc[eval_index]

export_true, export_pred = actual["Export"], preds["Export"]
import_true, import_pred = actual["Import"], preds["Import"]

# Mean absolute error per target series
mae_export = mean_absolute_error(export_true, export_pred)
mae_import = mean_absolute_error(import_true, import_pred)

# Mean squared error per target series
mse_export = mean_squared_error(export_true, export_pred)
mse_import = mean_squared_error(import_true, import_pred)

for label, score in (
    ("MAE (Export):", mae_export),
    ("MAE (Import):", mae_import),
    ("MSE (Export):", mse_export),
    ("MSE (Import):", mse_import),
):
    print(label, score)