In [None]:
import os
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import pmdarima as pm
from pmdarima import pipeline, arima, model_selection
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
import statsmodels.api as sm
import datetime as dt

In [None]:
# Both CSV files are read from the current working directory; the HourUTC
# column of each one is converted to pandas datetimes right after loading.

# Spot price data.
file_P = os.path.join(os.getcwd(), 'Elspotprices2nd.csv')
df_prices = pd.read_csv(file_P).assign(
    HourUTC=lambda frame: pd.to_datetime(frame['HourUTC'])
)

# Second dataset (ProdConData.csv), previewed later as the source of exogenous variables.
file_P = os.path.join(os.getcwd(), 'ProdConData.csv')
df_data = pd.read_csv(file_P).assign(
    HourUTC=lambda frame: pd.to_datetime(frame['HourUTC'])
)


In [None]:
# Summary statistics of the spot price over the whole (unfiltered) file.
# Mean, minimum and maximum are computed together so that the values noted
# below can be reproduced by running this cell once.
df_prices["SpotPriceDKK"].agg(["mean", "min", "max"])

#Values noted by the author:
#mean spot price is 643.112
#lowest is -447.459
#highest is 6982.64

1.1

First, we must define the training dataset, which runs from 1/1/19 until 31/8/24, and the testing dataset, which runs from 1/9/24 until 30/9/24. For that we will visualize the data and then split it into the two groups.

**Note:** the provided data does not end on 30/09/2024, but on 31/12/2024.

In [None]:
#Boundaries of the period used in the task (training + testing); both are inclusive when filtering.
#t_start is the first hour of 2019-01-01 and t_end is the last hour (23:00) of 2024-09-30.

t_start = pd.Timestamp(dt.datetime(year=2019, month=1, day=1))
t_end = pd.Timestamp(dt.datetime(year=2024, month=9, day=30, hour=23))

#the testing data at the end of this period contains 720 data points


In [None]:
#Data filtering -- data remains a dataframe with HourUTC and SpotPriceDKK.
#Rows outside [t_start, t_end] (both inclusive) are dropped. The rows are then put in
#chronological order, because the split below is purely positional and would silently
#pick the wrong rows if the CSV were not sorted by time. The stable sort keeps the
#original relative order of rows sharing the same timestamp. Finally the index is reset.

reduced_df = (
    df_prices.loc[(df_prices['HourUTC'] >= t_start) & (df_prices['HourUTC'] <= t_end)]
    .sort_values('HourUTC', kind='stable')
    .reset_index(drop=True)
)

#Data split -- the last 720 rows become the test set, everything before them the training set
train, test = model_selection.train_test_split(reduced_df["SpotPriceDKK"], test_size=720)

#Lengths of the pieces; used to build the x-axis positions in the plots
n_train = len(train)
n_test = len(test)
n_data = len(reduced_df)

In [None]:
#Data visualization: the whole filtered series, with training and testing parts drawn separately.
#The x-axis is the 1-based position of each value in reduced_df.
plt.figure(figsize=(10, 4), dpi=100)
plt.plot(np.arange(1, n_train+1), train, label="Training set")
plt.plot(np.arange(n_train+1, n_data+1), test, label="Testing set")
#Title and axis labels so the figure can be read on its own
plt.title("Spot price: training and testing sets")
plt.xlabel("Hour index (1 = first row of the filtered data)")
plt.ylabel("SpotPriceDKK")
plt.legend()
plt.grid(alpha=0.25)
#One tick every 365*24 positions
plt.xticks(np.arange(0, n_data+1, 365*24), rotation=45)
plt.tight_layout()
plt.show()

Then, day-ahead predictions are made with an ARIMA model (seasonal or not? The model below uses a non-seasonal ARIMA, with Fourier terms for the daily seasonality). 30 forecasts of 24 values each are needed. The correct values for them are known (they are contained in the test dataset), so the model is updated with them after each forecast.
A persistence model is included in the graph as a benchmark for the ARIMA model: each new set of 24 values is assumed to be equal to the previous 24.

In [None]:
#model characteristics: daily seasonality (24 values per period).
#NOTE: the pipeline further down in this cell is fitted on the full training set `train`;
#the reduction of the training data to a shorter, more recent period only exists in the disabled lines below.

#Disabled experiment (kept for reference). NOTE(review): it refers to train_fulldf / test_fulldf,
#which are not defined in the cells above -- confirm where they come from before re-enabling.
#t_start_reduced_train_fulldf = pd.Timestamp(dt.datetime(2022, 1, 1, 0, 0, 0))
#t_end_reduced_train_fulldf = pd.Timestamp(dt.datetime(2024, 8, 31, 23, 0, 0))

#reduced_train_fulldf = train_fulldf.loc[(df_prices['HourUTC']>=t_start_reduced_train_fulldf) & (df_prices['HourUTC']<=t_end_reduced_train_fulldf)]

#reduced_train_spotprices = reduced_train_fulldf["SpotPriceDKK"].values
#train_spotprices = train_fulldf["SpotPriceDKK"].values
#test_spotprices = test_fulldf["SpotPriceDKK"].values

#n_train_reduced = len(reduced_train_fulldf)
#n_data_reduced = n_train_reduced + n_test

#ACF and PACF plots to detect seasonality 

# Plot ACF/PACF plots ==> shows strong correlations throughout; the seasonality is strong on a daily basis
# The plotting code below is wrapped in a bare string literal, so it is NOT executed.
# It needs reduced_train_spotprices, which is only assigned in the disabled lines above.
'''
fig, ax = plt.subplots(2, 1, figsize=(8, 6))
sm.graphics.tsa.plot_acf(reduced_train_spotprices, title = "ACF", lags=25, ax=ax[0])
sm.graphics.tsa.plot_pacf(reduced_train_spotprices, title = "PACF", lags=25, ax=ax[1])
plt.tight_layout()
plt.show()
'''

# Build the forecasting pipeline in two named steps.
# Fourier step: m = 24 for the daily seasonality and k = 6. The k value controls how smooth
# the fitted curve is, and its maximum should be m//2, so 12.
fourier_step = pm.preprocessing.FourierFeaturizer(m=24, k=6)

# ARIMA step: automatic order search with the seasonal part switched off
# (the seasonality is handled by the Fourier step above).
auto_arima_step = arima.AutoARIMA(
    stepwise=False,
    trace=False,
    error_action="ignore",
    seasonal=False,
    maxiter=10,
    suppress_warnings=True,
)

pipe = pipeline.Pipeline([
    ("fourier", fourier_step),
    ("arima", auto_arima_step),
])

# Fit on the complete training series
pipe.fit(train)


# Rolling day-ahead forecast: predict the next 24 values, then hand the 24 observed
# values of that day to the model before the following day is predicted.
rolling_forecast = []
N = len(test) // 24  # number of complete 24-value days in the test set

for i in range(N):
    forecast = pipe.predict(n_periods=24)
    # .iloc selects day i by position. `test` keeps the row labels of reduced_df,
    # which do not start at 0, so the positional intent is stated explicitly.
    pipe.update(test.iloc[i*24:(i+1)*24])
    rolling_forecast.extend(forecast)

# Plot forecasts against the actual values; the x-axis is limited to the last six weeks of data
plt.figure(figsize=(10, 4), dpi=100)
plt.plot(np.arange(1, n_train+1), train, label="Training set")
plt.plot(np.arange(n_train+1, n_data+1), test, label="Actual values")
plt.plot(np.arange(n_train+1, n_data+1), rolling_forecast, label="Forecasts")
plt.title("24-hour ahead predictions")
# Axis labels so the figure can be read on its own
plt.xlabel("Hour index (1 = first row of the filtered data)")
plt.ylabel("SpotPriceDKK")
plt.legend()
plt.grid(alpha=0.25)
plt.xlim(n_data - 6*7*24, n_data)
plt.tight_layout()
plt.show()





To compare the ARIMA model with the persistence model, we report the RMSE value of each.

In [None]:
#Benchmark, i.e. daily persistence model: every forecast day repeats the 24 values observed the day before

data_spotprices = reduced_df["SpotPriceDKK"].values

#Position of the first test value inside data_spotprices
first_test_pos = len(train)

#Flat list with the 24-hour persistence predictions of all N days;
#day 0 reuses the final 24 values of the training set
Persistence24 = [
    price
    for day in range(N)
    for price in data_spotprices[first_test_pos + 24 * (day - 1) : first_test_pos + 24 * day]
]

# Plot the ARIMA forecasts and the persistence benchmark against the actual values;
# the x-axis is limited to the last six weeks of data
plt.figure(figsize=(10, 4), dpi=100)
plt.plot(np.arange(1, n_train+1), train, label="Training set")
plt.plot(np.arange(n_train+1, n_data+1), test, label="Actual values")
plt.plot(np.arange(n_train+1, n_data+1), rolling_forecast, label="Forecasts")
plt.plot(np.arange(n_train+1, n_data+1), Persistence24, label="Daily persistence")
plt.title("24-hour ahead predictions")
# Axis labels so the figure can be read on its own
plt.xlabel("Hour index (1 = first row of the filtered data)")
plt.ylabel("SpotPriceDKK")
plt.legend()
plt.grid(alpha=0.25)
plt.xlim(n_data - 6*7*24, n_data)
plt.tight_layout()
plt.show()

#RMSE for ARIMA and persistence, both measured against the actual test values.
#The actual values are passed first, following the (y_true, y_pred) order of the scikit-learn metric.
RMSE_P24 = root_mean_squared_error(test, Persistence24)
RMSE_F = root_mean_squared_error(test, rolling_forecast)

print("RMSE for daily persistence: ", round(RMSE_P24))
print("RMSE for forecasts: ", round(RMSE_F))

#with original data and k = 6, RMSE are 365 persistence and 309 forecasts; 309 is the lowest RMSE for the forecast model

1.2

Add any exogenous variables you want (maximum 3) and repeat the process (choose/optimize
your model and evaluate it for the day-ahead prediction).

What exogenous variables helped you improve the prediction and how did you choose the specific ones?

Report the RMSE value and compare your results with those from task 1.1 and briefly discuss them

In [None]:
#TODO: merge the exogenous data with the original spot price data on the HourUTC column & redo the filtering for the stated months
#NOTE(review): the lines visible here only preview the first rows of df_data; no merge is performed yet.

df_data.head()