# Importing the data
### setting comprehensible col names and right types

In [1]:
# Importing all the necessary packages
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from prophet import Prophet
import pandas as pd
from pandas.errors import PerformanceWarning
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from prophet import Prophet
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA


# For legibility, we mute some warnings
import warnings

# Ignore FutureWarning for deprecated 'T' frequency in Prophet
warnings.filterwarnings("ignore", category=FutureWarning, message="'T' is deprecated")

# Ignore PerformanceWarning from pandas
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [46]:
# Mix of different sources, mostly ESO
balancing_df = pd.read_csv("balancing_data.csv")
# Demand data only for GB
GB_demand_df = pd.read_csv("demand_load_data.csv")
# Generation data only for GB
GB_generation_df = pd.read_csv("generation_data.csv")
# the price dataframe only concerns EPEX (only prices from there)
EPEX_price_df = pd.read_csv("price_data.csv")

In [2]:
# Function to find the maximum number of consecutive NaNs filled in a column
# As Angelica Asked
def max_consecutive_nans_filled(df, column):
    """

    This function calculates and returns the maximum number 
    of consecutive NaNs in a column that is to be filled

    """
    # Identify consecutive NaNs
    na_groups = df[column].isna().astype(int).groupby(df[column].notna().cumsum()).sum()
    # Get the maximum number of consecutive NaNs that would be interpolated
    max_consecutive_nans = na_groups.max()
    nans_before = df[column].isna().sum()

    print(f"NaNs in {column}: {nans_before}")
    print(f"Max consecutive NaNs filled for '{column}': {max_consecutive_nans}")
    return


def fill_missing_with_prophet(df, column_name, time_column="Datetime"):
    """
    Use Prophet to fill missing values in a specific column of a DataFrame.
    """
    # Prepare the data for Prophet
    temp_df = df[[time_column, column_name]].rename(columns={time_column: 'ds', column_name: 'y'})

    # Separate known and missing data
    known_data = temp_df.dropna()

    # Initialize and fit the Prophet model
    model = Prophet(daily_seasonality=True, weekly_seasonality=True, yearly_seasonality=True)
    model.fit(known_data)

    # Make predictions for the full range of dates in the original data
    future = pd.DataFrame({'ds': temp_df['ds']})
    forecast = model.predict(future)

    # Fill missing values with Prophet predictions
    temp_df.set_index('ds', inplace=True)
    temp_df['yhat'] = forecast.set_index('ds')['yhat']
    temp_df[column_name] = temp_df['y'].combine_first(temp_df['yhat'])

    # Update the original DataFrame with the filled values
    df[column_name] = temp_df[column_name].reindex(df[time_column].values).values
    return df


In [48]:
def rename_balancing_columns(df):
    # Define a dictionary for concise renaming
    rename_map = {
        'GMT Time': 'GMT Time',
        'System Price (ESO Outturn) - GB (£/MWh)': 'System_Price',
        'NIV Outturn (+ve long) - GB (MW)': 'NIV_Outturn',
        'BM Bid Acceptances (total) - GB (MW)': 'BM_Bid_Acceptances',
        'BM Offer Acceptances (total) - GB (MW)': 'BM_Offer_Acceptances',
        'Total BSAD Volume - Turn Up - GB (MW)': 'BSAD_Turn_Up',
        'Total BSAD Volume - Turn Down - GB (MW)': 'BSAD_Turn_Down',
        'Total BSAD Volume - Total - GB (MW)': 'BSAD_Total',
        'Intraday Volume (EPEX Outturn, APX, MID) - GB (MWh)': 'EPEX_Intraday_Volume'
    }
    
    # Apply the renaming map
    df = df.rename(columns=rename_map)

    # Force all the non datetime columns to numeric
    for column in df.columns:
        if column != 'GMT Time':  # Skip the 'GMT Time' column
            df[column] = pd.to_numeric(df[column], errors='coerce')
    return df

# Apply the function to rename columns in balancing_df
balancing_df = rename_balancing_columns(balancing_df)

print("Final columns:")
print(balancing_df.columns.values)


Final columns:
['GMT Time' 'System_Price' 'NIV_Outturn' 'BM_Bid_Acceptances'
 'BM_Offer_Acceptances' 'BSAD_Turn_Up' 'BSAD_Turn_Down' 'BSAD_Total'
 'EPEX_Intraday_Volume']


In [49]:
def rename_demand_columns(df):
    """
    Rename columns for easier reference and convert non-datetime columns to numeric.
    """
    # Define a dictionary for concise renaming
    rename_map = {
        'GMT Time': 'GMT Time',
        'Loss of Load Probability - Latest - GB ()': 'Loss_of_Load_Prob',
        'Actual Total Load - GB (MW)': 'Total_Load',
        'Demand Outturn (ITSDO) - GB (MW)': 'Demand_Outturn'
    }
    
    # Apply the renaming map
    df = df.rename(columns=rename_map)

    # Force all the non-datetime columns to numeric
    for column in df.columns:
        if column != 'GMT Time':  # Skip the 'GMT Time' column
            df[column] = pd.to_numeric(df[column], errors='coerce')
    return df

# Apply the renaming and filling functions
GB_demand_df = rename_demand_columns(GB_demand_df)


print("Final columns:")
print(GB_demand_df.columns.values)

Final columns:
['GMT Time' 'Loss_of_Load_Prob' 'Total_Load' 'Demand_Outturn']


In [50]:
def rename_columns_generation(df):
    # Define a function to clean each column name
    def clean_column_name(col):
        # Extract the generation type using regex
        match = re.search(r'Actual Aggregated Generation By Type - (.+?) - GB', col)
        if match:
            # Replace spaces with underscores for readability
            return match.group(1).replace(" ", "_")
        return col  # Return the column as is if no match is found

    # Rename columns using the clean_column_name function
    df.columns = [clean_column_name(col) for col in df.columns]
    for column in df.columns:
        if column != 'GMT Time':  # Skip the 'GMT Time' column
            df[column] = pd.to_numeric(df[column], errors='coerce')

    return df

# Apply the function to rename columns in generation_df
GB_generation_df = rename_columns_generation(GB_generation_df)


print("Final columns:")
print(GB_generation_df.columns.values)

Final columns:
['GMT Time' 'Biomass' 'Fossil_Gas' 'Fossil_Hard_Coal' 'Fossil_Oil'
 'Hydro_Pumped_Storage' 'Hydro_Run-of-River_and_Poundage' 'Nuclear'
 'Solar' 'Wind_Onshore' 'Wind_Offshore']


In [51]:
def rename_epex_columns(df):
    # Define a dictionary for manual renaming based on your desired column names
    rename_map = {
        'GMT Time': 'GMT Time',
        'Day Ahead Price (EPEX half-hourly, local) - GB (LC/MWh)': 'Day_Ahead_Price',
        'Intraday Price (EPEX Outturn, APX, MID) - GB (£/MWh)': 'Intraday_Price'
    }

    # Rename columns using the dictionary
    df = df.rename(columns=rename_map)
    for column in df.columns:
        if column != 'GMT Time':  # Skip the 'GMT Time' column
            df[column] = pd.to_numeric(df[column], errors='coerce')
    
    return df

# Apply the function to rename columns in EPEX_price_df
EPEX_price_df = rename_epex_columns(EPEX_price_df)


print("Final columns:")
print(EPEX_price_df.columns.values)

Final columns:
['GMT Time' 'Day_Ahead_Price' 'Intraday_Price']


In [52]:
# Set 'GMT Time' as index for each dataframe
balancing_df.set_index('GMT Time', inplace=True)
GB_demand_df.set_index('GMT Time', inplace=True)
GB_generation_df.set_index('GMT Time', inplace=True)
EPEX_price_df.set_index('GMT Time', inplace=True)

# Merge using index
merged_df = balancing_df.join([GB_demand_df, GB_generation_df, EPEX_price_df], how='inner')
# We put back the datetime column into the merged DF and rename it for practicality
merged_df.reset_index(inplace=True)
merged_df.rename(columns={'GMT Time': 'Datetime'}, inplace=True)

print("Merged columns:")
print(merged_df.columns.values)

Merged columns:
['Datetime' 'System_Price' 'NIV_Outturn' 'BM_Bid_Acceptances'
 'BM_Offer_Acceptances' 'BSAD_Turn_Up' 'BSAD_Turn_Down' 'BSAD_Total'
 'EPEX_Intraday_Volume' 'Loss_of_Load_Prob' 'Total_Load' 'Demand_Outturn'
 'Biomass' 'Fossil_Gas' 'Fossil_Hard_Coal' 'Fossil_Oil'
 'Hydro_Pumped_Storage' 'Hydro_Run-of-River_and_Poundage' 'Nuclear'
 'Solar' 'Wind_Onshore' 'Wind_Offshore' 'Day_Ahead_Price' 'Intraday_Price']


# Helper Functions

In [None]:
def calculate_fft(df, variable, n_top_seasonalities, threshold_pc=0.02):
    """
    Calculate significant positive frequencies and their amplitudes using Fast Fourier Transform (FFT),
    selecting the lower of 2% of the max amplitude or the top `n` frequencies.

    Parameters:
    - df (DataFrame): The input DataFrame containing the time series data.
    - variable (str): The name of the column in `df` on which to perform FFT.
    - n_top_seasonalities (int): The maximum number of significant frequencies to consider.
    - threshold_pc (float): Percentage (0 < threshold_pc <= 1) of the maximum amplitude to filter significant frequencies.

    Returns:
    - zip: A generator yielding (positive frequency, amplitude) for each significant frequency.
    """
    # Compute fast Fourier transform
    price_fft = np.fft.fft(df[variable].dropna())

    # Get frequencies corresponding to FFT coefficients
    freqs = np.fft.fftfreq(len(price_fft), d=1/48)

    # Calculate amplitudes
    amplitudes = np.abs(price_fft)

    # Calculate the threshold based on 2% of the max amplitude
    threshold = threshold_pc * np.max(amplitudes)

    # Filter positive frequencies with amplitudes above threshold
    positive_indices = np.where((amplitudes > threshold) & (freqs > 0))
    positive_freqs = freqs[positive_indices]
    positive_amplitudes = amplitudes[positive_indices]

    # Sort by amplitude and select the lower of `n_top_seasonalities` or all significant frequencies
    sorted_indices = np.argsort(positive_amplitudes)[::-1]
    selected_indices = sorted_indices[:min(n_top_seasonalities, len(sorted_indices))]

    # Select the top frequencies and amplitudes
    significant_freqs = positive_freqs[selected_indices]
    significant_amplitudes = positive_amplitudes[selected_indices]

    return zip(significant_freqs, significant_amplitudes)



def prophet_predictions(df, variable, freq_amp):
    """
    Generate predictions using Prophet with multiple seasonalities based on significant frequencies.

    This function applies Prophet to model and predict a specified variable, adding custom seasonalities 
    derived from significant frequencies (e.g., daily, weekly patterns). The seasonalities are added 
    dynamically based on the frequency components identified through FFT, with Fourier orders adjusted 
    for shorter and longer periods.

    Parameters:
    - df (DataFrame): The input DataFrame containing the time series data.
    - variable (str): The name of the column in `df` to be modeled by Prophet.
    - freq_amp (list of tuples): A list of (frequency, amplitude) pairs, where each frequency represents 
                                 a significant periodic component to be modeled as seasonality.

    Returns:
    - forecast (DataFrame): The forecasted values for the specified period, including trend and seasonal components.
    """
    # Use Prophet to model_F multiple seasonalities
    prophet_balancing_df = df.reset_index().rename(columns={'Datetime': 'ds', variable: 'y'})
    model_F = Prophet(outlier_prior_scale=0.05)

    # Adding seasonalities based on significant frequencies
    for freq, amp in freq_amp:
        if freq != 0:  # Ignore the DC component
            period_in_days = 1 / freq
            # Add seasonality to Prophet
            seasonality_name = f"seasonal_freq_{freq:.4f}"
            if period_in_days <= 1:
                fourier_order = 5
            elif period_in_days > 1 and period_in_days > 7:
                fourier_order = 10
            else:
                fourier_order = 20
            model_F.add_seasonality(name=seasonality_name, period=period_in_days, fourier_order=fourier_order)

    # Fit the model_F
    model_F.fit(prophet_balancing_df)

    # Make future dataframe for predictions, 48 rows because we predict for next day
    future = model_F.make_future_dataframe(periods=48, freq='30T')

    forecast = model_F.predict(future)

    # Plot the forecast
    # model_F.plot(forecast)
    # plt.show()
    return forecast


def t_arima(df, p, q):
    """
    Fit an ARIMA model to the time series data and forecast future values.

    This function applies an ARIMA model to a specified variable within a DataFrame, determining the degree 
    of differencing (d) based on stationarity tests. It then forecasts future values beyond the length of 
    the data provided, accommodating additional time steps for further predictions.

    Parameters:
    - df (Series): The input time series data.
    - p (int): The order of the autoregressive part.
    - q (int): The order of the moving average part.

    Returns:
    - Series: The forecasted values over the extended range, including additional time steps.
    """
    # Fit ARIMA

    # setting the frequency for the arima
    # df = df.asfreq('30T')
    
    if adfuller(df.dropna())[1] < 0.05:
        d = 0
        print("d = 0")
    elif adfuller(df.diff().dropna())[1] < 0.05:
        d = 1
        print("d = 1")
    else:
        d = 2
        print("d = 2")
    arima_model = ARIMA(df.dropna(), order=(p, d, q))
    arima_fit = arima_model.fit()

    forecast = arima_fit.predict(start=0, end=len(df) - 1 + 48)  # we predict for the 48 rows after
    return pd.Series(forecast, index=df.index)


def metrics(y_test, y_pred):
    """
    Calculate and display error metrics for model evaluation.

    This function computes standard error metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
    Root Mean Squared Error (RMSE), and R-squared (R2), to evaluate the accuracy of model predictions.

    Parameters:
    - y_test (Series or array-like): The true values for the target variable in the test set.
    - y_pred (Series or array-like): The predicted values for the target variable in the test set.

    Returns:
    - None
    """
    # Calculate error metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Print error metrics
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R2):", r2)
    return rmse


def ensemble_model(merged_df_2024, variable, fft_threshold, p, q):
    """
    Generate an ensemble forecast by combining Prophet and ARIMA models.

    Parameters:
    - merged_df_2024 (DataFrame): The main DataFrame containing time series data to be used for forecasting.
    - variable (str): The target variable for which predictions are to be generated.
    - fft_threshold (float): The threshold for filtering frequencies in the Fast Fourier Transform (FFT) for Prophet.
    - p (int): The order of the autoregressive (AR) term in the ARIMA model for the residuals.
    - q (int): The order of the moving average (MA) term in the ARIMA model for the residuals.

    Returns:
    - DataFrame: A DataFrame containing the original data, Prophet predictions, ARIMA residual forecasts, 
      and the final combined forecast.

    Steps:
    1. Calculate initial predictions with Prophet:
       - Use Prophet to generate predictions for the target variable, applying FFT with the specified threshold to 
         preprocess the time series data and isolate important frequencies.
       - Set 'ds' as the index for Prophet predictions to align with the datetime index of the main DataFrame.
    
    2. Calculate residuals between actual values and Prophet predictions:
       - Compute residuals by subtracting Prophet’s forecast from the actual values in `variable`.
    
    3. Fit ARIMA on the residuals:
       - Fit an ARIMA model on the residuals, using specified AR and MA orders (`p` and `q`), to capture any 
         remaining patterns not accounted for by Prophet.

    4. Combine Prophet and ARIMA forecasts:
       - Add the ARIMA forecasted residuals back to the initial Prophet predictions to produce the final ensemble 
         forecast.
       - Calculate the final residuals as the difference between actual values and the combined forecast to assess 
         the accuracy of the ensemble model.

    Returns:
    - The updated DataFrame with columns for Prophet's forecast, ARIMA residuals, combined forecast, and final residuals.
    """
    # Step 1: Calculate predictions with Prophet
    preds_SP = prophet_predictions(merged_df_2024, variable, calculate_fft(merged_df_2024, variable, fft_threshold))
    # Merging the predictions of prophet to merged df
    preds_SP = preds_SP.set_index('ds')
    preds_SP.index.name = 'Datetime'

    # Convert both indexes to datetime format
    preds_SP.index = pd.to_datetime(preds_SP.index)
    merged_df_2024.index = pd.to_datetime(merged_df_2024.index)
    # Merge both dataframes for convenience
    merged_df_SP = merged_df_2024.join(preds_SP, how='outer')
    merged_df_SP.reset_index(inplace=True)

    # Step 2: Calculate Residuals
    # Calculate residuals as the difference between actual values and Prophet's forecast
    merged_df_SP['residuals'] = merged_df_SP[variable] - merged_df_SP["yhat"]

    # Step 3: Fit ARIMA on the Residuals
    # Using the residuals, fit an ARIMA model
    residuals_forecast_series = t_arima(merged_df_SP['residuals'], p, q)

    # Step 4: Combine the Predictions
    # Add the ARIMA residuals forecast back to the Prophet forecast
    merged_df_SP['combined_forecast'] = merged_df_SP['yhat'] + residuals_forecast_series

    merged_df_SP['final_residuals'] = merged_df_SP[variable] - merged_df_SP['combined_forecast']
    
    return merged_df_SP

## Filling in the NaNs

### So First, we try to fill in the columns that can be filled using other columns
### NIV_Outturn = - (BM_Bid_Acceptances + BM_Offer_Acceptances) 
### and 
### BSAD_Total = BSAD_Turn_Down + BSAD_Turn_Up

In [54]:
# Replace the "No Data Available" by 0s in the BSAD columns where applicable
# if all three are missing we just let them be replaced by NaNs

# Replace "No Data Available" in "BSAD_Turn_Up" with 0 if "BSAD_Total" is equal to other column
merged_df.loc[(merged_df["BSAD_Turn_Up"].isna()) & (merged_df["BSAD_Total"] == merged_df["BSAD_Turn_Down"]), "BSAD_Turn_Up"] = 0

# Replace "No Data Available" in "BSAD_Turn_Down" with 0 if "BSAD_Total" is equal to other column
merged_df.loc[(merged_df["BSAD_Turn_Down"].isna()) & (merged_df["BSAD_Total"] == merged_df["BSAD_Turn_Up"]), "BSAD_Turn_Down"] = 0    

# Replace 'NIV_Outturn' with NaN if both 'BM_Bid_Acceptances' and 'BM_Offer_Acceptances' are NaN and 'NIV_Outturn' is 0
merged_df.loc[(merged_df['NIV_Outturn'] == 0) & merged_df['BM_Bid_Acceptances'].isna() & merged_df['BM_Offer_Acceptances'].isna(), 'NIV_Outturn'] = np.nan

# Replace 'NIV_Outturn' with the negative of the sum of 'BM_Offer_Acceptances' and 'BM_Bid_Acceptances' 
# if 'NIV_Outturn' is zero and neither of the other two columns contains NaN
merged_df.loc[(merged_df['NIV_Outturn'] == 0) & merged_df['BM_Offer_Acceptances'].notna() & merged_df['BM_Bid_Acceptances'].notna(), 'NIV_Outturn'] = -(merged_df['BM_Offer_Acceptances'] + merged_df['BM_Bid_Acceptances'])

# Extrapolate 'BM_Bid_Acceptances' with condition to set both columns to NaN if bid check fails
bid_values = -merged_df['NIV_Outturn'] - merged_df['BM_Offer_Acceptances']
merged_df.loc[merged_df['BM_Bid_Acceptances'].isna() & merged_df['NIV_Outturn'].notna(), 'BM_Bid_Acceptances'] = bid_values.where(bid_values <= 0)
merged_df.loc[merged_df['BM_Bid_Acceptances'].isna(), 'BM_Offer_Acceptances'] = np.nan

# Extrapolate 'BM_Offer_Acceptances' with condition to set both columns to NaN if offer check fails
offer_values = -merged_df['NIV_Outturn'] - merged_df['BM_Bid_Acceptances']
merged_df.loc[merged_df['BM_Offer_Acceptances'].isna() & merged_df['NIV_Outturn'].notna(), 'BM_Offer_Acceptances'] = offer_values.where(offer_values >= 0)
merged_df.loc[merged_df['BM_Offer_Acceptances'].isna(), 'BM_Bid_Acceptances'] = np.nan

### Markt the rows where there are missing values for each variable

In [None]:
for column in merged_df.columns:
    merged_df[f'{column}_missing'] = merged_df[column].isnull().astype(int)

### Then, we fill the other NaNs using Prophet

In [55]:
# Apply Prophet-based filling for each column with missing data, except 'GMT Time'
for column in merged_df.columns:
    if column != 'Datetime':  # time col is never empty and anyway not numerical
        if column not in ["BSAD_Turn_Up", "BSAD_Turn_Down", "BSAD_Total"]:  # there are simply too many Nans in here to fill (we would just be training on our own created data)
            max_consecutive_nans_filled(merged_df, column)  # print information about the NaNs before filling, including max consecutive
            merged_df = fill_missing_with_prophet(merged_df, column)
            print(f"Missing values in '{column}' have been filled using Prophet.")
            print()

NaNs in System_Price: 45
Max consecutive NaNs filled for 'System_Price': 3


19:33:35 - cmdstanpy - INFO - Chain [1] start processing
19:35:42 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'System_Price' have been filled using Prophet.

NaNs in NIV_Outturn: 23
Max consecutive NaNs filled for 'NIV_Outturn': 3


19:36:07 - cmdstanpy - INFO - Chain [1] start processing
19:36:50 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'NIV_Outturn' have been filled using Prophet.

NaNs in BM_Bid_Acceptances: 420
Max consecutive NaNs filled for 'BM_Bid_Acceptances': 26


19:37:14 - cmdstanpy - INFO - Chain [1] start processing
19:38:34 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'BM_Bid_Acceptances' have been filled using Prophet.

NaNs in BM_Offer_Acceptances: 420
Max consecutive NaNs filled for 'BM_Offer_Acceptances': 26


19:39:08 - cmdstanpy - INFO - Chain [1] start processing
19:40:39 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'BM_Offer_Acceptances' have been filled using Prophet.

NaNs in EPEX_Intraday_Volume: 640
Max consecutive NaNs filled for 'EPEX_Intraday_Volume': 48


19:41:03 - cmdstanpy - INFO - Chain [1] start processing
19:42:07 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'EPEX_Intraday_Volume' have been filled using Prophet.

NaNs in Loss_of_Load_Prob: 556
Max consecutive NaNs filled for 'Loss_of_Load_Prob': 70


19:42:30 - cmdstanpy - INFO - Chain [1] start processing
19:42:44 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Loss_of_Load_Prob' have been filled using Prophet.

NaNs in Total_Load: 2091
Max consecutive NaNs filled for 'Total_Load': 53


19:43:07 - cmdstanpy - INFO - Chain [1] start processing
19:44:13 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Total_Load' have been filled using Prophet.

NaNs in Demand_Outturn: 571
Max consecutive NaNs filled for 'Demand_Outturn': 48


19:44:35 - cmdstanpy - INFO - Chain [1] start processing
19:46:27 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Demand_Outturn' have been filled using Prophet.

NaNs in Biomass: 2195
Max consecutive NaNs filled for 'Biomass': 70


19:46:52 - cmdstanpy - INFO - Chain [1] start processing
19:49:36 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Biomass' have been filled using Prophet.

NaNs in Fossil_Gas: 2195
Max consecutive NaNs filled for 'Fossil_Gas': 70


19:50:04 - cmdstanpy - INFO - Chain [1] start processing
19:52:51 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Fossil_Gas' have been filled using Prophet.

NaNs in Fossil_Hard_Coal: 2195
Max consecutive NaNs filled for 'Fossil_Hard_Coal': 70


19:53:14 - cmdstanpy - INFO - Chain [1] start processing
19:54:54 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Fossil_Hard_Coal' have been filled using Prophet.

NaNs in Fossil_Oil: 2195
Max consecutive NaNs filled for 'Fossil_Oil': 70


19:55:16 - cmdstanpy - INFO - Chain [1] start processing
19:55:22 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Fossil_Oil' have been filled using Prophet.

NaNs in Hydro_Pumped_Storage: 2195
Max consecutive NaNs filled for 'Hydro_Pumped_Storage': 70


19:55:50 - cmdstanpy - INFO - Chain [1] start processing
19:57:01 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Hydro_Pumped_Storage' have been filled using Prophet.

NaNs in Hydro_Run-of-River_and_Poundage: 2195
Max consecutive NaNs filled for 'Hydro_Run-of-River_and_Poundage': 70


19:57:23 - cmdstanpy - INFO - Chain [1] start processing
19:59:26 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Hydro_Run-of-River_and_Poundage' have been filled using Prophet.

NaNs in Nuclear: 2195
Max consecutive NaNs filled for 'Nuclear': 70


19:59:48 - cmdstanpy - INFO - Chain [1] start processing
20:02:33 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Nuclear' have been filled using Prophet.

NaNs in Solar: 2195
Max consecutive NaNs filled for 'Solar': 70


20:03:00 - cmdstanpy - INFO - Chain [1] start processing
20:03:38 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Solar' have been filled using Prophet.

NaNs in Wind_Onshore: 2195
Max consecutive NaNs filled for 'Wind_Onshore': 70


20:04:04 - cmdstanpy - INFO - Chain [1] start processing
20:05:49 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Wind_Onshore' have been filled using Prophet.

NaNs in Wind_Offshore: 2195
Max consecutive NaNs filled for 'Wind_Offshore': 70


20:06:16 - cmdstanpy - INFO - Chain [1] start processing
20:08:23 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Wind_Offshore' have been filled using Prophet.

NaNs in Day_Ahead_Price: 64
Max consecutive NaNs filled for 'Day_Ahead_Price': 48


20:08:50 - cmdstanpy - INFO - Chain [1] start processing
20:10:48 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Day_Ahead_Price' have been filled using Prophet.

NaNs in Intraday_Price: 640
Max consecutive NaNs filled for 'Intraday_Price': 48


20:11:14 - cmdstanpy - INFO - Chain [1] start processing
20:13:04 - cmdstanpy - INFO - Chain [1] done processing


Missing values in 'Intraday_Price' have been filled using Prophet.



### Save the filled df in case we need to retrieve it

In [56]:
# In order to save the merged to csv for easier retrieval
merged_df.to_csv("merged_df_filled.csv", index=False)

## Create Data for the next 48 observations

In [4]:
merged_df = pd.read_csv("merged_df_filled.csv")

In [5]:
# for predictions for different variables,we use different lenghts for training the ensemble model
merged_df.set_index('Datetime', inplace=True)
merged_df.index = pd.to_datetime(merged_df.index)

merged_df_2024 = merged_df[(merged_df.index >= '2023-10-01') & (merged_df.index < '2025-01-01')]  # we only look at the last 365 (for System Price)

In [7]:
# Creating rows to append generated predictions
# Generate a date range that starts after the last date in merged_df_2024
date_range_df_temp = pd.DataFrame({'value': [None] * 48}, index=pd.date_range(start=merged_df.index[-1] + pd.Timedelta(minutes=30), periods=48, freq='30T'))
# Concatenate without resetting the index, preserving the datetime index
df_with_preds = pd.concat([merged_df, date_range_df_temp])

In [8]:
# Making the predictions for next 48 observations using Prophet + ARIMA

# Apply Prophet (using FFT inputs) + ARIMA for each column in order to create predictions for the next 48 observations
for column in merged_df.columns:
    if column != "System_Price":  # For System Price, there is a different procedure
        # Creating prediction using Prophet and ARIMA for each column for next 48
        prophet_arima_preds = ensemble_model(merged_df, column, 45, 2, 3)
        # Appending the predictions to the end of the df with predictions
        df_with_preds.loc[df_with_preds.index[-48:], column] = prophet_arima_preds['combined_forecast'].iloc[-48:].values
        

21:45:41 - cmdstanpy - INFO - Chain [1] start processing
21:53:05 - cmdstanpy - INFO - Chain [1] done processing


d = 0


21:57:06 - cmdstanpy - INFO - Chain [1] start processing
22:04:05 - cmdstanpy - INFO - Chain [1] done processing


d = 0


22:07:16 - cmdstanpy - INFO - Chain [1] start processing
22:15:55 - cmdstanpy - INFO - Chain [1] done processing


d = 0


22:19:15 - cmdstanpy - INFO - Chain [1] start processing
22:23:55 - cmdstanpy - INFO - Chain [1] done processing


d = 0


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
22:36:11 - cmdstanpy - INFO - Chain [1] start processing
22:42:03 - cmdstanpy - INFO - Chain [1] done processing


d = 0


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
22:45:13 - cmdstanpy - INFO - Chain [1] start processing
22:52:56 - cmdstanpy - INFO - Chain [1] done processing


d = 0


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
22:55:11 - cmdstanpy - INFO - Chain [1] start processing
22:57:47 - cmdstanpy - INFO - Chain [1] done processing


d = 0


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
23:01:44 - cmdstanpy - INFO - Chain [1] start processing
23:04:49 - cmdstanpy - INFO - Chain [1] done processing


d = 0


23:06:33 - cmdstanpy - INFO - Chain [1] start processing
23:10:26 - cmdstanpy - INFO - Chain [1] done processing


d = 0


23:12:41 - cmdstanpy - INFO - Chain [1] start processing
23:19:10 - cmdstanpy - INFO - Chain [1] done processing


d = 0


23:21:28 - cmdstanpy - INFO - Chain [1] start processing
23:35:13 - cmdstanpy - INFO - Chain [1] done processing


d = 0


23:37:55 - cmdstanpy - INFO - Chain [1] start processing
23:47:40 - cmdstanpy - INFO - Chain [1] done processing


d = 0


23:51:12 - cmdstanpy - INFO - Chain [1] start processing
23:56:09 - cmdstanpy - INFO - Chain [1] done processing


d = 0


23:58:51 - cmdstanpy - INFO - Chain [1] start processing
00:00:07 - cmdstanpy - INFO - Chain [1] done processing


d = 0


00:02:50 - cmdstanpy - INFO - Chain [1] start processing
00:05:14 - cmdstanpy - INFO - Chain [1] done processing


d = 0


00:08:10 - cmdstanpy - INFO - Chain [1] start processing
00:14:09 - cmdstanpy - INFO - Chain [1] done processing


d = 0


00:16:28 - cmdstanpy - INFO - Chain [1] start processing
00:35:42 - cmdstanpy - INFO - Chain [1] done processing


d = 0


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
00:39:05 - cmdstanpy - INFO - Chain [1] start processing
00:51:49 - cmdstanpy - INFO - Chain [1] done processing


d = 0


00:55:13 - cmdstanpy - INFO - Chain [1] start processing
01:09:44 - cmdstanpy - INFO - Chain [1] done processing


d = 0


01:14:08 - cmdstanpy - INFO - Chain [1] start processing
01:23:10 - cmdstanpy - INFO - Chain [1] done processing


d = 0


01:26:59 - cmdstanpy - INFO - Chain [1] start processing
01:32:52 - cmdstanpy - INFO - Chain [1] done processing


d = 0


01:36:06 - cmdstanpy - INFO - Chain [1] start processing
01:48:22 - cmdstanpy - INFO - Chain [1] done processing


d = 0


In [None]:
# For System Price there is another system, we keep more information
# Prediction using Prophet (using FFT inputs) + ARIMA for System_Price
prophet_arima_preds_SP = ensemble_model(merged_df_2024, 'System_Price', 45, 2, 3)



01:49:58 - cmdstanpy - INFO - Chain [1] start processing
01:50:25 - cmdstanpy - INFO - Chain [1] done processing


d = 0


ValueError: Length of values (17616) does not match length of index (118368)

In [10]:
# We append the predictions for System Price
df_with_preds.loc[df_with_preds.index[-17616:], "combined_predictions_SP"] = prophet_arima_preds_SP['combined_forecast'].values
# Calculate the residuals from the SP prediction (for XGBoost input later)
df_with_preds.loc[df_with_preds.index[-17616:], "combined_predictions_SP_residuals"] = df_with_preds["System_Price"] - df_with_preds["combined_predictions_SP"]

## Feature Engineering

In [None]:
# Variable that tracks the difference between total load and demand
df_with_preds["Load-Demand"] = df_with_preds["Total_Load"] - df_with_preds["Demand_Outturn"]

# Recalculate the LoLP using the Normal CDF (we only use the difference btw Total load and demand outturn)
# For instructions, we consulted:
# approximation since we do not have some of the information
# https://bscdocs.elexon.co.uk/category-3-documents/loss-of-load-probability-calculation-methodolgy-statement
df_with_preds['LoLP'] = 1 - norm.cdf(df_with_preds["Load-Demand"], loc=0, scale=np.sqrt(700))
# Calculate the LoLP lag 1 as proxy for prediction of not enough electricity for next day since the load and demand are super autocorellated
df_with_preds['LoLP_lag1'] = df_with_preds['LoLP'].shift(1).copy()


# Feature engineering to create wind+solar variable, ignoring NaNs (if there is NaN in one of them, the sum is not NaN)
df_with_preds["Wind_Solar"] = df_with_preds[["Solar", "Wind_Onshore", "Wind_Offshore"]].sum(axis=1, skipna=True)
# Sum all columns except 'GMT Time', ignoring NaNs
df_with_preds['Total_Generation'] = df_with_preds[['Biomass', 'Fossil_Gas', 'Fossil_Hard_Coal', 'Fossil_Oil',
                                                    'Hydro_Pumped_Storage', 'Hydro_Run-of-River_and_Poundage', 'Nuclear',
                                                    'Solar', 'Wind_Onshore', 'Wind_Offshore']].sum(axis=1, skipna=True)


# Total_Load = Total Generation + Exports - Imports - Stored Energy
# So we create a column that is the difference between exports, imports and stored energy
df_with_preds["Exports-Imports-Stored"] = df_with_preds["Total_Load"] - df_with_preds["Total_Generation"]
df_with_preds["Generation-Demand"] = df_with_preds["Total_Generation"] - df_with_preds["Demand_Outturn"]

# Recalculate the LoLP using Generation, using the Normal CDF (we only use the difference btw Total Generation and demand outturn)
# For instructions, we consulted:
# https://bscdocs.elexon.co.uk/category-3-documents/loss-of-load-probability-calculation-methodolgy-statement
df_with_preds['LoLP_Gen'] = 1 - norm.cdf(df_with_preds["Total_Generation"] - df_with_preds["Demand_Outturn"], loc=0, scale=np.sqrt(700))
# Calculate the LoLP lag 1 as proxy for prediction of not enough electricity for next day since the load and demand are super autocorellated
df_with_preds['LoLP_Gen_lag1'] = df_with_preds['LoLP_Gen'].shift(1).copy()


# Caolumn with Day Ahead Price but lag 48, since they are predictions for next day
df_with_preds['Day_Ahead_Price_lag48'] = df_with_preds['Day_Ahead_Price'].shift(48).copy()


# We want to keep some of the columns from the Prophet prediction of SP
## Filter columns that contain "seasonal_freq_" but do not end with "_lower" or "_upper"
filtered_columns = [col for col in prophet_arima_preds_SP.columns if "seasonal_freq_" in col and not col.endswith(("_lower", "_upper"))] + ["daily", "weekly","additive_terms", "trend"]



ValueError: Length of values (17616) does not match length of index (118368)

In [13]:
## Add the filtered columns from prophet_arima_preds_SP to df_with_preds
df_with_preds.loc[df_with_preds.index[-17616:], filtered_columns] = prophet_arima_preds_SP[filtered_columns].values


### Finally, we save the df with predictions for all columns to a csv for easier retrieval

In [14]:
df_with_preds.reset_index(inplace=True)
# In order to save the merged to csv for easier retrieval
df_with_preds.to_csv("merged_df_with_preds.csv", index=False)

In [16]:
df_with_preds.isna().mean().to_string()

'index                                0.000000\nSystem_Price                         0.000406\nNIV_Outturn                          0.000000\nBM_Bid_Acceptances                   0.000000\nBM_Offer_Acceptances                 0.000000\nBSAD_Turn_Up                         0.312998\nBSAD_Turn_Down                       0.312998\nBSAD_Total                           0.312998\nEPEX_Intraday_Volume                 0.000000\nLoss_of_Load_Prob                    0.000000\nTotal_Load                           0.000000\nDemand_Outturn                       0.000000\nBiomass                              0.000000\nFossil_Gas                           0.000000\nFossil_Hard_Coal                     0.000000\nFossil_Oil                           0.000000\nHydro_Pumped_Storage                 0.000000\nHydro_Run-of-River_and_Poundage      0.000000\nNuclear                              0.000000\nSolar                                0.000000\nWind_Onshore                         0.000000\nWind_Offshor