# Importing the data
### Setting comprehensible column names and the right dtypes

In [1]:
# Importing all the necessary packages
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from prophet import Prophet
from arch import arch_model
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA
from statsmodels.stats.diagnostic import het_arch
import pandas as pd
from pandas.errors import PerformanceWarning
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from prophet import Prophet
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from pmdarima.arima import ndiffs, nsdiffs
from statsmodels.tsa.arima.model import ARIMA


# For legibility, we mute some warnings
import warnings

# Ignore FutureWarning for deprecated 'T' frequency in Prophet
warnings.filterwarnings("ignore", category=FutureWarning, message="'T' is deprecated")

# Ignore PerformanceWarning from pandas
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

In [2]:
# Load the four raw CSV extracts (paths are relative to the working directory).
# Balancing data: a mix of different sources, mostly ESO
balancing_df = pd.read_csv("balancing_data.csv")
# Demand data, GB only
GB_demand_df = pd.read_csv("demand_load_data.csv")
# Generation data, GB only
GB_generation_df = pd.read_csv("generation_data.csv")
# Price data: EPEX prices only
EPEX_price_df = pd.read_csv("price_data.csv")

In [3]:
def rename_balancing_columns(df):
    """
    Give the balancing columns short names and coerce the value columns to numeric.

    Parameters:
    - df (pd.DataFrame): Raw balancing data; the timestamp column is named 'GMT Time'.

    Returns:
    - pd.DataFrame: A renamed frame in which every column except 'GMT Time' has been
      passed through pd.to_numeric(errors='coerce'), so unparseable entries become NaN.
    """
    # Long source headers -> short working names
    short_names = {
        'GMT Time': 'GMT Time',
        'System Price (ESO Outturn) - GB (£/MWh)': 'System_Price',
        'NIV Outturn (+ve long) - GB (MW)': 'NIV_Outturn',
        'BM Bid Acceptances (total) - GB (MW)': 'BM_Bid_Acceptances',
        'BM Offer Acceptances (total) - GB (MW)': 'BM_Offer_Acceptances',
        'Total BSAD Volume - Turn Up - GB (MW)': 'BSAD_Turn_Up',
        'Total BSAD Volume - Turn Down - GB (MW)': 'BSAD_Turn_Down',
        'Total BSAD Volume - Total - GB (MW)': 'BSAD_Total',
        'Intraday Volume (EPEX Outturn, APX, MID) - GB (MWh)': 'EPEX_Intraday_Volume'
    }

    # rename() returns a new frame, so the caller's frame is left untouched
    renamed = df.rename(columns=short_names)

    # Everything but the timestamp column is forced to a numeric dtype
    value_columns = [name for name in renamed.columns if name != 'GMT Time']
    for name in value_columns:
        renamed[name] = pd.to_numeric(renamed[name], errors='coerce')
    return renamed

# Rename the balancing columns and coerce the value columns to numeric,
# then list the resulting column names as a quick sanity check
balancing_df = rename_balancing_columns(balancing_df)

print("Final columns:")
print(balancing_df.columns.values)


Final columns:
['GMT Time' 'System_Price' 'NIV_Outturn' 'BM_Bid_Acceptances'
 'BM_Offer_Acceptances' 'BSAD_Turn_Up' 'BSAD_Turn_Down' 'BSAD_Total'
 'EPEX_Intraday_Volume']


In [4]:
def rename_demand_columns(df):
    """
    Shorten the demand column names and make every non-timestamp column numeric.

    Parameters:
    - df (pd.DataFrame): Raw demand/load data with a 'GMT Time' column.

    Returns:
    - pd.DataFrame: Renamed frame; all columns other than 'GMT Time' are converted with
      pd.to_numeric(errors='coerce'), turning unparseable entries into NaN.
    """
    # Long source headers -> short working names
    short_names = {
        'GMT Time': 'GMT Time',
        'Loss of Load Probability - Latest - GB ()': 'Loss_of_Load_Prob',
        'Actual Total Load - GB (MW)': 'Total_Load',
        'Demand Outturn (ITSDO) - GB (MW)': 'Demand_Outturn'
    }

    result = df.rename(columns=short_names)

    for name in result.columns:
        if name == 'GMT Time':
            # The timestamp column keeps its original values
            continue
        result[name] = pd.to_numeric(result[name], errors='coerce')
    return result

# Apply the renaming / numeric-coercion helper (no NaN filling happens at this stage)
GB_demand_df = rename_demand_columns(GB_demand_df)


print("Final columns:")
print(GB_demand_df.columns.values)

Final columns:
['GMT Time' 'Loss_of_Load_Prob' 'Total_Load' 'Demand_Outturn']


In [5]:
def rename_columns_generation(df):
    """
    Shorten the generation-by-type column names and coerce the value columns to numeric.

    A header such as 'Actual Aggregated Generation By Type - Fossil Gas - GB ...' becomes
    'Fossil_Gas'; headers that do not match that pattern are kept as they are.

    Like the other rename helpers in this notebook, this works on a renamed copy, so the
    frame passed in by the caller is not modified.

    Parameters:
    - df (pd.DataFrame): Raw generation data with a 'GMT Time' column.

    Returns:
    - pd.DataFrame: New frame with cleaned column names; every column except 'GMT Time'
      has gone through pd.to_numeric(errors='coerce') (unparseable entries become NaN).
    """
    generation_type = re.compile(r'Actual Aggregated Generation By Type - (.+?) - GB')

    def clean_column_name(col):
        # Pull the generation type out of the long header
        match = generation_type.search(col)
        if match:
            # Replace spaces with underscores for readability
            return match.group(1).replace(" ", "_")
        return col  # Leave the column name untouched if the pattern is absent

    # rename() returns a new frame instead of overwriting df.columns on the input
    df = df.rename(columns=clean_column_name)
    for column in df.columns:
        if column != 'GMT Time':  # Skip the timestamp column
            df[column] = pd.to_numeric(df[column], errors='coerce')

    return df

# Apply the function to rename columns in GB_generation_df
GB_generation_df = rename_columns_generation(GB_generation_df)


print("Final columns:")
print(GB_generation_df.columns.values)

Final columns:
['GMT Time' 'Biomass' 'Fossil_Gas' 'Fossil_Hard_Coal' 'Fossil_Oil'
 'Hydro_Pumped_Storage' 'Hydro_Run-of-River_and_Poundage' 'Nuclear'
 'Solar' 'Wind_Onshore' 'Wind_Offshore']


In [6]:
def rename_epex_columns(df):
    """
    Rename the EPEX price columns to short names and coerce them to numeric.

    Parameters:
    - df (pd.DataFrame): Raw EPEX price data with a 'GMT Time' column.

    Returns:
    - pd.DataFrame: Renamed frame; every column except 'GMT Time' is converted with
      pd.to_numeric(errors='coerce'), so unparseable entries become NaN.
    """
    # Hand-written mapping from source headers to working names
    short_names = {
        'GMT Time': 'GMT Time',
        'Day Ahead Price (EPEX half-hourly, local) - GB (LC/MWh)': 'Day_Ahead_Price',
        'Intraday Price (EPEX Outturn, APX, MID) - GB (£/MWh)': 'Intraday_Price'
    }

    prices = df.rename(columns=short_names)

    # Convert all price columns; the timestamp column is left alone
    numeric_targets = [label for label in prices.columns if label != 'GMT Time']
    for label in numeric_targets:
        prices[label] = pd.to_numeric(prices[label], errors='coerce')

    return prices

# Rename the EPEX price columns and coerce them to numeric,
# then list the resulting column names as a quick sanity check
EPEX_price_df = rename_epex_columns(EPEX_price_df)


print("Final columns:")
print(EPEX_price_df.columns.values)

Final columns:
['GMT Time' 'Day_Ahead_Price' 'Intraday_Price']


In [7]:
# Set 'GMT Time' as index for each dataframe so they can be joined on it.
# NOTE(review): these set_index calls are in place, so re-running this cell on its own
# fails because the 'GMT Time' column is no longer present - re-run from the load cell.
balancing_df.set_index('GMT Time', inplace=True)
GB_demand_df.set_index('GMT Time', inplace=True)
GB_generation_df.set_index('GMT Time', inplace=True)
EPEX_price_df.set_index('GMT Time', inplace=True)

# Merge using index; how='inner' keeps only timestamps present in all four frames.
# NOTE(review): 'GMT Time' is never parsed to datetime before this point, so the join
# key is whatever read_csv produced (presumably strings) - confirm the formats match.
merged_df = balancing_df.join([GB_demand_df, GB_generation_df, EPEX_price_df], how='inner')
# We put back the datetime column into the merged DF and rename it for practicality
merged_df.reset_index(inplace=True)
merged_df.rename(columns={'GMT Time': 'Datetime'}, inplace=True)

print("Merged columns:")
print(merged_df.columns.values)

Merged columns:
['Datetime' 'System_Price' 'NIV_Outturn' 'BM_Bid_Acceptances'
 'BM_Offer_Acceptances' 'BSAD_Turn_Up' 'BSAD_Turn_Down' 'BSAD_Total'
 'EPEX_Intraday_Volume' 'Loss_of_Load_Prob' 'Total_Load' 'Demand_Outturn'
 'Biomass' 'Fossil_Gas' 'Fossil_Hard_Coal' 'Fossil_Oil'
 'Hydro_Pumped_Storage' 'Hydro_Run-of-River_and_Poundage' 'Nuclear'
 'Solar' 'Wind_Onshore' 'Wind_Offshore' 'Day_Ahead_Price' 'Intraday_Price']


# Helper Functions

In [None]:
def calculate_fft(series, n_top_seasonalities, threshold_pc=0.02, samples_per_day=48):
    """
    Find the dominant positive frequencies of a series with the Fast Fourier Transform (FFT).

    A frequency is kept when its amplitude exceeds `threshold_pc` times the largest
    amplitude in the spectrum; of those, at most `n_top_seasonalities` are returned,
    strongest first.

    Parameters:
    - series (pd.Series): The input time series data. NaNs are dropped before the
      transform, so the remaining samples are treated as evenly spaced.
    - n_top_seasonalities (int): The maximum number of significant frequencies to return.
    - threshold_pc (float): Fraction (0 < threshold_pc <= 1) of the maximum amplitude that
      a frequency must exceed to count as significant.
    - samples_per_day (int or float): Number of observations per day, used to express
      frequencies in cycles per day. Defaults to 48 (half-hourly data).

    Returns:
    - list of (frequency, amplitude) tuples, sorted by decreasing amplitude. The list is
      empty when the series has no non-NaN values. A list (rather than a one-shot zip)
      lets callers iterate over the result more than once.
    """
    values = np.asarray(series.dropna(), dtype=float)
    if values.size == 0:
        # np.fft.fft / np.max cannot handle an empty input
        return []

    # Compute fast Fourier transform
    price_fft = np.fft.fft(values)

    # Frequencies corresponding to the FFT coefficients, in cycles per day
    freqs = np.fft.fftfreq(len(price_fft), d=1 / samples_per_day)

    # Calculate amplitudes
    amplitudes = np.abs(price_fft)

    # NOTE(review): the maximum is taken over all bins, including the zero-frequency
    # (sum of the series) term, so a large mean level raises the threshold.
    threshold = threshold_pc * np.max(amplitudes)

    # Keep positive frequencies whose amplitude is above the threshold
    significant = (amplitudes > threshold) & (freqs > 0)
    positive_freqs = freqs[significant]
    positive_amplitudes = amplitudes[significant]

    # Strongest first; slicing already copes with fewer candidates than requested
    strongest_first = np.argsort(positive_amplitudes)[::-1]
    selected = strongest_first[:n_top_seasonalities]

    return list(zip(positive_freqs[selected], positive_amplitudes[selected]))


def prophet_predictions(series, freq_amp, horizon=48, step='30min'):
    """
    Fit Prophet with one custom seasonality per significant frequency and predict.

    Parameters:
    - series (pd.Series): The input time series; its index supplies Prophet's 'ds' column
      and its values the 'y' column.
    - freq_amp (iterable of tuples): (frequency, amplitude) pairs. Each non-zero frequency
      is turned into a seasonality with period 1 / frequency, which is passed to Prophet
      as a period in days (so frequencies are assumed to be in cycles per day). The
      amplitude is not used here.
    - horizon (int): Number of future steps to append after the history. Defaults to 48.
    - step (str): pandas frequency string for the future steps. Defaults to '30min'
      (the non-deprecated spelling of the former '30T').

    Returns:
    - pd.Series: Prophet's 'yhat' indexed by 'ds', covering the dates returned by
      make_future_dataframe (history plus `horizon` future steps).
    """
    # Prepare data for Prophet
    df = pd.DataFrame({'ds': series.index, 'y': series})
    model = Prophet()

    # Adding seasonalities based on significant frequencies
    for frequency, _amplitude in freq_amp:
        if frequency != 0:  # Ignore the DC component
            period_in_days = 1 / frequency
            seasonality_name = f"seasonal_freq_{frequency:.4f}"
            # Longer periods get more Fourier terms: <=1 day -> 5, <=7 days -> 10, else 20
            fourier_order = 5 if period_in_days <= 1 else (10 if period_in_days <= 7 else 20)
            model.add_seasonality(name=seasonality_name, period=period_in_days, fourier_order=fourier_order)

    model.fit(df)
    future = model.make_future_dataframe(periods=horizon, freq=step)
    forecast = model.predict(future)

    return forecast.set_index('ds')['yhat']

def statsforecast_arima(df, forecast_horizon=48):
    """
    Fit an AutoARIMA model (statsforecast) to a series and return fitted plus forecast values.

    Parameters:
    - df (pd.Series): The input time series. Its index is used as the 'ds' column and its
      values as 'y'; the model is configured with freq='30min'.
    - forecast_horizon (int): Number of future steps to forecast. Defaults to 48
      (24 hours at 30-minute intervals).

    Returns:
    - pd.Series: The 'AutoARIMA' column of the in-sample fitted values followed by the
      forecast, indexed by 'ds'.
    """
    # statsforecast expects long format: unique_id / ds / y
    frame = pd.DataFrame({'unique_id': 1, 'ds': df.index, 'y': df.values})

    # Initialize the StatsForecast object with the AutoARIMA model
    sf = StatsForecast(models=[AutoARIMA()], freq='30min', n_jobs=-1)

    # forecast(..., fitted=True) fits the model itself and keeps the in-sample values,
    # so no separate (data-less) fit() call is needed beforehand.
    forecast = sf.forecast(df=frame, h=forecast_horizon, fitted=True)
    values = sf.forecast_fitted_values()

    values = values.set_index('ds')
    forecast = forecast.set_index('ds')
    result = pd.concat([values, forecast])
    return result["AutoARIMA"]


def ensemble_model(series, fft_threshold):
    """
    Generate an ensemble forecast: Prophet on the series plus ARIMA on Prophet's residuals.

    Parameters:
    - series (pd.Series): The input time series data with DateTimeIndex.
    - fft_threshold (int): Despite its name, this value is forwarded to calculate_fft as
      `n_top_seasonalities`, i.e. the maximum number of FFT frequencies handed to Prophet
      as seasonalities (the amplitude threshold keeps calculate_fft's default).

    Returns:
    - DataFrame: Columns 'original_data', 'prophet_forecast', 'arima_residual_forecast'
      and 'combined_forecast', aligned on the union of their indexes.
    """
    # Step 1: Calculate predictions with Prophet
    freq_amp_pairs = calculate_fft(series, n_top_seasonalities=fft_threshold)
    preds_prophet = prophet_predictions(series, freq_amp_pairs)

    # Step 2: Calculate Residuals (NaN wherever the series itself is NaN or absent)
    residuals = series - preds_prophet

    # Step 3: Fit ARIMA on the Residuals
    arima_forecast = statsforecast_arima(residuals.dropna())  # Ensure non-na data for ARIMA

    # Step 4: Combine the Predictions of Prophet and ARIMA.
    # fill_value=0 keeps the Prophet value where ARIMA has no entry (timestamps whose
    # residual was dropped) instead of turning the combined forecast into NaN.
    combined_forecast = preds_prophet.add(arima_forecast, fill_value=0)

    # Prepare the result DataFrame
    results = pd.DataFrame({
        'original_data': series,
        'prophet_forecast': preds_prophet,
        'arima_residual_forecast': arima_forecast,
        'combined_forecast': combined_forecast
    })

    return results


In [9]:
# Function to find the maximum number of consecutive NaNs filled in a column
# As Angelica Asked
def max_consecutive_nans_filled(df, column):
    """
    Print how many NaNs a column holds and the length of its longest run of consecutive NaNs.

    Parameters:
    - df (pd.DataFrame): Frame containing the column.
    - column: Label of the column to inspect.

    Returns:
    - None. The two figures are only printed.
    """
    is_missing = df[column].isna()

    # The running count of non-NaN values stays constant across a stretch of NaNs,
    # so it serves as a group key: summing the NaN flags per key gives each run's length.
    run_key = df[column].notna().cumsum()
    run_lengths = is_missing.astype(int).groupby(run_key).sum()

    longest_run = run_lengths.max()
    total_missing = is_missing.sum()

    print(f"NaNs in {column}: {total_missing}")
    print(f"Max consecutive NaNs filled for '{column}': {longest_run}")
    return

def fill_nans_with_prophet(series):
    """
    Fill the NaNs of a time series with predictions from the Prophet model.

    Parameters:
    - series (pd.Series): The input time series data with potential NaNs.

    Returns:
    - pd.Series: A series with the same index as the input, in which NaNs have been
      replaced by Prophet's prediction for the same index label. If the input has no
      NaNs it is returned unchanged (same object).
    """
    # Nothing to fill: skip the model fit
    if not series.isna().any():
        return series

    # Significant frequencies become Prophet seasonalities
    freq_amp = calculate_fft(series, n_top_seasonalities=12, threshold_pc=0.02)

    # Predictions cover the history plus extra future steps
    predictions = prophet_predictions(series, freq_amp)

    # fillna aligns on the index and keeps only the input's own rows, so the future
    # steps in `predictions` are not appended to the result.
    # NOTE(review): alignment is by index label - if `series` is indexed by strings while
    # the predictions are indexed by timestamps, nothing will match; confirm the index
    # is datetime before relying on this.
    return series.fillna(predictions)
def process_dataframe(df):
    """
    Apply Prophet-based NaN filling to every numeric column of the DataFrame.

    The column named "Datetime" and any column with a non-numeric dtype are skipped.
    The frame is modified in place and also returned.

    Parameters:
    - df (pd.DataFrame): The DataFrame with multiple time series columns, potentially containing NaNs.

    Returns:
    - pd.DataFrame: The same DataFrame with NaNs filled where possible.
    """
    for column in df.columns:
        # Prophet needs numeric values, so check the dtype rather than only the name
        if column != "Datetime" and pd.api.types.is_numeric_dtype(df[column]):
            print(f"Processing column: {column}")
            df[column] = fill_nans_with_prophet(df[column])
        else:
            print(f"Skipping column: {column} (non-numeric data)")
    return df

## Filling in the NaNs

### First, we fill in the columns that can be derived from other columns, using:
### NIV_Outturn = - (BM_Bid_Acceptances + BM_Offer_Acceptances)
### and
### BSAD_Total = BSAD_Turn_Down + BSAD_Turn_Up

In [10]:
# Rule-based repairs using the identities between columns. Missing entries are NaN here
# because the rename helpers coerced unparseable values (presumably "No Data Available"
# strings in the raw CSV) with pd.to_numeric(errors='coerce').
# If all three BSAD columns are missing in a row, no rule below matches and they stay NaN.
# The statements are order-dependent: later rules read columns written by earlier ones.

# Set a missing "BSAD_Turn_Up" to 0 where "BSAD_Total" equals "BSAD_Turn_Down"
merged_df.loc[(merged_df["BSAD_Turn_Up"].isna()) & (merged_df["BSAD_Total"] == merged_df["BSAD_Turn_Down"]), "BSAD_Turn_Up"] = 0

# Set a missing "BSAD_Turn_Down" to 0 where "BSAD_Total" equals "BSAD_Turn_Up"
merged_df.loc[(merged_df["BSAD_Turn_Down"].isna()) & (merged_df["BSAD_Total"] == merged_df["BSAD_Turn_Up"]), "BSAD_Turn_Down"] = 0

# Treat a 'NIV_Outturn' of exactly 0 as missing when both 'BM_Bid_Acceptances' and
# 'BM_Offer_Acceptances' are NaN
merged_df.loc[(merged_df['NIV_Outturn'] == 0) & merged_df['BM_Bid_Acceptances'].isna() & merged_df['BM_Offer_Acceptances'].isna(), 'NIV_Outturn'] = np.nan

# Where 'NIV_Outturn' is 0 and both acceptance columns are present, replace it with
# the negative of their sum (the right-hand side is aligned to the selected rows by index)
merged_df.loc[(merged_df['NIV_Outturn'] == 0) & merged_df['BM_Offer_Acceptances'].notna() & merged_df['BM_Bid_Acceptances'].notna(), 'NIV_Outturn'] = -(merged_df['BM_Offer_Acceptances'] + merged_df['BM_Bid_Acceptances'])

# Derive a missing 'BM_Bid_Acceptances' as -NIV_Outturn - BM_Offer_Acceptances.
# Only derived values <= 0 are accepted (.where turns the others into NaN); in every row
# where the bid is still NaN afterwards, 'BM_Offer_Acceptances' is set to NaN as well.
bid_values = -merged_df['NIV_Outturn'] - merged_df['BM_Offer_Acceptances']
merged_df.loc[merged_df['BM_Bid_Acceptances'].isna() & merged_df['NIV_Outturn'].notna(), 'BM_Bid_Acceptances'] = bid_values.where(bid_values <= 0)
merged_df.loc[merged_df['BM_Bid_Acceptances'].isna(), 'BM_Offer_Acceptances'] = np.nan

# Symmetric step for 'BM_Offer_Acceptances': derive it as -NIV_Outturn - BM_Bid_Acceptances,
# accept only values >= 0, and blank the bid in every row where the offer is still NaN.
offer_values = -merged_df['NIV_Outturn'] - merged_df['BM_Bid_Acceptances']
merged_df.loc[merged_df['BM_Offer_Acceptances'].isna() & merged_df['NIV_Outturn'].notna(), 'BM_Offer_Acceptances'] = offer_values.where(offer_values >= 0)
merged_df.loc[merged_df['BM_Offer_Acceptances'].isna(), 'BM_Bid_Acceptances'] = np.nan

### Mark the rows where there are missing values for each variable

In [11]:
# Add a 0/1 '<column>_missing' indicator for every column currently in the frame
# (this includes 'Datetime', which is still a regular column at this point).
# Re-running the cell would also flag the indicator columns themselves.
for column in list(merged_df.columns):
    missing_flag = merged_df[column].isna().astype(int)
    merged_df[f'{column}_missing'] = missing_flag

In [12]:
# Index by timestamp so each column can be handed to Prophet as a series keyed by 'Datetime'
merged_df = merged_df.set_index("Datetime")

### Then, we fill the other NaNs using Prophet

In [13]:
# Fill the remaining NaNs column by column with Prophet.
# An explicit copy is passed because process_dataframe assigns columns on its argument;
# this keeps merged_df itself untouched without relying on slice semantics.
# This cell is slow: the logged run below took roughly three hours.
processed_df = process_dataframe(merged_df.copy())
print(processed_df.head())

Processing column: System_Price


17:47:31 - cmdstanpy - INFO - Chain [1] start processing
17:56:09 - cmdstanpy - INFO - Chain [1] done processing


Processing column: NIV_Outturn


17:57:40 - cmdstanpy - INFO - Chain [1] start processing
17:59:57 - cmdstanpy - INFO - Chain [1] done processing


Processing column: BM_Bid_Acceptances


18:01:01 - cmdstanpy - INFO - Chain [1] start processing
18:06:52 - cmdstanpy - INFO - Chain [1] done processing


Processing column: BM_Offer_Acceptances


18:08:01 - cmdstanpy - INFO - Chain [1] start processing
18:23:22 - cmdstanpy - INFO - Chain [1] done processing


Processing column: BSAD_Turn_Up


18:24:38 - cmdstanpy - INFO - Chain [1] start processing
18:26:29 - cmdstanpy - INFO - Chain [1] done processing


Processing column: BSAD_Turn_Down


18:27:52 - cmdstanpy - INFO - Chain [1] start processing
18:30:56 - cmdstanpy - INFO - Chain [1] done processing


Processing column: BSAD_Total


18:32:07 - cmdstanpy - INFO - Chain [1] start processing
18:35:17 - cmdstanpy - INFO - Chain [1] done processing


Processing column: EPEX_Intraday_Volume


18:36:25 - cmdstanpy - INFO - Chain [1] start processing
18:38:05 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Loss_of_Load_Prob


18:39:01 - cmdstanpy - INFO - Chain [1] start processing
18:39:43 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Total_Load


18:40:40 - cmdstanpy - INFO - Chain [1] start processing
18:45:06 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Demand_Outturn


18:46:00 - cmdstanpy - INFO - Chain [1] start processing
18:51:46 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Biomass


18:53:21 - cmdstanpy - INFO - Chain [1] start processing
18:57:11 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Fossil_Gas


18:58:24 - cmdstanpy - INFO - Chain [1] start processing
19:02:12 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Fossil_Hard_Coal


19:03:42 - cmdstanpy - INFO - Chain [1] start processing
19:19:59 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Fossil_Oil


19:22:07 - cmdstanpy - INFO - Chain [1] start processing
19:23:57 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Hydro_Pumped_Storage


19:24:42 - cmdstanpy - INFO - Chain [1] start processing
19:28:35 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Hydro_Run-of-River_and_Poundage


19:30:17 - cmdstanpy - INFO - Chain [1] start processing
19:42:47 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Nuclear


19:45:29 - cmdstanpy - INFO - Chain [1] start processing
20:08:08 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Solar


20:09:27 - cmdstanpy - INFO - Chain [1] start processing
20:14:14 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Wind_Onshore


20:16:05 - cmdstanpy - INFO - Chain [1] start processing
20:28:27 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Wind_Offshore


20:30:25 - cmdstanpy - INFO - Chain [1] start processing
20:37:26 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Day_Ahead_Price


20:39:15 - cmdstanpy - INFO - Chain [1] start processing
20:45:48 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Intraday_Price


20:47:35 - cmdstanpy - INFO - Chain [1] start processing
20:54:17 - cmdstanpy - INFO - Chain [1] done processing


Processing column: Datetime_missing
Processing column: System_Price_missing
Processing column: NIV_Outturn_missing
Processing column: BM_Bid_Acceptances_missing
Processing column: BM_Offer_Acceptances_missing
Processing column: BSAD_Turn_Up_missing
Processing column: BSAD_Turn_Down_missing
Processing column: BSAD_Total_missing
Processing column: EPEX_Intraday_Volume_missing
Processing column: Loss_of_Load_Prob_missing
Processing column: Total_Load_missing
Processing column: Demand_Outturn_missing
Processing column: Biomass_missing
Processing column: Fossil_Gas_missing
Processing column: Fossil_Hard_Coal_missing
Processing column: Fossil_Oil_missing
Processing column: Hydro_Pumped_Storage_missing
Processing column: Hydro_Run-of-River_and_Poundage_missing
Processing column: Nuclear_missing
Processing column: Solar_missing
Processing column: Wind_Onshore_missing
Processing column: Wind_Offshore_missing
Processing column: Day_Ahead_Price_missing
Processing column: Intraday_Price_missing
  

### Save the filled df in case we need to retrieve it

In [17]:
# Save the filled frame to CSV for easier retrieval.
# reset_index turns the 'Datetime' index back into a regular column so that it is
# written to the file (index=False drops the index on write).
processed_df.reset_index(inplace=True)
processed_df.to_csv("merged_df_prophet_filled.csv", index=False)

# STOP HERE


# STOP

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably it is here on
# purpose to halt "Run All" before the forecasting section - confirm before removing.
break

## Create Data for the next 48 observations

In [4]:
# Reload the Prophet-filled frame written by the save cell above ('Datetime' is a plain
# column again at this point and is not parsed as datetime by this call)
merged_df = pd.read_csv("merged_df_prophet_filled.csv")

In [8]:
# For predictions of different variables, we use different lengths of history for
# training the ensemble model.
# Index by 'Datetime' and convert the index to real timestamps.
merged_df.set_index('Datetime', inplace=True)
merged_df.index = pd.to_datetime(merged_df.index)

# Subset from 2023-10-01 (inclusive) up to 2025-01-01 (exclusive).
# NOTE(review): merged_df_2024 is not referenced by the cells visible below - confirm it is still needed.
merged_df_2024 = merged_df[(merged_df.index >= '2023-10-01') & (merged_df.index < '2025-01-01')] 

In [9]:
# Create 48 empty half-hourly rows (24 hours) to receive the generated predictions.
# The range starts one step after the last timestamp in merged_df.
future_index = pd.date_range(start=merged_df.index[-1] + pd.Timedelta(minutes=30), periods=48, freq='30min')
# Index-only placeholder: it adds the rows without introducing an extra dummy column
date_range_df_temp = pd.DataFrame(index=future_index)
# Concatenate without resetting the index, preserving the datetime index;
# the new rows are NaN in every existing column until predictions are written in
df_with_preds = pd.concat([merged_df, date_range_df_temp])

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Function to handle the prediction for a single column
def predict_for_column(column):
    """
    Forecast the next 48 observations of one column with the Prophet + ARIMA ensemble.

    Parameters:
    - column: Label of a column in merged_df.

    Returns:
    - (column, predictions) where predictions is an array holding the last 48 values of
      the ensemble's 'combined_forecast', or None for "System_Price", which is skipped.
    """
    if column == "System_Price":
        return None
    # ensemble_model returns the frame that holds 'combined_forecast'; it expects a
    # Series, hence merged_df[column] rather than a one-column DataFrame.
    # 45 is forwarded as the maximum number of FFT seasonalities.
    prophet_arima_preds = ensemble_model(merged_df[column], 45)
    # Extracting the predictions for the next 48 observations
    predictions = prophet_arima_preds['combined_forecast'].iloc[-48:].values
    return column, predictions

# Parallelizing the process.
# NOTE(review): merged_df was reloaded from the saved CSV, which also contains the
# '<column>_missing' indicator columns, so those are forecast too - confirm that is intended.
with ThreadPoolExecutor() as executor:
    # Run the prediction function for each column in parallel
    results = list(executor.map(predict_for_column, merged_df.columns))

# Update df_with_preds with predictions (last 48 rows are the appended future rows)
for result in results:
    if result is not None:  # None marks a skipped column
        column, predictions = result
        df_with_preds.loc[df_with_preds.index[-48:], column] = predictions

# Display the updated DataFrame
print(df_with_preds.head())

45
45
45
45
45
15
45
6
45
5
41
32


02:18:58 - cmdstanpy - INFO - Chain [1] start processing
02:20:15 - cmdstanpy - INFO - Chain [1] start processing


45


02:31:16 - cmdstanpy - INFO - Chain [1] done processing


45
45
45


## Feature Engineering

In [None]:
# Date related booleans
import holidays

# Get British holidays
uk_holidays = holidays.UnitedKingdom()

df_with_preds['Datetime'] = df_with_preds.index

# Add a boolean column for British holidays.
# The holidays object is keyed by calendar date and filled in as years are looked up,
# so each row's date is tested individually instead of using isin() on raw timestamps.
df_with_preds['is_british_holiday'] = (
    df_with_preds['Datetime'].dt.date.map(lambda day: day in uk_holidays).astype(bool)
)

print(df_with_preds.head())
df_with_preds['day_of_week'] = df_with_preds['Datetime'].dt.dayofweek
df_with_preds['is_weekday'] = df_with_preds['Datetime'].dt.weekday < 5  # Monday (0) to Friday (4)
df_with_preds['is_weekend'] = df_with_preds['day_of_week'] >= 5  # Saturday and Sunday

df_with_preds['hour_of_day'] = df_with_preds['Datetime'].dt.hour
df_with_preds['is_peak_hour'] = df_with_preds['hour_of_day'].isin([7, 8, 9, 18, 19, 20])  # Example peak hours

# Month, Quarter, and Seasons
df_with_preds['month'] = df_with_preds['Datetime'].dt.month
df_with_preds['quarter'] = df_with_preds['Datetime'].dt.quarter

# Disabled: Daylight Saving Time flag (needs `import pytz` before it can be enabled)
# def is_dst(date):
#     return bool(pytz.timezone('Europe/London').dst(date))
# df_with_preds['is_dst'] = df_with_preds['Datetime'].apply(is_dst)

# A working day is neither a British holiday nor a weekend day
df_with_preds['is_working_day'] = (~df_with_preds['is_british_holiday']) & (~df_with_preds['is_weekend'])

def get_season(date):
    """
    Map a timestamp to its season using fixed calendar boundaries.

    Boundaries (inclusive, whole days, every year):
    - Winter: 21 December - 20 March (wraps around the new year)
    - Spring: 21 March - 20 June
    - Summer: 21 June - 22 September
    - Fall:   23 September - 20 December

    Only month and day are compared, so January-March dates are recognised as Winter
    and the time of day on a boundary date does not matter.

    Parameters:
    - date: A timestamp-like object exposing .month and .day.

    Returns:
    - str: 'Winter', 'Spring', 'Summer' or 'Fall'; 'Unknown' for a missing value (NaT/NaN).
    """
    if pd.isna(date):
        return 'Unknown'

    month_day = (date.month, date.day)
    # Winter spans the year boundary, so it is the union of both ends of the calendar
    if month_day >= (12, 21) or month_day <= (3, 20):
        return 'Winter'
    if month_day <= (6, 20):
        return 'Spring'
    if month_day <= (9, 22):
        return 'Summer'
    return 'Fall'

# Add a season column
df_with_preds['season'] = df_with_preds['Datetime'].apply(get_season)

# Christmas period: 20-31 December of every year in the data, whole days included
# (not tied to a single hard-coded year)
df_with_preds['is_christmas_season'] = (
    (df_with_preds['Datetime'].dt.month == 12) & (df_with_preds['Datetime'].dt.day >= 20)
)
# Summer vacation: July and August
df_with_preds['is_summer_vacation'] = df_with_preds['Datetime'].dt.month.isin([7, 8])
# Reset Datetime as the index
df_with_preds = df_with_preds.set_index('Datetime')

In [None]:
# Gap between total load and demand outturn
df_with_preds["Load-Demand"] = df_with_preds["Total_Load"].sub(df_with_preds["Demand_Outturn"])

# Approximate LoLP with a Normal CDF of the load-demand gap (mean 0, variance 700),
# an approximation since not all inputs of the official methodology are available:
# https://bscdocs.elexon.co.uk/category-3-documents/loss-of-load-probability-calculation-methodolgy-statement
df_with_preds['LoLP'] = 1 - norm.cdf(df_with_preds["Load-Demand"], loc=0, scale=np.sqrt(700))
# Previous row's LoLP, used as a proxy predictor because load and demand are strongly autocorrelated
# NOTE(review): shift(1) lags by one row, not by one day - confirm this matches the intent
df_with_preds['LoLP_lag1'] = df_with_preds['LoLP'].shift(1)


# Wind + solar output; NaNs are ignored, so a NaN in one source does not make the sum NaN
df_with_preds["Wind_Solar"] = df_with_preds[["Solar", "Wind_Onshore", "Wind_Offshore"]].sum(axis=1)
# Total generation: sum of the ten generation-by-type columns, ignoring NaNs
df_with_preds['Total_Generation'] = df_with_preds[[
    'Biomass', 'Fossil_Gas', 'Fossil_Hard_Coal', 'Fossil_Oil',
    'Hydro_Pumped_Storage', 'Hydro_Run-of-River_and_Poundage', 'Nuclear',
    'Solar', 'Wind_Onshore', 'Wind_Offshore',
]].sum(axis=1)


# Total_Load = Total Generation + Exports - Imports - Stored Energy,
# so load minus generation captures the net of exports, imports and stored energy
df_with_preds["Exports-Imports-Stored"] = df_with_preds["Total_Load"].sub(df_with_preds["Total_Generation"])
df_with_preds["Generation-Demand"] = df_with_preds["Total_Generation"].sub(df_with_preds["Demand_Outturn"])

# Same Normal-CDF approximation, this time on the generation-demand gap
# https://bscdocs.elexon.co.uk/category-3-documents/loss-of-load-probability-calculation-methodolgy-statement
df_with_preds['LoLP_Gen'] = 1 - norm.cdf(df_with_preds["Generation-Demand"], loc=0, scale=np.sqrt(700))
# Previous row's generation-based LoLP (same one-row lag as LoLP_lag1)
df_with_preds['LoLP_Gen_lag1'] = df_with_preds['LoLP_Gen'].shift(1)


# Day-ahead price lagged by 48 rows, since day-ahead prices are predictions for the next day
df_with_preds['Day_Ahead_Price_lag48'] = df_with_preds['Day_Ahead_Price'].shift(48)

ValueError: Length of values (17616) does not match length of index (118368)

### Finally, we save the df with predictions for all columns to a csv for easier retrieval

In [None]:
# Turn the 'Datetime' index back into a column so it is written to the file
df_with_preds.reset_index(inplace=True)
# Save the frame with predictions and engineered features to CSV for easier retrieval
df_with_preds.to_csv("merged_df_with_preds.csv", index=False)