# **1. Import Libraries**

In [None]:
# Enable IPython's autoreload extension so that edited modules are picked up
# without restarting the kernel; mode 2 reloads all modules before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

## Little hack to import the utils module into the notebook
##################################################################
# Temporarily add the parent directory of the current working directory to
# sys.path so that the `src` package can be imported, then restore the path.
# The restore happens in a `finally` clause so sys.path is reset even when
# the import itself fails.
original_sys_path = sys.path.copy()
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)
try:
    # NOTE(review): this star import is the only import of project/third-party
    # names in the notebook, so names such as pd, go and LinearRegression used
    # in later cells presumably come from here.
    from src.utils import *
finally:
    sys.path = original_sys_path  # Reset the path to the original
##################################################################

# **2. Load Data**

In [None]:
# Containers that collect every model's predictions and evaluation metrics
predictions, metrices = {}, {}

# Window length used by the window-average baseline
horizon = 24

# Read the raw spot-price CSV and preprocess it
file_path = '../data/spot_prices_fi_2016_2023.csv'
processed_data = preprocess_data(
    file_path,
    date_col="date",
    price_col="elspot-fi",
)

In [None]:
# Display the preprocessed data (the notebook renders the whole frame object;
# for long frames the output is typically a truncated preview)
processed_data

In [None]:
# Drop the first two rows: because of the UTC conversion in data preprocessing
# they are the only observations dated 2015
# NOTE(review): the original note said "only one row for 2015" while two rows
# are removed — confirm against the preprocessed index.
processed_data = processed_data.iloc[2:]

In [None]:
# Interactive line chart of the whole spot-price series
fig = go.Figure(
    data=[
        go.Scatter(
            x=processed_data.index,
            y=processed_data.y,
            mode='lines',
            name='Spot Price',
        )
    ]
)
# Titles plus a range slider for zooming along the time axis
fig.update_layout(
    xaxis_rangeslider_visible=True,
    title='Spot Price Over Time',
    xaxis_title='Time',
    yaxis_title='Spot Price',
)
fig.show()

In [None]:
# Work on a copy so that processed_data itself stays available unchanged
df = processed_data.copy()

# **3. Model Training**

#### **3.1. Simple Baseline Models**

In [None]:
# Naive baseline evaluated with the year-on-year scheme (refit enabled)
na = NaiveModel()
result_na, na_metrics = year_on_year_training(df, na, refit=True)

# Register the results under the model's display name
metrices['Naive'] = na_metrics
predictions['Naive'] = [{'predictions': result_na}]
print(na_metrics)

In [None]:
# Plot the Naive model's predictions together with the actual series df['y']
plot_spot_price_predictions(df['y'], result_na, 'Naive Model')

In [None]:
# Historical-average baseline evaluated with the year-on-year scheme (refit enabled)
ha = HistoricalAverageModel()
result_ha, ha_metrics = year_on_year_training(df, ha, refit=True)

# Register the results under the model's display name
metrices['Historical Average'] = ha_metrics
predictions['Historical Average'] = [{'predictions': result_ha}]
print(ha_metrics)

In [None]:
# Plot the Historical Average model's predictions together with the actual series df['y']
plot_spot_price_predictions(df['y'], result_ha, 'Historical Average Model')

In [None]:
# Window-average baseline whose window size equals `horizon` (24 hours)
wa = WindowAverageModel(window_size=horizon)

# Year-on-year evaluation with refit enabled
wa_predictions, wa_metrics = year_on_year_training(df, wa, refit=True)

# Register the results under the model's display name
metrices['Window Average'] = wa_metrics
predictions['Window Average'] = [{'predictions': wa_predictions}]
print(wa_metrics)

In [None]:
# Plot the Window Average model's predictions together with the actual series df['y']
plot_spot_price_predictions(df['y'], wa_predictions, 'Window Average Model')

In [None]:
# Exponential-average baseline with smoothing parameter alpha = 0.2
ea = ExponentialAverage(alpha=0.2)

# Year-on-year evaluation with refit enabled
ea_predictions, ea_metrics = year_on_year_training(df, ea, refit=True)

# Register the results under the model's display name
metrices['Exponential Average'] = ea_metrics
predictions['Exponential Average'] = [{'predictions': ea_predictions}]
print(ea_metrics)

In [None]:
# Plot the Exponential Average model's predictions together with the actual series df['y']
plot_spot_price_predictions(df['y'], ea_predictions, 'Exponential Average Model')

#### **3.2. Linear Regression with Time Component**

In [None]:
# Derive the time features from df
# (presumably weekday/weekend indicators, judging by the coefficient keywords
# plotted below — TODO confirm against extract_time_features)
df_time = extract_time_features(df)

In [None]:
# Linear regression on the time features only; no intercept is fitted
lr_time_features = LinearRegression(fit_intercept=False)

# Year-on-year training; for this model the helper returns three values:
# predictions, metrics and the fitted coefficients
(
    lr_time_predictions,
    lr_time_metrics,
    lr_time_features_coeffs,
) = year_on_year_training(df_time, lr_time_features)

# Register the results under the model's display name
metrices['LR (Time Component)'] = lr_time_metrics
predictions['LR (Time Component)'] = [{'predictions': lr_time_predictions}]

In [None]:
# Print the evaluation metrics of the time-component linear regression
print(lr_time_metrics)

In [None]:
# Plot the coefficients selected by the "weekday" keyword for the time-component model
# (presumably one set of coefficients per training year — TODO confirm in the helper)
plot_year_over_year_coefficients(lr_time_features_coeffs, keyword="weekday", model_name='LR (Time Component)')

In [None]:
# Plot the coefficients selected by the "weekend" keyword for the time-component model
plot_year_over_year_coefficients(lr_time_features_coeffs, keyword="weekend", model_name='LR (Time Component)')

#### **3.3. Download External Features from Fingrid**

Note: Uncomment the code to download the external features from Fingrid.

In [None]:
# # Get the available data types
# datasets = fetch_data("datasets", params = {'pageSize': 20000, 'orderBy': 'id'})

In [None]:
# # Get the dataset information and descriptions
# for dataset in datasets:
#     print(f"{dataset['id']} - {dataset['nameEn']} ({dataset['dataPeriodEn']})")
#     print("Description:", dataset['descriptionEn'])
#     print("-"*20)

In [None]:
# # download datasets by id
# start_time = df.index[0].strftime('%Y-%m-%dT%H:%M:%S.000Z')
# end_time = df.index[-1].strftime('%Y-%m-%dT%H:%M:%S.000Z')
# dataset_ids = ['247']
# for dataset_id in dataset_ids:
#     # check if dataset is already downloaded
#     if os.path.exists(f'../data/{dataset_id}.csv'):
#         print(f'Dataset {dataset_id} already downloaded.')
#         print("-"*20)
#         continue
#     try:
#         data = fetch_data("data", params = {'datasets': dataset_id, 'startTime': start_time, 'endTime': end_time, 'format': 'json', 'oneRowPerTimePeriod': 'true', 'pageSize': 20000, 'locale': 'en', 'sortBy': 'startTime', 'sortOrder': 'asc'})
#     except Exception as e:
#         print(f"Error fetching dataset {dataset_id}: {e}")
#         print("-"*20)
#         continue

#     # convert data to dataframe
#     data = pd.DataFrame(data)
#     data['startTime'] = pd.to_datetime(data['startTime'])
#     data.set_index('startTime', inplace=True)
#     # drop endTime
#     data.drop(columns='endTime', inplace=True)
#     data.sort_index(inplace=True)
#     # localize index to none
#     data.index = data.index.tz_localize(None)
#     print(f'Dataset {dataset_id} length:', len(data))
#     print("-"*20)
#     data.to_csv(f'../data/{dataset_id}.csv')
#     time.sleep(60)

#### **3.4. Time + External Features**

In [None]:
# Ids of the previously downloaded datasets to use as external features
features_to_add = ['246', '247', '165', '242']

# Number of external features; used below to select the trailing columns for plotting
len_ext_features = len(features_to_add)

# Extend the time-feature frame with the external features
df_ext_features = add_external_features(df_time, features_to_add)

In [None]:
# Replace the verbose dataset titles with short snake_case column names (in place)
df_ext_features.rename(
    columns={
        'Electricity production prediction - premilinary': 'electricity_production_forecast',
        'Electricity consumption forecast - next 24 hours': 'electricity_consumption_forecast',
        'Solar power generation forecast - updated once a day': 'solar_power_generation_forecast',
        'Wind power generation forecast - updated once a day': 'wind_power_generation_forecast',
    },
    inplace=True,
)

In [None]:
# Plot the external features (the last len_ext_features columns) before gap filling
fig = go.Figure()
for col in df_ext_features.columns[-len_ext_features:]:
    fig.add_scatter(
        x=df_ext_features.index,
        y=df_ext_features[col],
        mode='lines',
        name=col,
    )
# Titles plus a range slider for zooming along the time axis
fig.update_layout(
    xaxis_rangeslider_visible=True,
    title='External Features Over Time',
    xaxis_title='Time',
    yaxis_title='Feature Value',
)
fig.show()

In [None]:
# Per-column list of fill methods handed to fill_missing_values
# (presumably applied in the listed order — TODO confirm in the helper)
missing_mapping = dict(
    electricity_consumption_forecast=['ffill', 'bfill'],
    electricity_production_forecast=['ffill', 'bfill'],
    wind_power_generation_forecast=['ffill', 'bfill'],
    solar_power_generation_forecast=['interpolate', 'bfill', 'ffill'],
)

# Fill the gaps of the external features
df_ext_features = fill_missing_values(df_ext_features, missing_mapping)

In [None]:
# Plot the external features again, now after the missing values were filled
fig = go.Figure()
for col in df_ext_features.columns[-len_ext_features:]:
    fig.add_scatter(
        x=df_ext_features.index,
        y=df_ext_features[col],
        mode='lines',
        name=col,
    )
# Titles plus a range slider for zooming along the time axis
fig.update_layout(
    xaxis_rangeslider_visible=True,
    title='External Features Over Time',
    xaxis_title='Time',
    yaxis_title='Feature Value',
)
fig.show()

In [None]:
# Print the number of remaining missing values per column (expected to be 0 after filling)
print(df_ext_features.isnull().sum())

In [None]:
# Linear regression on time + external features; no intercept is fitted
lr_ext = LinearRegression(fit_intercept=False)

# Year-on-year training returning predictions, metrics and fitted coefficients
(
    lr_ext_predictions,
    lr_ext_metrics,
    lr_ext_features_coeffs,
) = year_on_year_training(df_ext_features, lr_ext)

# Register the results under the model's display name
metrices['LR (Time + External Features)'] = lr_ext_metrics
predictions['LR (Time + External Features)'] = [{'predictions': lr_ext_predictions}]

In [None]:
# Print the evaluation metrics of the time + external-features regression
print(lr_ext_metrics)

In [None]:
# NOTE(review): `model` is not referenced by any other visible cell; it looks
# like a leftover and can probably be removed — confirm before deleting.
model = LinearRegression(fit_intercept=False)
# Plot the coefficients selected by the "weekday" keyword for the time + external-features model
plot_year_over_year_coefficients(lr_ext_features_coeffs, keyword="weekday")

In [None]:
# Plot the coefficients selected by the "weekend" keyword for the time + external-features model
plot_year_over_year_coefficients(lr_ext_features_coeffs, keyword="weekend")

In [None]:
# Plot the electricity production forecast coefficients of the time + external-features model
plot_year_over_year_coefficients(lr_ext_features_coeffs, keyword="electricity_production_forecast")

In [None]:
# Plot the electricity consumption forecast coefficients of the time + external-features model
plot_year_over_year_coefficients(lr_ext_features_coeffs, keyword="electricity_consumption_forecast")

In [None]:
# Plot the wind power generation forecast coefficients of the time + external-features model
plot_year_over_year_coefficients(lr_ext_features_coeffs, keyword="wind_power_generation_forecast")

In [None]:
# Plot the solar power generation forecast coefficients of the time + external-features model
plot_year_over_year_coefficients(lr_ext_features_coeffs, keyword="solar_power_generation_forecast")

#### **3.5. With LEAR Features**

[LEAR Paper](https://www.sciencedirect.com/science/article/pii/S0306261921004529?via%3Dihub)

##### **3.5.1. Time + Price Lags**

In [None]:
# Lags applied to the historical price column (presumably in days — TODO confirm)
price_lags = [1, 2, 3, 7]

# Add lag features of the price column 'y' on top of the time features;
# the trailing copy avoids working on a fragmented frame
df_lear_price = create_daily_lag_features(
    df_time,
    'y',
    price_lags,
    average=True,
).copy()


In [None]:
# Display the frame with time features and price lags
df_lear_price

In [None]:
# Report how many columns the lagged frame contains
print(f'Total number of features: {df_lear_price.shape[1]}')

In [None]:
# List all column names of the lagged frame
print(df_lear_price.columns.to_list())

In [None]:
# Drop, in place, every row that contains a missing value
# (presumably the leading rows whose lags reach before the start of the data)
df_lear_price.dropna(inplace=True)

In [None]:
# Linear regression on time features + price lags; no intercept is fitted
lear_price = LinearRegression(fit_intercept=False)

# Year-on-year training returning predictions, metrics and fitted coefficients
(
    lear_price_predictions,
    lear_price_metrics,
    lear_price_coeffs,
) = year_on_year_training(df_lear_price, lear_price)

# Register the results under the model's display name
metrices['LR (Time + Price Lags)'] = lear_price_metrics
predictions['LR (Time + Price Lags)'] = [{'predictions': lear_price_predictions}]

In [None]:
# Print the evaluation metrics of the time + price-lags regression
print(lear_price_metrics)

In [None]:
# NOTE(review): this assignment is overwritten by the next cell before it is
# used, and this model was trained without the external features it lists —
# likely a leftover.
col_names = ["y_lag", "electricity_production", "electricity_consumption", "wind_power", "solar_power"]

In [None]:
# Keywords selecting the coefficient groups to plot for the time + price-lags model
col_names = ["weekday", "weekend", "y_lag"]

# One coefficient plot per keyword
for col in col_names:
    plot_year_over_year_coefficients(lear_price_coeffs, keyword=col)

##### **3.5.2. Time + Price Lags + External Features**

In [None]:
# Ids of the previously downloaded datasets to use as external features
features_to_add = ['246', '247', '165', '242']

# Extend the time + price-lags frame with the external features
df_lear_price_ext = add_external_features(df_lear_price, features_to_add)

In [None]:
# Replace the verbose dataset titles with short snake_case column names (in place)
df_lear_price_ext.rename(
    columns={
        'Electricity production prediction - premilinary': 'electricity_production_forecast',
        'Electricity consumption forecast - next 24 hours': 'electricity_consumption_forecast',
        'Solar power generation forecast - updated once a day': 'solar_power_generation_forecast',
        'Wind power generation forecast - updated once a day': 'wind_power_generation_forecast',
    },
    inplace=True,
)

In [None]:
# Per-column list of fill methods handed to fill_missing_values
# (presumably applied in the listed order — TODO confirm in the helper)
missing_mapping = dict(
    electricity_consumption_forecast=['ffill', 'bfill'],
    electricity_production_forecast=['ffill', 'bfill'],
    wind_power_generation_forecast=['ffill', 'bfill'],
    solar_power_generation_forecast=['interpolate', 'bfill', 'ffill'],
)

# Fill the gaps of the external features
df_lear_price_ext = fill_missing_values(df_lear_price_ext, missing_mapping)

In [None]:
# Linear regression on time + price lags + external features; no intercept is fitted
lear_price_ext = LinearRegression(fit_intercept=False)

# Year-on-year training returning predictions, metrics and fitted coefficients
(
    lear_price_ext_predictions,
    lear_price_ext_metrics,
    lear_price_ext_features_coeffs,
) = year_on_year_training(df_lear_price_ext, lear_price_ext)

# Register the results under the model's display name
metrices['LR (Time + Price Lags + External Features)'] = lear_price_ext_metrics
predictions['LR (Time + Price Lags + External Features)'] = [{'predictions': lear_price_ext_predictions}]

In [None]:
# Print the evaluation metrics of the time + price-lags + external-features regression
print(lear_price_ext_metrics)

In [None]:
# Keywords selecting the coefficient groups to plot
cols = [
    "weekend",
    "weekday",
    'y_lag',
    'electricity_production_forecast',
    'electricity_consumption_forecast',
    'wind_power_generation_forecast',
    'solar_power_generation_forecast',
]

# One coefficient plot per keyword
for col in cols:
    plot_year_over_year_coefficients(
        lear_price_ext_features_coeffs,
        keyword=col,
    )

##### **3.5.3. Time + Price Lags + External Features + External Lags**

In [None]:
# Lags applied to every external forecast column
forecast_lags = [1, 7]

# Add lag features for the production, consumption, wind and solar forecasts,
# one column at a time and in this order, each step building on the previous result
df_lear_price_ext_extlags = df_lear_price_ext
for forecast_col in (
    'electricity_production_forecast',
    'electricity_consumption_forecast',
    'wind_power_generation_forecast',
    'solar_power_generation_forecast',
):
    df_lear_price_ext_extlags = create_daily_lag_features(
        df_lear_price_ext_extlags,
        forecast_col,
        forecast_lags,
        average=True,
    )

In [None]:
# Print all column names of the frame that now also holds the external-feature lags
print(df_lear_price_ext_extlags.columns.to_list())

In [None]:
# Drop, in place, every row that contains a missing value
# (presumably the leading rows whose lags reach before the start of the data)
df_lear_price_ext_extlags.dropna(inplace=True)

In [None]:
# Linear regression on the full feature set (time + price lags + external
# features + external lags); no intercept is fitted
lear_price_ext_extlags = LinearRegression(fit_intercept=False)

# Year-on-year training returning predictions, metrics and fitted coefficients
(
    lear_price_ext_extlags_predictions,
    lear_price_ext_extlags_metrics,
    lear_price_ext_extlags_features_coeffs,
) = year_on_year_training(df_lear_price_ext_extlags, lear_price_ext_extlags)

# Register the results under the model's display name
metrices['LR (Time + Price Lags + External Features + External Lags)'] = lear_price_ext_extlags_metrics
predictions['LR (Time + Price Lags + External Features + External Lags)'] = [{'predictions': lear_price_ext_extlags_predictions}]

In [None]:
# Print the evaluation metrics of the full model (with external lags)
print(lear_price_ext_extlags_metrics)

In [None]:
# Plot the coefficients of the full model (time + price lags + external
# features + external lags), one plot per keyword.
# The keyword list was previously defined but never used; the loop below adds
# the plotting step, mirroring the cell of the previous model.
cols = ["weekend", "weekday", 'y_lag', 'electricity_production_forecast', 'electricity_consumption_forecast', 'wind_power_generation_forecast', 'solar_power_generation_forecast']

for col in cols:
    plot_year_over_year_coefficients(lear_price_ext_extlags_features_coeffs, keyword=col)

# **4. Visualization**

#### **4.1. Visualization of Predictions**

In [None]:
# Plot the collected predictions of all models together with the actual series df['y']
plot_predictions(predictions, df['y'])

In [None]:
# Plot the MAE of the collected predictions against the actual series df['y']
# (presumably one curve/bar per model — TODO confirm in plot_mae)
plot_mae(predictions, df['y'])

In [None]:
# Plot the evaluation metrics collected for all models
plot_metrics(metrices)

In [None]:
# Print the mean squared error of each model, sorted from lowest to highest MSE
for key, value in sorted(metrices.items(), key=lambda x: x[1]['mean_squared_error']):
    print(f'{key}: {value["mean_squared_error"]}')

In [None]:
# Build the interactive app from the preprocessed data and all model predictions
app = visualize_predictions(processed_data, predictions)

# Start the app and show its URL (jupyter_mode="external").
# NOTE(review): assumes `app` is a dash.Dash instance (the `jupyter_mode`
# argument exists since Dash 2.11). `run_server` is the legacy alias of `run`
# and was removed in Dash 3, so `run` is called directly.
app.run(debug=True, use_reloader=False, jupyter_mode="external")

#### **4.2. Visualization of Prediction Frequencies**

In [None]:
# Number of extreme hours to compare
top_k = 3

# Extremes of the actual prices, with every observation from 2016 excluded
actual_extremes = calculate_price_extremes(
    pd.DataFrame(df['y'][df.index.year != 2016], columns=['y']),
    price_column='y',
    top_k=top_k,
)

# For every model: extremes of its predictions, their accuracy against the
# actual extremes, and a histogram of that accuracy
for key, value in predictions.items():
    predicted_extremes = calculate_price_extremes(
        pd.DataFrame(value[0]['predictions'], columns=['y']),
        price_column='y',
        top_k=top_k,
    )
    accuracy_dict = calculate_prediction_accuracy(
        actual_extremes,
        predicted_extremes,
        order=True,
        top_k=top_k,
        year_on_year=True,
    )
    plot_prediction_accuracy_histogram(
        accuracy_dict,
        title=f"Accuracy of Top-k Hour Predictions [{key}]",
        year_on_year=True,
    )

#### **4.3. Visualize Baseline and Fluctuations**

In [None]:
# Load the original (unprocessed) spot prices, indexed by the parsed 'date' column
file_path = '../data/spot_prices_fi_2016_2023.csv'
original_data = pd.read_csv(file_path, parse_dates=['date'], index_col='date')
# Interpret the index as UTC and then drop the timezone, leaving naive UTC timestamps
# (presumably to line up with the index of the processed data — TODO confirm)
original_data.index = pd.to_datetime(original_data.index, utc=True).tz_convert(None) 

In [None]:
# Display the reloaded original data
original_data

In [None]:
# Split the full model's linear prediction into two additive parts:
#   * 'baseline'          - contribution of the features whose name contains 'lag'
#   * 'daily_fluctuation' - contribution of all remaining (non-lag) features
# The coefficients stored under key `year` are applied to the rows of `year + 1`.
#
# The yearly parts are collected in a list and concatenated once, instead of
# growing a frame with pd.concat inside the loop: this avoids concatenating onto
# an empty object-dtype frame (deprecated in recent pandas and liable to degrade
# the column/index dtypes) and the repeated copying.
yearly_parts = []
for year, coeff in lear_price_ext_extlags_features_coeffs.items():
    # coefficients with the keyword 'lag' in their name
    lagged_coeff = coeff.filter(like='lag')

    # coefficients without the keyword 'lag'
    non_lagged_coeff = coeff[~coeff.index.str.contains('lag')]

    # feature rows of the following year, selected once for both parts
    next_year_features = df_lear_price_ext_extlags[df_lear_price_ext_extlags.index.year == year + 1]
    if next_year_features.empty:
        # nothing to decompose for this year; skipping keeps empty frames out of the concat
        continue

    # weight each feature by its coefficient and sum per row, separately for both groups
    daily_fluctuation = next_year_features[non_lagged_coeff.index].mul(non_lagged_coeff.values).sum(axis=1)
    baseline = next_year_features[lagged_coeff.index].mul(lagged_coeff.values).sum(axis=1)

    yearly_parts.append(pd.DataFrame({'daily_fluctuation': daily_fluctuation, 'baseline': baseline}))

# Fall back to an empty frame with the expected columns when there is nothing to concatenate
if yearly_parts:
    df1 = pd.concat(yearly_parts)
else:
    df1 = pd.DataFrame(columns=['daily_fluctuation', 'baseline'])

In [None]:
# Overlay the actual prices, both decomposition parts and their sum (the prediction)
fig = go.Figure()
fig.add_scatter(
    x=original_data.index,
    y=original_data['elspot-fi'],
    mode='lines',
    name='Actual',
    line={'color': 'green', 'width': 2},
)
fig.add_scatter(
    x=df1.index,
    y=df1['daily_fluctuation'],
    mode='lines',
    name='Daily Fluctuation',
    line={'color': 'red', 'width': 2},
)
fig.add_scatter(
    x=df1.index,
    y=df1['baseline'],
    mode='lines',
    name='Baseline',
    line={'color': 'blue', 'width': 2},
)
fig.add_scatter(
    x=df1.index,
    y=df1['baseline'] + df1['daily_fluctuation'],
    mode='lines',
    name='Prediction',
    line={'color': 'black', 'width': 2},
)
# Optional: add the predictions from the exponential average
# fig.add_trace(go.Scatter(x=ea_predictions.index, y=ea_predictions.values, mode='lines', name='Exponential Average', line=dict(color='brown', width=2)))
fig.update_layout(
    title='Daily Fluctuation, Baseline, Actual Data',
    xaxis_title='Time',
    yaxis_title='Value',
)
# Optional: add a range slider
# fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

In [None]:
# For every year, report the summed price-lag coefficients and the summed
# hourly coefficients, each rounded to 5 decimals
for year, coeffs in lear_price_ext_extlags_features_coeffs.items():
    lag_coeff_sum = coeffs.filter(like='y_lag_').sum().round(5)
    hourly_coeff_sum = coeffs.filter(like='hour').sum().round(5)

    print(f'Year {year}:')
    print('Lag coeffs sum:', lag_coeff_sum)
    print('Hourly coeffs sum:', hourly_coeff_sum)
    print('-' * 20)


#### **4.4. Export Predictions**

In [None]:
# Left-join daily_fluctuation and baseline onto the original data by index;
# timestamps of original_data without a matching row in df1 get NaN in the new columns
original_data = original_data.merge(df1, left_index=True, right_index=True, how='left')


In [None]:
# Write original_data (including any columns merged into it) to a CSV file in the data directory
original_data.to_csv('../data/predictions_2016_2023(time+external+price_lags+external_lags).csv')