# Comparison on difficult cases

With this notebook, you can compare how changes to OpenSTEF improve its performance on concrete cases where OpenSTEF previously did not perform well.

At Alliander, the operations team identified a number of cases where improvements of the forecast accuracy would be very valuable.
These cases are stored as 'fixed' train/test data experiments that can be used to systematically measure the effect of improvement ideas for OpenSTEF.

This notebook uses the `train_model_pipeline` and `create_forecast_pipeline` functions from OpenSTEF, which take the input data as an argument and therefore allow detailed modifications of the input data.

The notebook is structured as follows:
- Generate benchmark (needed once)
  - For each experiment:
    - read data
    - train model
    - generate forecast
  - Calculate performance metrics
  - (Optionally: repeat N times to make outcome more robust)
- Test improvement idea (import local OpenSTEF)
  - For each experiment:
    - read data
    - train model
    - generate forecast
  - Calculate performance metrics
  - (Optionally: repeat N times to make outcome more robust)
- Compare benchmark and improvement idea

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm.notebook import tqdm
import plotly.express as px
from typing import Dict, List

# Set plotly as the default pandas plotting backend
pd.options.plotting.backend = "plotly"
import plotly.io as pio

pio.renderers.default = "plotly_mimetype+notebook"

# Import required stuff from OpenSTEF
from openstef.data_classes.prediction_job import PredictionJobDataClass
from openstef.data_classes.model_specifications import ModelSpecificationDataClass

from openstef.metrics.figure import plot_feature_importance
from openstef.pipeline.train_model import train_model_pipeline
from openstef.pipeline.create_forecast import create_forecast_pipeline

## Generate Benchmark
Generate a benchmark using the latest OpenSTEF. Only needed once.

In [None]:
# Load experiment


def get_experiments(
    difficult_location_number: int = 1,
    n_experiments: int = 10,
    train_days: int = 120,
    test_days: int = 1,
    samples_per_day: int = 24 * 4,
) -> List[Dict[str, object]]:
    """Build the rolling train/test experiments for one difficult location.

    The input file ``data/model_input_difficult_location_<number>.csv`` is read
    with its first column as (date-parsed) index. Experiment ``k`` (1-based)
    starts ``k - 1`` days into the data and contains ``train_days`` of training
    data followed by ``test_days`` of test data.

    Note: the windows are taken with positional slicing, so if the file holds
    fewer rows than a window needs, that dataset is silently shorter.

    Args:
        difficult_location_number: Number used in the input file name and in
            the experiment names.
        n_experiments: Number of consecutive one-day-shifted experiments.
        train_days: Days of training data at the start of each dataset.
        test_days: Days of test data at the end of each dataset.
        samples_per_day: Rows per day (96 for 15-minute data).

    Returns:
        A list of dicts, one per experiment, with the keys:
        - ``name``: ``"N<k>_loc<difficult_location_number>"``
        - ``dataset``: training rows followed by test rows
        - ``test_length``: number of rows, counted from the end of ``dataset``,
          that form the test period
    """
    complete_data = pd.read_csv(
        f"data/model_input_difficult_location_{difficult_location_number}.csv",
        parse_dates=True,
        index_col=0,
    )

    test_length = test_days * samples_per_day
    window_length = train_days * samples_per_day + test_length

    # Each experiment shifts the window one day further into the data
    return [
        dict(
            name=f"N{day + 1}_loc{difficult_location_number}",
            dataset=complete_data.iloc[
                day * samples_per_day : day * samples_per_day + window_length
            ],
            test_length=test_length,
        )
        for day in range(n_experiments)
    ]

In [None]:
# Select which difficult location to benchmark; the number is used by
# get_experiments to pick the input file and to name the experiments.
difficult_location_number = 1
# List of experiment dicts (name, dataset, test_length) for that location
experiments = get_experiments(difficult_location_number)

In [None]:
# Prediction job fields that have to be supplied here, but which the original
# author marked as not relevant for this benchmark.
pj_specs_that_should_be_optional_in_future_openstef_versions = {
    "forecast_type": "demand",
    "lat": 52.0,
    "lon": 5.0,
    "horizon_minutes": 47 * 60,
    "description": "description",
    "resolution_minutes": 15,
    "hyper_params": {},
    "feature_names": None,
}

# Prediction job shared by all experiments: the relevant specs first,
# followed by the filler fields defined above.
pj = PredictionJobDataClass(
    **{
        "id": 1,  # Should be updated for each experiment
        "model": "xgb",
        "quantiles": [0.05, 0.1, 0.3, 0.7, 0.9, 0.95],
        "name": "benchmark",
        **pj_specs_that_should_be_optional_in_future_openstef_versions,
    }
)

In [None]:
# Run every experiment: train a model on the training part of the dataset,
# forecast the test part, and collect forecast + realised load in `forecasts`.

# Rows per day; the datasets are sliced in steps of 24 * 4 rows per day
SAMPLES_PER_DAY = 24 * 4
# For forecasting we use the last 14 days of data as historical data (used for
# the lagged features), plus the day to forecast
FORECAST_INPUT_ROWS = 15 * SAMPLES_PER_DAY
# Training stores the model here and forecasting loads it from here
MLFLOW_TRACKING_URI = "./mlflow_trained_models"
MLFLOW_ARTIFACT_FOLDER = "./mlflow_artifacts"

# Collect the per-experiment results and concatenate once after the loop,
# instead of re-copying a growing DataFrame on every iteration
experiment_forecasts = []

# Wrap the list itself (not the enumerate object) so tqdm knows the total
for i, experiment in enumerate(tqdm(experiments)):
    print(f"Starting experiment: {experiment['name']}")
    dataset = experiment["dataset"]
    test_length = experiment["test_length"]

    #############
    # Data prep
    # Update pj
    pj["id"] = i
    pj["description"] = experiment["name"]

    # Split train and test
    train = dataset.iloc[:-test_length]
    realised = dataset["load"].iloc[-test_length:]

    # Forecast input: recent history plus the period to forecast
    test = dataset.iloc[-FORECAST_INPUT_ROWS:].copy(deep=True)
    # Hide the realised load of the test period from the forecast pipeline.
    # The column is located by name, so its position in the file is irrelevant.
    test.iloc[-test_length:, test.columns.get_loc("load")] = np.nan

    ##############
    # train model
    models = train_model_pipeline(
        pj,
        train,
        check_old_model_age=False,
        mlflow_tracking_uri=MLFLOW_TRACKING_URI,
        artifact_folder=MLFLOW_ARTIFACT_FOLDER,
    )

    #################
    # Generate forecast
    forecast = create_forecast_pipeline(
        pj,
        test,
        mlflow_tracking_uri=MLFLOW_TRACKING_URI,
    )
    # Add realised to forecast (aligned on the index)
    forecast["load"] = realised
    # Only keep region that was actually forecasted
    forecast = forecast.iloc[-test_length:]

    #######
    # Store forecast
    experiment_forecasts.append(forecast)

# Concatenate results; fall back to an empty frame when there are no experiments
forecasts = (
    pd.concat(experiment_forecasts, axis=0)
    if experiment_forecasts
    else pd.DataFrame()
)


In [None]:
# Get all column names that start with "quantile".
# Kept as a plain list so it can be concatenated with other column names.
quantile_columns = [col for col in forecasts.columns if col.startswith("quantile")]

In [None]:
# Preview the quantile forecasts of all experiments
forecasts[quantile_columns]

In [None]:
##########
# Plot results: realised load, point forecast and the quantile forecasts
fig = forecasts[["load", "forecast"] + quantile_columns].plot()

# Realised load in red, point forecast in blue
fig.update_traces(
    line=dict(color="red", width=2),
    selector=lambda trace: trace.name == "load",
)
fig.update_traces(
    line=dict(color="blue", width=2),
    selector=lambda trace: trace.name == "forecast",
)

# Set all quantile traces to be green with half transparency
fig.update_traces(
    line=dict(color="green", width=1),
    opacity=0.5,
    selector=lambda trace: "quantile" in trace.name,
)

# Show a green area between the quantile traces. "tonexty" fills towards the
# trace drawn just before, so the first quantile column is skipped: it would
# otherwise be filled towards the 'forecast' trace instead of another quantile.
filled_quantiles = set(quantile_columns[1:])
fig.update_traces(
    fill="tonexty",
    fillcolor="rgba(0,255,0,0.2)",
    selector=lambda trace: trace.name in filled_quantiles,
)

fig.show()



In [None]:
############
# Calculate metrics
    

In [None]:
# Show the combined forecasts (including the realised 'load') of all experiments
display(forecasts)