# Analyzing perturbed inputs

In this notebook, we evaluate how modifications of the input data affect the accuracy of the forecasting model.
To this end, we use the `train_pipeline_common` function from OpenSTEF, which allows detailed modifications of the input data.

As an example, this notebook investigates how maintenance affects forecasting the load on a single transformer.
Maintenance is simulated by multiplying the load by a factor of 0 (the transformer is turned off) or by a factor of 2 (another transformer at the station is turned off, increasing the load on this transformer).
Here, we consider two cases:
- Maintenance in the train set
- Maintenance in the `T-..` features (i.e. after training, but just before the days that are forecast).


This notebook can be adapted to perform other analyses that require modifications of the input data.

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm.notebook import tqdm
import plotly.express as px

# Set plotly as the default pandas plotting backend, so that `DataFrame.plot()`
# returns a plotly figure
pd.options.plotting.backend = 'plotly'
import plotly.io as pio
# Default renderer for `fig.show()`; the '+' combines two renderers.
# NOTE(review): presumably chosen so figures render in several notebook front ends — confirm.
pio.renderers.default = "plotly_mimetype+notebook"

# Import required stuff from OpenSTEF
from openstef.data_classes.prediction_job import PredictionJobDataClass
from openstef.data_classes.model_specifications import ModelSpecificationDataClass

from openstef.metrics.figure import plot_feature_importance
from openstef.pipeline.train_model import train_pipeline_common

## Load data from csv and inspect

In [None]:
# Read the model input; the first CSV column becomes the index and is parsed as dates
load_feat_df = pd.read_csv('data/get_model_input_pid_435.csv', parse_dates=True, index_col=0)
# Drop the trailing 196 rows (roughly the last two days, assuming 15-minute data),
# since the load is NaN there
load_feat_df = load_feat_df.iloc[:-196, :]
# Only the last expression of a cell is rendered automatically, so the plotly
# figure must be shown explicitly
load_feat_df[['load']].plot().show()
load_feat_df.head()


In [None]:
# Settings shared by all experiments in this notebook
test_days = 2

# A 'prediction_job' bundles the properties of training/prediction
pj = PredictionJobDataClass(
    id=1,
    name="Example",
    model="xgb",
    quantiles=[50],
    horizon_minutes=24 * 60,
    resolution_minutes=15,
    # Also evaluate the model on the train set
    save_train_forecasts=True,
    # Required arguments which are not meaningful in this analysis
    forecast_type="demand",
    lat=52.0,
    lon=5.0,
)

## Example on using train pipeline to train a model and generate a forecast

In [None]:
# Hold out the final `test_days` days as the test set.
# Note that `test_indices` holds the rows themselves, not just their index.
test_start = load_feat_df.index.max() - timedelta(days=test_days)
test_indices = load_feat_df.loc[load_feat_df.index > test_start]

model_specs = ModelSpecificationDataClass(id=pj['id'])
forecast_horizons = [pj.horizon_minutes / 60.]  # minutes -> hours

# Train on the unmodified data and forecast the predefined test set
model, report, train_data, validation_data, test_data = train_pipeline_common(
    pj,
    model_specs,
    load_feat_df,
    forecast_horizons,
    test_data_predefined=test_indices,
)

In [None]:
# Load versus forecast on the test set. Only the last expression of a cell is
# rendered automatically, so this figure has to be shown explicitly.
test_data[['load', 'forecast']].plot().show()

# Load versus forecast on the train set, resampled to a regular 15-minute grid.
# Gaps are left open instead of being connected by a line; they are due to that
# data being in the validation set used for early stopping.
fig = train_data[['load', 'forecast']].resample('15min').mean().plot()
fig.update_traces(connectgaps=False)
fig.show()


In [None]:
# Plot the feature-importance table stored on the trained model.
# Note: the gaps in the train-set plot of the previous cell are due to that data
# being in the validation set used for early stopping.
feature_importance = model.feature_importance_dataframe
plot_feature_importance(feature_importance)

In [None]:
def add_kpi(df: pd.DataFrame, kpi_df: pd.DataFrame, identifier: dict) -> pd.DataFrame:
    """Compute error KPIs for one forecast and append them as a new row to ``kpi_df``.

    Args:
        df: Frame with a 'load' and a 'forecast' column covering the period to evaluate.
        kpi_df: KPI rows collected so far (may be empty). It is not modified in place.
        identifier: Column/value pairs (numbers or strings) labelling this run;
            they are stored next to the KPIs in the new row.

    Returns:
        A new frame consisting of ``kpi_df`` plus one row holding
        - 'mae': mean absolute error divided by 1_000_000
          (presumably a W -> MW conversion; confirm the unit of the data),
        - 'rmae': mean absolute error divided by the range (max - min) of the load,
        - the entries of ``identifier``.
    """
    # Evaluate the frame that was passed in rather than a notebook-level global,
    # so callers can restrict the KPIs to a sub-period.
    abs_err = (df['forecast'] - df['load']).abs()
    mean_abs_err = abs_err.mean()
    load_range = df['load'].max() - df['load'].min()

    kpi = dict(
        mae=mean_abs_err / 1_000_000,
        rmae=mean_abs_err / load_range,
    )
    kpi.update(identifier)

    # Index the new row by the current number of rows so the index keeps counting up
    new_row = pd.DataFrame(kpi, index=[len(kpi_df)])
    return pd.concat([kpi_df, new_row])

In [None]:
kpi_df = pd.DataFrame()
n_simulations = 60  # increase this to increase the number of simulations
# Now, let's simulate n days of maintenance; half off, half high
n_days_modified_options = [0, 2, 4, 6, 12, 18]
for i in tqdm(range(n_simulations)):
    # Sample dates to simulate maintenance
    n_days_modified = np.random.choice(n_days_modified_options)
    # Exclude the test set and the final 2 days before it, so the modification
    # doesn't affect the most important features of the test set
    cutoff_date = (test_data.index.min() - timedelta(days=2)).date()
    # Deduplicate: the index has many rows per day, and sampling from the raw
    # per-row dates could select the same day more than once
    all_days = sorted({day for day in load_feat_df.index.date if day < cutoff_date})
    mod_days = np.random.choice(all_days, n_days_modified, replace=False)
    off_days = mod_days[::2]    # transformer switched off: load becomes 0
    high_days = mod_days[1::2]  # neighbouring transformer off: load doubles

    # Modify load
    mod_df = load_feat_df.copy(deep=True)
    index_dates = mod_df.index.date
    mod_df.loc[np.isin(index_dates, off_days), 'load'] = 0
    mod_df.loc[np.isin(index_dates, high_days), 'load'] *= 2

    # Train and evaluate model
    model, report, train_data, validation_data, test_data = train_pipeline_common(
        pj,
        ModelSpecificationDataClass(id=pj['id']),
        mod_df,
        [pj.horizon_minutes / 60.],
        test_data_predefined=test_indices,
    )

    # Calculate metric
    kpi_df = add_kpi(
        test_data[['load', 'forecast']],
        kpi_df=kpi_df,
        identifier=dict(n_days_modified=n_days_modified),
    )

    # Plot the first 10 testsets
    if i < 10:
        print(mod_days)
        all_sets = pd.concat(
            [frame[['load', 'forecast']] for frame in (train_data, validation_data, test_data)],
            axis=0,
        )
        fig = all_sets.sort_index().resample('15min').mean().plot()
        fig.add_annotation(dict(x=train_data.index.max(), showarrow=False, text='End of train'))
        fig.update_traces(connectgaps=False)
        # Add highlighted areas of interest: one full-height band per modified day
        for date in mod_days:
            fig.add_shape(
                type="rect",
                x0=date,
                x1=date + timedelta(days=1),
                y0=0,
                y1=1,
                yref='paper',
                fillcolor="green",
                opacity=0.2,
            )
        fig.show()
            

In [None]:
# Plot KPIs
# Box plot of the relative MAE for each number of modified days
layout_options = {
    "yaxis": {"rangemode": 'tozero'},  # let the y-axis start at zero
    "width": 300,
    "height": 300,
    "margin": {"t": 0, "b": 0, "l": 0, "r": 0},
}
fig = px.box(kpi_df, x="n_days_modified", y="rmae")
fig.update_layout(**layout_options)
fig.show()

## Repeat, but now with the maintenance after training, but before the days under examination
We set aside the last 4 days of the data, apply maintenance to the first two of those days, and evaluate the performance on the last two.

In [None]:
# The last 4 days are kept out of training
non_train_start = load_feat_df.index.max() - timedelta(days=4)
non_train_indices = load_feat_df[load_feat_df.index > non_train_start]

# First two of those days: maintenance period; the remainder: evaluation period.
# The boundary timestamp belongs to the maintenance period only, so the two
# periods do not overlap.
maintenance_end = non_train_indices.index.min() + timedelta(days=2)
maintenance_indices = non_train_indices[non_train_indices.index <= maintenance_end]
test_indices = non_train_indices[non_train_indices.index > maintenance_end]

In [None]:
kpi_t2d_df = pd.DataFrame()
n_days_simulation = 40  # Increase to make this more extensive

# Now, let's simulate n days of maintenance; after training and before forecast.
# 0<=n<=2, and maintenance causes load of either 0 or double
options = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (2, 0)]  # n_days_high, n_days_low
# Calendar days touched by the maintenance period.
# NOTE(review): whole calendar days are modified below; if the period does not start
# at midnight this can reach rows outside `maintenance_indices` — confirm alignment.
maintenance_days = sorted(set(maintenance_indices.index.date))

for i in tqdm(range(n_days_simulation)):
    # Sample which days to apply maintenance; a day is never both high and low
    modification = options[np.random.choice(len(options))]
    high_days = np.random.choice(maintenance_days, modification[0], replace=False)
    remaining_days = [day for day in maintenance_days if day not in high_days]
    low_days = np.random.choice(remaining_days, modification[1], replace=False)

    # Modify load
    mod_df = load_feat_df.copy(deep=True)
    index_dates = mod_df.index.date
    mod_df.loc[np.isin(index_dates, low_days), 'load'] = 0
    mod_df.loc[np.isin(index_dates, high_days), 'load'] *= 2

    # Train and evaluate model
    model, report, train_data, validation_data, test_data = train_pipeline_common(
        pj,
        ModelSpecificationDataClass(id=pj['id']),
        mod_df,
        [pj.horizon_minutes / 60.],
        test_data_predefined=non_train_indices,  # last 4 days
    )

    # Calculate error only over test_indices: last two days
    eval_period = test_data.loc[
        test_indices.index.min():test_indices.index.max(), ['load', 'forecast']
    ]
    kpi_t2d_df = add_kpi(
        eval_period,
        kpi_df=kpi_t2d_df,
        identifier=dict(mod=str(modification)),
    )

    # Plot the first 10 testsets
    if i < 10:
        print(modification)
        fig = test_data[['load', 'forecast']].resample('15min').mean().plot()
        fig.update_traces(connectgaps=False)
        # Figures created inside a loop are never rendered automatically
        fig.show()
    

In [None]:
# Box plot of the relative MAE for each (n_days_high, n_days_low) modification
layout_options = {
    "yaxis": {"rangemode": 'tozero'},  # let the y-axis start at zero
    "width": 300,
    "height": 300,
    "margin": {"t": 0, "b": 0, "l": 0, "r": 0},
}
fig = px.box(kpi_t2d_df, x="mod", y="rmae")
fig.update_layout(**layout_options)
fig.show()