In [2]:
import pandas as pd
import os
import pickle
os.chdir('..')
import matplotlib.pyplot as plt
import plotly.express as px
from utils import *

# ml imports
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from xgboost import XGBRegressor

# disable warnings
import warnings
warnings.filterwarnings('ignore')

# logging
import wandb
from wandb.xgboost import WandbCallback
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnikolaushouben[0m ([33mwattcast[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
df_first = pd.read_csv(r'data\raw_data\UTC_id1_data.csv', index_col=0, parse_dates=True, sep=';').iloc[:,:1]

df_first = df_first.resample('60min').mean()

df_first.columns = ['power']

In [4]:
# Data cleaning
#-----------------------#
df = remove_duplicate_index(df_first)

df = remove_days(df, 0.2)

df[df<=0] = 1e-6 # because of log transform

df_clean = df.copy().fillna(method='ffill')

Removed 0 days with less than 20.0% of average total energy consumption of all days


In [5]:
px.line(df_clean)

## Wandb

### Testing

In [11]:
config_features = {
    'holidays': True,
    'holidays_country': 'AT',
    'peaks': True,
    'lagged_days': 1,
    'boxcox': True,
    'lam': None,
    'cumsum': True,
    'datetime': True,
    }

model_params = { 
                'tree_method': 'gpu_hist', 
                'predictor': 'gpu_predictor',
                'objective': 'reg:pseudohubererror',
                'eval_metric': 'mae',
                'early_stopping_rounds': 20,
                'n_estimators': 1000,
                'max_depth': 12,
                'learning_rate': 0.01,
                'reg_lambda': 0.3,
                'num_parallel_tree': 3,

    }


#plotting parameters
rows_to_plot = 6 # min 1
outputs_per_row = 6

### Lets train with wandb

In [12]:
def lets_train():
    wandb.init()
    wandb.config.update(config_features)
    wandb.config.update(model_params)

    config = wandb.config

    #-----------------------#

    df = df_clean.copy()
    # Feature Engineering
    #-----------------------#
    if config.boxcox:
        df, config.lam = boxcox_transform(df)

    # scaling the data

    scaler = MinMaxScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])


    # long to wide format, because we are doing daily predictions
    df_pivot = timeseries_dataframe_pivot(df).dropna()
    n_timesteps_per_output = df_pivot.shape[1]

    # peaks
    if config.peaks:
        for i in range(1,config.lagged_days+1):
            df_peak_feature = timeseries_peak_feature_extractor(df)
            df_peak_feature_shifted = df_peak_feature.shift(i)
            df_peak_feature_shifted.columns = [f"{col}_lag_{i}" for col in df_peak_feature_shifted.columns]
            df_pivot = pd.concat([df_pivot, df_peak_feature_shifted], axis=1)
            df_pivot.index = pd.to_datetime(df_pivot.index)
    df_pivot = df_pivot.dropna()

    # past information
    if config.cumsum:
        df = calc_rolling_sum_of_load(df, 7)
        df["load_of_last_day"] = df.iloc[:,:n_timesteps_per_output].sum(axis=1).shift(1)

    # holidays
    if config.holidays:
        years = get_year_list(df_pivot)
        df_holidays_dummies = get_holidays(years, config.holidays_country)
        df_holidays = create_holiday_features(df_pivot, df_holidays_dummies)
        df_pivot = pd.concat([df_pivot, df_holidays], axis=1)

    # datetime features
    if config.datetime:
        df_pivot = create_datetime_features(df_pivot)

    df_final = df_pivot.astype('float32').iloc[1:-1,:] # last row is nan

    #-----------------------#

    # Splits 

    #-----------------------#
    test_size = 0.2
    train, test = train_test_split(df_final, test_size=test_size, shuffle=True, random_state=42)
    test_idx = unpivot_timeseries_dataframe(test.iloc[:, :n_timesteps_per_output]).index

    #features and target
    X_train = train.iloc[:,n_timesteps_per_output:]
    y_train = train.iloc[:,:n_timesteps_per_output]

    X_test = test.iloc[:,n_timesteps_per_output:]
    y_test = test.iloc[:,:n_timesteps_per_output]


    scaler_features = MinMaxScaler()
    X_train[X_train.columns] = scaler_features.fit_transform(X_train[X_train.columns])
    X_test[X_test.columns] = scaler_features.transform(X_test[X_test.columns])

    #-----------------------#

    # fit the model and make predictions

    #-----------------------#

    model = XGBRegressor(
        # set the parameters
        tree_method = 'gpu_hist',
        predictor = 'gpu_predictor',
        objective = 'reg:pseudohubererror',
        eval_metric = config.eval_metric,
        early_stopping_rounds=config.early_stopping_rounds,
        callbacks=[WandbCallback()],
        
        # tunable parameters
        learning_rate=config.learning_rate,
        max_depth=config.max_depth,
        n_estimators=config.n_estimators,
        reg_lambda=config.reg_lambda,
        )

    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    predictions = model.predict(X_test)
    predictions_reshaped = post_process_xgb_predictions(predictions, config.boxcox, scaler, config.lam)
    gt = df_first.loc[test_idx].values.flatten()


    #-----------------------#

    # Preparing the data for plotting and metrics

    # -----------------------#

    # per timestep
    df_compare = pd.DataFrame({
                            'gt':gt,
                            'per_timestep': predictions_reshaped
                            }
                            )
    
    
    eval_score = min(model.evals_result()['validation_0'][f'{config.eval_metric}'])
    wandb.log({f"{config.eval_metric}": eval_score})
    
    val_score = calculate_dtw_per_day(df_compare, predictions)
    wandb.log({"dtw": val_score})

    # per day
    predictions_per_day = predictions_reshaped.reshape(predictions.shape).sum(axis=1)
    gt_per_day = gt.reshape(predictions.shape).sum(axis=1)
    df_daily_sum = pd.DataFrame({ 
                                'gt_sums':gt_per_day,
                                'per_day': predictions_per_day
                                }, index=list(set(test_idx.date))
                                ).sort_index()


    # Plotting and logging

    # -----------------------#
    fig, ax = plt.subplots(rows_to_plot,1 , figsize=(40,20))
    for i in range(rows_to_plot):
        start = int(i*outputs_per_row*n_timesteps_per_output)
        end = int(start + outputs_per_row*n_timesteps_per_output)
        ax[i].plot(df_compare['gt'][start:end], label='gt')
        ax[i].plot(df_compare['per_timestep'][start:end], label='predictions')
    wandb.log({"Side-by-Side": wandb.Image(fig)})

    fig_daily = df_daily_sum.plot(title='Daily Sums of Load Profiles', kind='bar', figsize=(20,10))

    wandb.log({"Daily Sums": wandb.Image(fig_daily)})


    fig = px.line(df_compare, title='Load Profiles')

    # -----------------------#
    wandb.finish()





In [13]:
lets_train()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
best_iteration,▁
best_score,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mae,▁
validation_0-mae,█▇▇▆▅▅▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂

0,1
best_iteration,41.0
best_score,0.05527
epoch,61.0
mae,0.05527


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333330477276, max=1.0…

IndexError: tuple index out of range

### Execution

In [14]:
sweep_config = {
    'method': 'bayes', #grid, random
    'metric': {
        'name': 'dtw',
        'goal': 'minimize'
    },
    'parameters': {
        'learning_rate': {
            'values': [0.01,0.1,0.3, 0.4]
},
        'max_depth': {
            'values': [3, 5, 12, 15]
},
        'n_estimators': {
            'values': [500, 1000, 2000]
},
        'reg_lambda': {
            'values': [0.3, 0.5, 0.7]
    },

    
    'boxcox': {
        'values': [True, False]
}
}
}


sweep_id = wandb.sweep(sweep_config, project="XGBoost")
wandb.agent(sweep_id, lets_train)



Create sweep with ID: q2l0fpos
Sweep URL: https://wandb.ai/wattcast/XGBoost/sweeps/q2l0fpos


[34m[1mwandb[0m: Agent Starting Run: 1n2nb6jt with config:
[34m[1mwandb[0m: 	boxcox: False
[34m[1mwandb[0m: 	learning_rate: 0.3
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	n_estimators: 500
[34m[1mwandb[0m: 	reg_lambda: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333435779, max=1.0)…



0,1
best_iteration,▁
best_score,▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
mae,▁
validation_0-mae,█▆▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_iteration,21.0
best_score,0.04249
epoch,41.0
mae,0.04249


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: cov8n3rg with config:
[34m[1mwandb[0m: 	boxcox: False
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 2000
[34m[1mwandb[0m: 	reg_lambda: 0.7
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…



0,1
best_iteration,▁
best_score,▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
mae,▁
validation_0-mae,█▆▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_iteration,61.0
best_score,0.03801
epoch,80.0
mae,0.03801


[34m[1mwandb[0m: Sweep Agent: Waiting for job.


### Grid Search

In [8]:
wandb.init(
    project="WattCast",
    )

wandb.config.update(config_features)
wandb.config.update(model_params)

config = wandb.config

#-----------------------#

df = df_clean.copy()
# Feature Engineering
#-----------------------#
if config.boxcox:
    df, config.lam = boxcox_transform(df)

# scaling the data

scaler = MinMaxScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])


# long to wide format, because we are doing daily predictions
df_pivot = timeseries_dataframe_pivot(df).dropna()
n_timesteps_per_output = df_pivot.shape[1]

    # peaks
if config.peaks:
    df_peak_feature = timeseries_peak_feature_extractor(df)
    for i in range(1, config.lagged_days+1):
        df_peak_feature_shifted = df_peak_feature.shift(i)
        df_peak_feature_shifted.columns = [f"{col}_lag_{i}" for col in df_peak_feature_shifted.columns]
        df_pivot = pd.concat([df_pivot, df_peak_feature_shifted], axis=1)
        df_pivot.index = pd.to_datetime(df_pivot.index)
df_pivot = df_pivot.dropna()

# past information
if config.cumsum:
    df_pivot["load_of_last_day"] = df.iloc[:,:n_timesteps_per_output].sum(axis=1).shift(1)
    df_pivot["load_of_same_day_last_week"] = df.iloc[:,:n_timesteps_per_output].sum(axis=1).shift(7)


# holidays
if config.holidays:
    years = get_year_list(df_pivot)
    df_holidays_dummies = get_holidays(years, config.holidays_country)
    df_holidays = create_holiday_features(df_pivot, df_holidays_dummies)
    df_pivot = pd.concat([df_pivot, df_holidays], axis=1)

# datetime features
if config.datetime:
    df_pivot = create_datetime_features(df_pivot)

df_final = df_pivot.astype('float32').iloc[1:-1,:] # last row is nan

#-----------------------#

# Splits 


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333435779, max=1.0)…

In [9]:

#-----------------------#
test_size = 0.2
train, test = train_test_split(df_final, test_size=test_size, shuffle=False)
test_idx = unpivot_timeseries_dataframe(test.iloc[:, :n_timesteps_per_output]).index

#features and target
X_train = train.iloc[:,n_timesteps_per_output:]
y_train = train.iloc[:,:n_timesteps_per_output]

X_test = test.iloc[:,n_timesteps_per_output:]
y_test = test.iloc[:,:n_timesteps_per_output]


scaler_features = MinMaxScaler()
X_train[X_train.columns] = scaler_features.fit_transform(X_train[X_train.columns])
X_test[X_test.columns] = scaler_features.transform(X_test[X_test.columns])

#-----------------------#

In [10]:
X_test

Unnamed: 0,peak_count_lag_1,height_highest_peak_lag_1,time_highest_peak_lag_1,height_second_highest_peak_lag_1,time_second_highest_peak_lag_1,load_of_last_day,load_of_same_day_last_week,days_until_next_holiday,days_since_last_holiday,day_of_week_sin,day_of_week_cos,month_sin,month_cos,is_weekend
2022-11-25,0.166667,0.271966,0.400000,0.633081,0.273684,0.403937,0.442624,0.128713,0.237624,0.277479,0.000000,0.25,0.933013,0.0
2022-11-26,0.333333,0.189569,0.305263,0.653486,0.494737,0.429488,0.528913,0.118812,0.247525,0.000000,0.356896,0.25,0.933013,1.0
2022-11-27,0.333333,0.246689,0.389474,0.685829,0.168421,0.418183,0.533166,0.108911,0.257426,0.099031,0.801938,0.25,0.933013,1.0
2022-11-28,0.333333,0.620460,0.578947,0.711662,0.705263,0.479039,0.502134,0.099010,0.267327,0.500000,1.000000,0.25,0.933013,0.0
2022-11-29,0.333333,0.396671,0.378947,0.685847,0.663158,0.493858,0.570835,0.089109,0.277228,0.900969,0.801938,0.25,0.933013,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-03,0.166667,0.556980,0.221053,0.710922,0.747368,0.558307,1.084563,0.346535,0.019802,0.277479,0.000000,0.25,0.933013,0.0
2023-11-04,0.166667,0.308650,0.978947,0.636569,0.842105,0.475298,0.535451,0.336634,0.029703,0.000000,0.356896,0.25,0.933013,1.0
2023-12-01,0.166667,0.409543,0.852632,0.686741,0.757895,0.458552,0.591410,0.069307,0.297030,0.277479,0.000000,0.50,1.000000,0.0
2023-12-02,0.500000,0.703370,0.600000,0.825139,0.863158,0.454328,0.604623,0.059406,0.306931,0.000000,0.356896,0.50,1.000000,1.0


In [12]:
param_grid = {
    'tree_method': ['gpu_hist'], 
    'predictor': ['gpu_predictor'],
    'objective': ['reg:pseudohubererror'],
    'learning_rate': [0.01,0.1, 0.3],
    'n_estimators': [100, 300, 500],
    'reg_lambda': [0, 0.1, 0.3],
    'max_depth': [12],
    'subsample': [0.7]
}

model_params = { 
                'tree_method': 'gpu_hist', 
                'predictor': 'gpu_predictor',
                'objective': 'reg:pseudohubererror',
                'n_estimators': 1500,
                'max_depth': 12,
                'learning_rate': 0.05,
                'reg_lambda': 0.8,
    }



In [13]:

xgb = XGBRegressor(**model_params)

# model evaluation

dtw_scorer = make_scorer(dtw_metric, greater_is_better=False)

random_search = RandomizedSearchCV(xgb, param_distributions=param_grid,
                                   n_iter=2, scoring=dtw_scorer, n_jobs=-1, cv=3, verbose=3, random_state=42)

random_search.fit(X_train, y_train)

model_1 = random_search.best_estimator_


Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [15]:
random_search.best_params_

# decent params: 

# params = {'tree_method': 'gpu_hist',
#  'subsample': 0.7,
#  'reg_lambda': 0.3,
#  'predictor': 'gpu_predictor',
#  'objective': 'reg:pseudohubererror',
#  'n_estimators': 500,
#  'max_depth': 12,
#  'learning_rate': 0.01}

{'tree_method': 'gpu_hist',
 'subsample': 0.7,
 'reg_lambda': 0.3,
 'predictor': 'gpu_predictor',
 'objective': 'reg:pseudohubererror',
 'n_estimators': 500,
 'max_depth': 12,
 'learning_rate': 0.01}

In [14]:

predictions = model_1.predict(X_test)
predictions_reshaped = post_process_xgb_predictions(predictions, config.boxcox, scaler, config.lam)
gt = df_first.loc[test_idx].values.flatten()

#-----------------------#

# Preparing the data for plotting and metrics

# -----------------------#

# per timestep
df_compare = pd.DataFrame({
                        'gt':gt,
                        'per_timestep': predictions_reshaped
                        }
                        )


px.line(df_compare, x=df_compare.index, y=['gt', 'per_timestep'], title="Per timestep predictions")


In [None]:

# save the model to disk

filename = 'finalized_model.sav'
pickle.dump(model_1, open(filename, 'wb'))

### Benchmarks

In [None]:
# create the persistence benchmarks
gt_long = df_first.loc[test_idx]
benchmark_day = gt_long.shift(n_timesteps_per_day).fillna(0).values.flatten() # previous day
benchmark_week = gt_long.shift(n_timesteps_per_day*7).fillna(0).values.flatten() # same day of previous week
gt = gt_long.values.flatten()