#### XGBoost Regressor

In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from datetime import datetime, timedelta, date, time
import calendar
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
color_pal = sns.color_palette()

#### Set parameters

In [5]:
# define model name for result files
model_name = "XGBoost"  

# define quantiles 
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]

# set parametrs for model training and evaluation
test_start_date = "2024-02-22"  # test/evaluation data start
test_size = 168                 # prediction intervall for one split (168 hours = 1 week)
n_splits = 52                   # number of splits for TimeSeriesSplit

# Set Quantil-training data size in hours which is used for evaluation and final prediction
train_windows = {
    "q1_train_window": 8760 * 7,  # hours
    "q2_train_window": 8760 * 7,  # hours
    "q3_train_window": 8760 * 7,  # hours
    "q4_train_window": 8760 * 7,  # hours
    "q5_train_window": 8760 * 7,  # hours
}

# Switch features on or off
time_based_features = 2         # 0 for no time based features, 1 for dummy variables, 2 for categorical variables
lag_1week = 1                   # 0 for no lag features, 1 for lag features
lag_2week_mean = 1              # 0 for no lag features, 1 for lag features
lag_4week_mean = 1             # 0 for no lag features, 1 for lag features

In [6]:
# model parameters for each quantile
quantile_params = {
0.025: {'colsample_bytree': 0.624208763235312,
         'gamma': 0.0145891331311504,
         'learning_rate': 0.0474215920097818,
         'max_depth': 3,
         'min_child_weight': 10,
         'n_estimators': 273,
         'reg_alpha': 9.260376159150187,
         'reg_lambda': 6.670148918073076,
         'subsample': 0.624208763235312},
 0.25: {'colsample_bytree': 0.8945737421719431,
        'gamma': 4.616670111074299,
        'learning_rate': 0.0100758033430619,
        'max_depth': 4,
        'min_child_weight': 5,
        'n_estimators': 1559,
        'reg_alpha': 1.1010644173403994,
        'reg_lambda': 0.4844650974192357,
        'subsample': 0.8945737421719431},
 0.5: {'colsample_bytree': 0.5021766877363583,
       'gamma': 2.469117654739822,
       'learning_rate': 0.0121476064807998,
       'max_depth': 15,
       'min_child_weight': 1,
       'n_estimators': 2754,
       'reg_alpha': 0.6210633503918601,
       'reg_lambda': 7.862171551394882,
       'subsample': 0.5021766877363583},
 0.75: {'colsample_bytree': 0.839927608049344,
        'gamma': 0.8171978525643473,
        'learning_rate': 0.0320710915421366,
        'max_depth': 5,
        'min_child_weight': 4,
        'n_estimators': 1937,
        'reg_alpha': 0.0107431179093526,
        'reg_lambda': 5.208242779679924,
        'subsample': 0.839927608049344},
 0.975: {'colsample_bytree': 0.6567883304387866,
         'gamma': 0.1031657186631934,
         'learning_rate': 0.0299069104096565,
         'max_depth': 3,
         'min_child_weight': 6,
         'n_estimators': 921,
         'reg_alpha': 8.672234724227451,
         'reg_lambda': 6.019190115011864,
         'subsample': 0.6567883304387866}
         
}

#### Load Data

In [7]:
df = pd.read_csv('../data/combined_data/combined_energy_data.csv', parse_dates=["Datetime"], index_col="Datetime")

# define time zone as ezrope/berlin to account for time shifts in original data
df.index = pd.to_datetime(df.index, utc=True).tz_convert('Europe/Berlin')

# convert back to utc to rmeove time shifts
df.index = df.index.tz_convert('UTC')

# remove tz awareness
df.index = df.index.tz_localize(None)

#### Prepare and Clean Data for Model

In [None]:
df = df.loc['2016-01-01':]
df.drop_duplicates(inplace=True)
nan_count = df['target'].isna().sum()
nan_indices = df[df['target'].isna()].index
full_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='H')
missing_hours = full_range.difference(df.index)
duplicates = df[df.duplicated()]

print(f"NaN count in target column: {nan_count}")
print("Index", nan_indices)
print("Missing hours:", missing_hours)
print("Duplicates:", duplicates)

Anzahl der NaN-Werte in der target-Spalte: 0
Indizes der NaN-Werte in Spalte 'A': DatetimeIndex([], dtype='datetime64[ns]', name='Datetime', freq=None)
Fehlende Stunden: DatetimeIndex([], dtype='datetime64[ns]', freq='H')
Duplikate: Empty DataFrame
Columns: [ghi, rain, target, temperature, wind_speed_100m, wind_speed_10m, public_holiday]
Index: []


#### Create Time Based Features and Lag Features

In [9]:
# function for time based dummy features
if time_based_features == 1:
    def create_time_dummies(df):

        df['hour'] = df.index.hour.astyep.astype(int)
        df['dayofweek'] = df.index.dayofweek.astype(int)
        df['month'] = df.index.month.astype(int)
        df['year'] = df.index.year.astype(int)
        
        df_dummies = pd.get_dummies(df, columns=['hour', 'dayofweek', 'month', 'year'], dtype=int, drop_first=True)
        
        return df_dummies

    df = create_time_dummies(df)

# function for categorical time based features
if time_based_features == 2:
    
    def create_time_features(df):
        df = df.copy()
        df['hour'] = df.index.hour.astype("category")
        df['dayofweek'] = df.index.dayofweek.astype("category")
        df['month'] = df.index.month.astype("category")
        df['year'] = df.index.year.astype("category")
        df['weekofyear'] = df.index.isocalendar().week.astype("category")
    
        return df

    df = create_time_features(df)

In [10]:
# Lag 1 week, mean 2 week, mean 4 week
if lag_1week == 1:
    df['lag_1week'] = df['target'].shift(168)

if lag_2week_mean == 1:
    df['lag_2week_mean'] = (
        df['target'].shift(168) + 
        df['target'].shift(2*168)
    ) / 2

if lag_4week_mean == 1:
    df['lag_4week_mean'] = (
        df['target'].shift(168) + 
        df['target'].shift(2*168) +
        df['target'].shift(3*168) +
        df['target'].shift(4*168)
    ) / 4

# Function to add shifted rolling mean features ---
def add_rolling_mean_shifted(df, column, shift_hours, window_hours, name):
    
    df[name] = df[column].shift(shift_hours).rolling(window=window_hours).mean()
    return df

# Add Shifted Rolling Means (all shifted 1 week back)
rolling_configs = [
    (168, 24, 'rolling_1day_shifted'),
    (168, 168, 'rolling_1week_shifted'),
    (168, 336, 'rolling_2week_shifted'),
    (168, 502, 'rolling_3week_shifted'),
    (168, 672, 'rolling_4week_shifted'),
    (168, 1440, 'rolling_2month_shifted'),
    (168, 2160, 'rolling_3month_shifted'),
    (168, 8760, 'rolling_1year_shifted')
]

for shift, window, name in rolling_configs:
    df = add_rolling_mean_shifted(df, column='target', shift_hours=shift, window_hours=window, name=name)

In [11]:
 # Feature scaling
columns_to_scale = ['ghi', 'rain', 'temperature', 'wind_speed_100m','wind_speed_10m','lag_1week', 'lag_2week_mean', 'lag_4week_mean',
                            'rolling_1day_shifted','rolling_1week_shifted', 'rolling_2week_shifted', 'rolling_3week_shifted', 'rolling_4week_shifted',
                              'rolling_2month_shifted', 'rolling_3month_shifted', 'rolling_1year_shifted']

#### Rolling Window and Model

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

data = df.copy()

test_start_date = pd.Timestamp(test_start_date)
test_size = test_size
n_splits = n_splits

results = pd.DataFrame(index=data.loc[test_start_date:].index)
results["target"] = data["target"].reindex(results.index)

# Loop quantiles
for i, quantile in enumerate(quantiles):
    train_window = train_windows[f"q{i + 1}_train_window"]
    predictions = []
    param_set = quantile_params[quantile]

    for split in range(n_splits):
        test_start = test_start_date + pd.Timedelta(hours=split * test_size)
        test_end = test_start + pd.Timedelta(hours=test_size - 1)

        train_end = test_start - pd.Timedelta(hours=1)
        train_start = max(train_end - pd.Timedelta(hours=train_window), data.index[0])

        train_data = data.loc[train_start:train_end]
        test_data = data.loc[test_start:test_end]

        X_tr = train_data.drop(columns=["target"])
        y_tr = train_data["target"]
        X_test = test_data.drop(columns=["target"])
        y_test = test_data["target"]

        time_features = ["hour", "dayofweek", "month", "year", "weekofyear"]
        for col in time_features:
            if col in X_tr.columns:
                X_tr[col] = X_tr[col].astype("category")
                X_test[col] = X_test[col].astype("category")

        scaler = StandardScaler()
        X_tr[columns_to_scale] = scaler.fit_transform(X_tr[columns_to_scale])
        X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

        model = XGBRegressor(
            objective="reg:quantileerror",
            quantile_alpha=quantile,
            learning_rate=param_set["learning_rate"],
            max_depth=param_set["max_depth"],
            subsample=param_set["subsample"],
            colsample_bytree=param_set["colsample_bytree"],
            gamma=param_set["gamma"],
            min_child_weight=param_set["min_child_weight"],
            reg_alpha=param_set["reg_alpha"],
            reg_lambda=param_set["reg_lambda"],
            n_estimators=param_set["n_estimators"],
            tree_method="hist",
            enable_categorical=True,
            random_state=42
        )

        model.fit(X_tr, y_tr, verbose=False)

        y_pred = model.predict(X_test)

        pred_df = pd.DataFrame({
            "index": test_data.index,
            f"q{quantile}": y_pred
        }).set_index("index")

        results.loc[pred_df.index, f"q{quantile}"] = pred_df[f"q{quantile}"]

In [13]:
results.dropna(subset=[col for col in results.columns if col.startswith("q")], inplace=True)

# sort quantile columns if quantile crossing occurs
def fix_quantile_crossing(results):
    
    quantile_columns = [col for col in results.columns if col.startswith('q')]
    
    for idx in results.index:
        sorted_values = sorted(results.loc[idx, quantile_columns].values)
        results.loc[idx, quantile_columns] = sorted_values
    
    return results

In [14]:
# safe results

folder = "results"
os.makedirs(folder, exist_ok=True)
results.to_csv(f"{folder}/{model_name}.csv", index=True)

#### Evaluation

In [15]:
# calucate quantile losses of all predictions
quantile_losses = {}

for q in quantiles:
    
    y_pred = results[f'q{q}']
    y_true = results['target']
    
    # pinball loss function multiplied by 2
    quantile_loss = np.where(y_pred > y_true, 
                             2 * (1 - q) * (y_pred - y_true), 
                             2 * q * (y_true - y_pred))
    
    quantile_losses[f'Quantile_{q}'] = quantile_loss.mean()

# losses of all quantile
total_loss_score = sum(quantile_losses.values())

# show results
print("Average loss by quantile:")
for quantile, loss in quantile_losses.items():
    print(f"{quantile}: {loss}")

print(f"\nTotal loss score over all quantiles: {total_loss_score}")

Average loss by quantile:
Quantile_0.025: 0.3279046904582244
Quantile_0.25: 1.3962762908760882
Quantile_0.5: 1.4854316157770682
Quantile_0.75: 1.2452596906620066
Quantile_0.975: 0.2675266956138962

Total loss score over all quantiles: 4.722398983387284


In [16]:
# filter only relevant target horizons
results['hour'] = results.index.hour
results['dayofweek'] = results.index.dayofweek

horizons_dict = {}

# target horizons mapping with dayofweek and hour
target_horizons = [
    {"dayofweek": 4, "hour": 12, "name": "36"},  # Freitag 12:00 Stunde: 36
    {"dayofweek": 4, "hour": 16, "name": "40"},  # Freitag 16:00 Stunde: 40
    {"dayofweek": 4, "hour": 20, "name": "44"},  # Freitag 20:00 Stunde: 44
    {"dayofweek": 5, "hour": 12, "name": "60"},  # Samstag 12:00 Stunde: 60
    {"dayofweek": 5, "hour": 16, "name": "64"},  # Samstag 16:00 Stunde: 64
    {"dayofweek": 5, "hour": 20, "name": "68"},  # Samstag 20:00 Stunde: 68
]

# filter results for target horizons
for horizon in target_horizons:
    horizon_data = results[(results["dayofweek"] == horizon["dayofweek"]) & (results["hour"] == horizon["hour"])]
    horizon_data = horizon_data.drop(columns=["hour", "dayofweek"])

    horizons_dict[horizon["name"]] = horizon_data

In [17]:
# quantile losses target horizons
def calculate_quantile_losses(horizons_dict, quantiles):
    all_quantile_losses = {}
    
    for key, df in horizons_dict.items():
        quantile_losses = {}
        for q in quantiles:
            y_pred = df[f'q{q}']
            y_true = df['target']
            quantile_loss = np.where(y_pred > y_true, 2 * (1 - q) * (y_pred - y_true), 2 * q * (y_true - y_pred))
            quantile_losses[f'q{q}'] = quantile_loss.mean()
        
        total_loss_score = sum(quantile_losses.values())
        quantile_losses['Total_Loss_Score'] = total_loss_score
        all_quantile_losses[key] = quantile_losses
    
    return all_quantile_losses

quantile_loss_results = calculate_quantile_losses(horizons_dict, quantiles)

horizon_results_df = pd.DataFrame(quantile_loss_results).T
horizon_results_df

Unnamed: 0,q0.025,q0.25,q0.5,q0.75,q0.975,Total_Loss_Score
36,0.541793,1.855171,1.666464,1.325239,0.273044,5.661712
40,0.398925,1.660752,1.511442,1.193345,0.252959,5.017422
44,0.218967,1.078848,1.14659,0.975798,0.308522,3.728725
60,0.292899,1.279497,1.602615,1.400417,0.316587,4.892016
64,0.229682,1.122383,1.318611,1.151482,0.225665,4.047822
68,0.249951,1.062891,1.264852,1.029637,0.171857,3.779188


#### Final Evaluation Score

In [18]:
horizon_results_df.sum()

q0.025               1.932216
q0.25                8.059543
q0.5                 8.510575
q0.75                7.075917
q0.975               1.548634
Total_Loss_Score    27.126885
dtype: float64