# 1. Setup

In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path().resolve().parent))

In [None]:
import json
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from optuna.samplers import TPESampler
from pandas import DataFrame, Series
from sklearn.metrics import mean_absolute_error as MAE
from xgboost import DMatrix

In [None]:
from utils.loading import load_all_raw_data
from utils.preprocessing import process_all_dfs
from utils.merging import merge_all_dfs
from utils.feature_engineering import *

In [None]:
def _compact_float(value: float) -> str:
    """Format floats for display: scientific notation for tiny non-zero values."""
    is_tiny = value != 0 and abs(value) < 0.01
    return f"{value:.2e}" if is_tiny else f"{value:.2f}"


# Notebook-wide pandas display settings.
pd.set_option("display.float_format", _compact_float)
for option in ("display.max_columns", "display.max_rows"):
    pd.set_option(option, 100)

In [None]:
# Locations of the input data, relative to the notebook directory.
RAW_DATA_PATH = "../data/raw_data/"
ADDITIONAL_DATA_PATH = "../data/additional_data/"

# Column groups, built incrementally: each list extends the previous one.
SEGMENT_C = ["county", "product_type", "is_business"]
CATEGORICAL_C = [*SEGMENT_C, "is_consumption"]
# Merge keys used when joining target-derived features back onto the frame.
TARGET_C = [*CATEGORICAL_C, "datetime"]

# Base random seed shared by the models below.
RAND = 10

In [None]:
# Load every raw table and preprocess it; the result is indexed by table name
# (processed_dfs["train"] is read again when building target-derived features).
processed_dfs = process_all_dfs(load_all_raw_data(RAW_DATA_PATH, ADDITIONAL_DATA_PATH))

# Left-merge the tables into one frame, add the DST flag, then replace the raw
# datetime parts with their cyclic encodings (drop_raw=True).
df = add_cyclic_datetime_features(
    add_dst_flag(merge_all_dfs(processed_dfs, how="left")),
    drop_raw=True,
)

In [None]:
# Join lagged copies of the target onto the feature frame, one merge per lag.
# NOTE(review): the lag unit is presumably days, since a "2d_lag_target"
# column is read later in this notebook - confirm against
# utils.feature_engineering.get_lag.
for lag in [2, 3, 7]:
    df = df.merge(
        # A fresh column slice of the train table is passed on every iteration.
        get_lag(processed_dfs["train"][TARGET_C + ["target"]], lag=lag),
        how="left",  # keep every row of df even when no lagged value matches
        on=TARGET_C,
    )

In [None]:
# Join moving averages of the target, computed per CATEGORICAL_C group on the
# datetime-sorted train table, for each window size.
# NOTE(review): window sizes are presumably in hours (1, 3, 7 and 14 days of
# hourly data) - confirm against utils.feature_engineering.get_moving_average.
for window in [24, 24 * 3, 24 * 7, 24 * 14]:
    df = df.merge(
        get_moving_average(
            processed_dfs["train"]
            .set_index("datetime")
            .sort_index()
            .groupby(CATEGORICAL_C, observed=True, as_index=False),
            columns=["target"],
            window=window,
            # NOTE: the result is merged as returned; an earlier .dropna()
            # on it has been disabled.
        ),
        how="left",  # keep every row of df even when no average matches
        on=TARGET_C,
    )

In [None]:
# Ratio features, each stored as float32: new column -> (numerator, denominator).
# NOTE(review): a zero denominator yields inf/NaN here - confirm that
# installed_capacity and eic_count are never zero, or handle downstream.
ratio_features = {
    "t_over_cap": ("2d_lag_target", "installed_capacity"),
    "t_over_eic": ("2d_lag_target", "eic_count"),
    "cap_per_eic": ("installed_capacity", "eic_count"),
}
for ratio_name, (numerator, denominator) in ratio_features.items():
    df[ratio_name] = (df[numerator] / df[denominator]).astype("float32")

# 2. Baseline Model

In [None]:
# Columns removed from the frame before it is split into features and target
# (passed as `to_drop` to drop_split / load_train_save).
FEATURES_TO_DROP = ["datetime", "data_block_id", "date"]

In [None]:
# Directory where baseline boosters and their metadata files are cached;
# created on first run.
BASELINE_MODELS_DIR = Path("../models/xgb_baseline")
BASELINE_MODELS_DIR.mkdir(parents=True, exist_ok=True)
FH = 7  # weekly retraining
ITERS = 1000  # passed as num_boost_round
VERBOSE = 0  # passed as verbose_eval
ESR = 50  # passed as early_stopping_rounds
# Untuned reference hyperparameters shared by every baseline model below.
baseline_params = {
    "learning_rate": 0.1,
    "max_depth": 7,
    "random_state": RAND,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:absoluteerror",  # optimise MAE directly
    "eval_metric": "mae",
    "tree_method": "hist",  # histogram method, required for GPU training
    "device": "cuda",  # train on the GPU
    "n_jobs": -1,
}

## Fixed vs Expanding Splits

In [None]:
# Build two families of splits from the datetime column: one with
# expanding=False ("fix") and one with expanding=True ("exp"). Each call
# returns two lists of split dicts carrying "train" and "test" bounds.
# NOTE(review): fh=30 is presumably the forecast horizon in days - confirm
# against utils.feature_engineering.get_split_bounds.
train_fix_splits, test_fix_splits = get_split_bounds(
    df["datetime"], expanding=False, fh=30
)
train_exp_splits, test_exp_splits = get_split_bounds(
    df["datetime"], expanding=True, fh=30
)

In [None]:
def drop_split(
    df: DataFrame, bounds: tuple, to_drop: list
) -> tuple[DataFrame, Series]:
    """
    Select the rows inside a datetime window and separate features from target.

    Parameters
    ----------
    df : DataFrame
        Source frame; must contain 'datetime' and 'target' columns.
    bounds : tuple
        ``(start, end)`` of the window. Both ends are inclusive.
    to_drop : list
        Names of columns to remove from the selected rows.

    Returns
    -------
    X : DataFrame
        Selected rows without the ``to_drop`` columns and without 'target'.
    y : Series
        The 'target' column of the selected rows.
    """
    lower, upper = bounds[0], bounds[1]
    in_window = df["datetime"].between(lower, upper)
    window = df.loc[in_window].drop(columns=to_drop)
    features = window.drop(columns=["target"])
    return features, window["target"]

In [None]:
def _load_cached_booster(model_path: Path, meta_path: Path, train_bounds: tuple):
    """Return the cached booster if it was trained on ``train_bounds``, else None."""
    import warnings

    if not (model_path.exists() and meta_path.exists()):
        return None
    try:
        with open(meta_path, "r", encoding="utf-8") as fin:
            meta = json.load(fin)
        same_window = meta.get("train_start") == str(
            train_bounds[0]
        ) and meta.get("train_end") == str(train_bounds[1])
        if not same_window:
            return None
        booster = xgb.Booster()
        booster.load_model(str(model_path))
    except Exception as exc:
        # Best effort: an unreadable cache must not abort the run, but the
        # fallback to retraining should be visible rather than silent.
        warnings.warn(
            f"Ignoring unusable cached model {model_path}: {exc!r}",
            RuntimeWarning,
            stacklevel=3,
        )
        return None
    return booster


def load_train_save(
    df: DataFrame,
    split: dict,
    kind: str,
    expanding: bool,
    params: dict,
    to_drop: list,
    models_dir: Path,
    i: int,
    save: bool = True,
    num_boost_round: int = 1000,
    early_stopping_rounds: int = 50,
    verbose_eval: int = 0,
):
    """
    Return a booster for one split, loading it from disk when possible.

    A cached model is reused when both ``<kind>_<fix|exp>_<i>.ubj`` and its
    ``_meta.json`` exist in ``models_dir`` and the stored training bounds
    equal ``split["train"]``. Only the training bounds are compared:
    after changing ``params``, ``to_drop`` or the features in ``df``, delete
    the cached files to force retraining.

    Parameters
    ----------
    df : DataFrame
        Full feature frame containing 'datetime' and 'target' columns.
    split : dict
        Mapping with "train" and "test" keys, each a ``(start, end)`` pair.
    kind : str
        Label used in the file names. When it contains "val", the test part
        of the split is added to the evaluation sets as "val".
    expanding : bool
        Selects the "exp" (True) or "fix" (False) file-name prefix and is
        recorded in the metadata.
    params : dict
        Parameters forwarded to ``xgb.train``.
    to_drop : list
        Columns removed before building the matrices.
    models_dir : Path
        Directory holding the cached models and metadata.
    i : int
        Index of the split, used in the file names.
    save : bool
        Whether to write a newly trained model and its metadata to disk.
    num_boost_round : int
        Forwarded to ``xgb.train``.
    early_stopping_rounds : int
        Forwarded to ``xgb.train``. When "val" is not in ``kind`` the only
        evaluation set is the training data itself.
    verbose_eval : int
        Forwarded to ``xgb.train``.

    Returns
    -------
    booster : xgb.Booster
        Loaded or newly trained model.
    dtest : DMatrix
        Matrix built from the test part of the split (with labels).
    y_test : Series
        Target values of the test part of the split.
    """
    exp_prefix = "exp" if expanding else "fix"
    model_path = models_dir / f"{kind}_{exp_prefix}_{i}.ubj"
    meta_path = models_dir / f"{kind}_{exp_prefix}_{i}_meta.json"

    booster = _load_cached_booster(model_path, meta_path, split["train"])

    X_test, y_test = drop_split(df, split["test"], to_drop)
    dtest = DMatrix(X_test, y_test, enable_categorical=True)
    del X_test  # the DMatrix holds the data from here on

    if booster is None:
        X_train, y_train = drop_split(df, split["train"], to_drop)
        dtrain = DMatrix(X_train, y_train, enable_categorical=True)
        del X_train, y_train

        evals = [(dtrain, "train")]
        if "val" in kind:
            evals.append((dtest, "val"))

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
        )

        if save:
            booster.save_model(str(model_path))
            meta = {
                "train_start": str(split["train"][0]),
                "train_end": str(split["train"][1]),
                "kind": kind,
                "expanding": str(expanding),
            }
            with open(meta_path, "w", encoding="utf-8") as fout:
                json.dump(meta, fout, ensure_ascii=False, indent=2)

    return booster, dtest, y_test

In [None]:
# One entry per experiment: (kind label, expanding flag, list of splits).
splits_list = [
    ("baseline_val", False, train_fix_splits),
    ("baseline_test", False, test_fix_splits),
    ("baseline_val", True, train_exp_splits),
    ("baseline_test", True, test_exp_splits),
]
# mae_baseline[k][i] holds the MAE of split i in experiment k of splits_list.
mae_baseline = [
    np.empty(len(fold_splits)) for _, _, fold_splits in splits_list
]

for i_sample, (kind, expanding, splits) in enumerate(splits_list):
    fold_scores = mae_baseline[i_sample]
    for i, split in enumerate(splits):
        # Train the model for this split, or reuse the cached one.
        booster, dtest, y_test = load_train_save(
            df=df,
            split=split,
            kind=kind,
            expanding=expanding,
            params=baseline_params,
            to_drop=FEATURES_TO_DROP,
            models_dir=BASELINE_MODELS_DIR,
            i=i,
            save=True,
            num_boost_round=ITERS,
            early_stopping_rounds=ESR,
            verbose_eval=VERBOSE,
        )

        preds = booster.predict(dtest)
        fold_scores[i] = MAE(y_test, preds)

In [None]:
# Indices follow `splits_list`: entry 1 is ("baseline_test", fixed window) and
# entry 3 is ("baseline_test", expanding window), so these are the mean MAEs
# over the *test* splits, not the validation ones (those are entries 0 and 2).
print("Test MAE on fixed train data:", np.mean(mae_baseline[1]))
print("Test MAE on expanding train data:", np.mean(mae_baseline[3]))

Validation MAE on fixed train data: 59.38811238606771
Validation MAE on expanding train data: 58.713759104410805


Since the mean MAE obtained with the expanding training window (58.71) is lower than with the fixed-size window (59.39), the Optuna search below trains on the full history available before each evaluation period instead of applying a sliding window.

## Single XGBoost Model Baseline

In [None]:
# A single split: train from the first timestamp through March 2023 and test
# on April 2023 (see the bounds displayed below).
# NOTE(review): judging by the outputs of the get_month_splits calls in this
# notebook, the positional arguments look like
# (start, train_months, test_months, step_months, n_splits) - confirm against
# utils.feature_engineering.get_month_splits.
start_ts = df["datetime"].min()
april_test = get_month_splits(start_ts, 19, 1, 1, 1)
april_test

[{'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2023-03-31 23:00:00')),
  'test': (Timestamp('2023-04-01 00:00:00'),
   Timestamp('2023-04-30 23:00:00'))}]

In [None]:
baseline_params

{'learning_rate': 0.1,
 'max_depth': 7,
 'random_state': 10,
 'subsample': 0.8,
 'colsample_bytree': 0.8,
 'objective': 'reg:absoluteerror',
 'eval_metric': 'mae',
 'tree_method': 'hist',
 'device': 'cuda',
 'n_jobs': -1}

In [None]:
FEATURES_TO_DROP

['datetime', 'data_block_id', 'date']

In [None]:
BASELINE_MODELS_DIR

PosixPath('../models/xgb_baseline')

In [None]:
# Training settings for this section, restated so the section can be read on
# its own (same values as in the baseline configuration above).
ITERS = 1000  # num_boost_round
ESR = 50  # early_stopping_rounds
VERBOSE = 0  # verbose_eval

In [None]:
mae_baseline_test = []

In [None]:
# Train (or load from cache) one model per split and record its test MAE.
# "val" is not part of the kind label, so the test data is not used as an
# evaluation set during training.
for i, split in enumerate(april_test):
    booster, dtest, y_test = load_train_save(
        df=df,
        split=split,
        kind="test_2m",
        expanding=True,
        params=baseline_params,
        to_drop=FEATURES_TO_DROP,
        models_dir=BASELINE_MODELS_DIR,
        i=i,
        save=True,
        num_boost_round=ITERS,
        early_stopping_rounds=ESR,
        verbose_eval=VERBOSE,
    )
    preds = booster.predict(dtest)
    split_mae = MAE(y_test, preds)
    mae_baseline_test.append(split_mae)

In [None]:
mae_baseline_test

[72.80978393554688]

## Separate XGBoost Models for Consumption and Production

In [None]:
# Four expanding splits, each tested on the following month (February to May
# 2023, see the bounds displayed below).
# NOTE(review): the positional arguments presumably mean
# (start, train_months, test_months, step_months, n_splits) - confirm against
# utils.feature_engineering.get_month_splits.
start_ts = df["datetime"].min()
splits = get_month_splits(start_ts, 17, 1, 1, 4)
splits

[{'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2023-01-31 23:00:00')),
  'test': (Timestamp('2023-02-01 00:00:00'),
   Timestamp('2023-02-28 23:00:00'))},
 {'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2023-02-28 23:00:00')),
  'test': (Timestamp('2023-03-01 00:00:00'),
   Timestamp('2023-03-31 23:00:00'))},
 {'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2023-03-31 23:00:00')),
  'test': (Timestamp('2023-04-01 00:00:00'),
   Timestamp('2023-04-30 23:00:00'))},
 {'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2023-04-30 23:00:00')),
  'test': (Timestamp('2023-05-01 00:00:00'),
   Timestamp('2023-05-31 23:00:00'))}]

In [None]:
baseline_params

{'learning_rate': 0.1,
 'max_depth': 7,
 'random_state': 10,
 'subsample': 0.8,
 'colsample_bytree': 0.8,
 'objective': 'reg:absoluteerror',
 'eval_metric': 'mae',
 'tree_method': 'hist',
 'device': 'cuda',
 'n_jobs': -1}

In [None]:
FEATURES_TO_DROP

['datetime', 'data_block_id', 'date']

In [None]:
BASELINE_MODELS_DIR

PosixPath('../models/xgb_baseline')

In [None]:
# Training settings for this section; unlike the previous sections,
# verbose_eval is set to 1 here.
ITERS = 1000  # num_boost_round
ESR = 50  # early_stopping_rounds
VERBOSE = 1  # verbose_eval

In [None]:
# One record per (split, target type) with the test MAE of the model trained
# only on rows of that type.
mae_split_models = []

for i, split in enumerate(splits):
    for iscons in (False, True):
        type_label = "consumption" if iscons else "production"
        booster, dtest, y_test = load_train_save(
            df=df.loc[df["is_consumption"] == iscons],
            split=split,
            kind=f"test_4m_iscons_{iscons}",
            expanding=True,
            params=baseline_params,
            to_drop=FEATURES_TO_DROP,
            models_dir=BASELINE_MODELS_DIR,
            i=i,
            save=True,
            num_boost_round=ITERS,
            early_stopping_rounds=ESR,
            verbose_eval=VERBOSE,
        )
        preds = booster.predict(dtest)
        record = {
            "split": i,
            "type": type_label,
            "mae": MAE(y_test, preds),
        }
        mae_split_models.append(record)

In [None]:
mae_split_models_df = DataFrame(mae_split_models)
mae_split_models_df

Unnamed: 0,split,type,mae
0,0,production,12.07
1,0,consumption,56.77
2,1,production,37.66
3,1,consumption,77.91
4,2,production,74.57
5,2,consumption,68.41
6,3,production,91.56
7,3,consumption,55.41


In [None]:
mae_split_models_df.groupby(["split"])["mae"].mean()

split
0   34.42
1   57.78
2   71.49
3   73.49
Name: mae, dtype: float64

In [None]:
mae_split_models_df.groupby(["type"])["mae"].mean()

type
consumption   64.63
production    53.96
Name: mae, dtype: float64

# 3. Optuna Search

In [None]:
# Three expanding splits used as cross-validation folds in the Optuna
# objective; their validation months are three months apart (September 2022,
# December 2022, March 2023 - see the bounds displayed below).
# NOTE(review): relies on `start_ts` defined in an earlier cell.
optuna_train_lv = get_month_splits(start_ts, 12, 1, 3, 3)
optuna_train_lv

[{'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2022-08-31 23:00:00')),
  'test': (Timestamp('2022-09-01 00:00:00'),
   Timestamp('2022-09-30 23:00:00'))},
 {'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2022-11-30 23:00:00')),
  'test': (Timestamp('2022-12-01 00:00:00'),
   Timestamp('2022-12-31 23:00:00'))},
 {'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2023-02-28 23:00:00')),
  'test': (Timestamp('2023-03-01 00:00:00'),
   Timestamp('2023-03-31 23:00:00'))}]

In [None]:
def objective(trial):
    """
    Optuna objective: mean validation MAE over the folds in ``optuna_train_lv``.

    For every fold a booster is trained on the fold's "train" window with the
    hyperparameters suggested for this trial and evaluated on its "test"
    window, which also serves as the early-stopping set. After each fold the
    running mean MAE is reported to the trial so that the study's pruner can
    stop unpromising trials before the remaining folds are trained.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to report intermediate values.

    Returns
    -------
    float
        Mean MAE over all folds.

    Raises
    ------
    optuna.TrialPruned
        When the pruner decides to stop the trial between folds.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 0.001, 100),
        "alpha": trial.suggest_float("alpha", 0.001, 100),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "grow_policy": trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        ),
        "tree_method": "hist",
        "device": "cuda",
        "objective": "reg:absoluteerror",
        "eval_metric": "mae",
    }
    num_boost_round = trial.suggest_int("num_boost_round", 500, 2500, step=500)

    n_folds = len(optuna_train_lv)
    cv_scores = np.empty(n_folds)

    for i, split in enumerate(optuna_train_lv):
        # A different seed per fold, derived from the shared base seed.
        params["random_state"] = RAND + i
        X_train, y_train = drop_split(df, split["train"], FEATURES_TO_DROP)
        dtrain = DMatrix(X_train, y_train, enable_categorical=True)
        del X_train, y_train

        X_val, y_val = drop_split(df, split["test"], FEATURES_TO_DROP)
        dval = DMatrix(X_val, y_val, enable_categorical=True)
        del X_val

        evals = [(dtrain, "train"), (dval, "val")]

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            evals=evals,
            num_boost_round=num_boost_round,
            early_stopping_rounds=ESR,
            verbose_eval=VERBOSE,
        )

        preds = booster.predict(dval)
        cv_scores[i] = MAE(y_val, preds)

        # Without intermediate reports the study's pruner never acts. Folds are
        # visited in the same order by every trial, so running means at the
        # same step are comparable. Pruning is skipped after the last fold
        # because the final score is already available at that point.
        trial.report(float(np.mean(cv_scores[: i + 1])), step=i)
        if i < n_folds - 1 and trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(cv_scores)

In [None]:
# SQLite file in which the Optuna study is persisted, so that the search can
# be resumed or skipped when the notebook is re-run.
STORAGE = "sqlite:///../optuna_db/optuna_study_long_val_rand_incr.db"
n_trials = 120  # target total number of trials stored in the study
n_startup_trials = 20  # passed to TPESampler(n_startup_trials=...)

In [None]:
optuna.logging.set_verbosity(optuna.logging.INFO)

In [None]:
# Create the study, or reopen it from STORAGE if it already exists.
# The pruner only takes effect if `objective` reports intermediate values
# through trial.report().
tpe_sampler = TPESampler(n_startup_trials=n_startup_trials, multivariate=True)
halving_pruner = optuna.pruners.SuccessiveHalvingPruner()
study_lvri = optuna.create_study(
    study_name="xgb_optuna",
    storage=STORAGE,
    direction="minimize",
    sampler=tpe_sampler,
    pruner=halving_pruner,
    load_if_exists=True,
)

# Every stored trial counts towards the target, whatever its state.
existing_trials = len(study_lvri.trials)

if existing_trials < n_trials:
    # Only run the trials that are still missing.
    remaining = n_trials - existing_trials
    print(f"Run {remaining} trials to reach {n_trials}")
    study_lvri.optimize(
        objective,
        n_trials=remaining,
        n_jobs=1,
        show_progress_bar=True,
    )
else:
    print("Number of existing trials >= n_trials. Skipping optimization.")

[I 2025-11-24 19:34:48,303] Using an existing study with name 'xgb_optuna' instead of creating a new one.


Number of existing trials >= n_trials. Skipping optimization.


In [None]:
study_lvri.best_value

44.08116086324056

In [None]:
# Work on a copy of the best trial's parameters: the number of boosting rounds
# is an argument of xgb.train rather than a booster parameter, so it is split
# off, and the base seed is added.
best_params = dict(study_lvri.best_params)
num_boost_round = best_params.pop("num_boost_round")
best_params["random_state"] = RAND

for param_name, param_value in best_params.items():
    print(param_name, param_value)
print(num_boost_round)

learning_rate 0.03876143929687533
max_depth 10
min_child_weight 15
subsample 0.6036800627210576
colsample_bytree 0.9850053687946877
lambda 0.8860473392127588
alpha 14.228611523759795
gamma 3.526361333846049
grow_policy lossguide
random_state 10
2500


In [None]:
april_test

[{'train': (Timestamp('2021-09-01 00:00:00'),
   Timestamp('2023-03-31 23:00:00')),
  'test': (Timestamp('2023-04-01 00:00:00'),
   Timestamp('2023-04-30 23:00:00'))}]

In [None]:
# Retrain with the best Optuna parameters on each split, save the model with
# its metadata, and record the MAE on the split's test window.
OPTUNA_MODELS_DIR = Path("../models/xgb_optuna")
OPTUNA_MODELS_DIR.mkdir(parents=True, exist_ok=True)
mae_tests = []

for i, split in enumerate(april_test):
    # --- train for a fixed number of rounds (no evaluation set is passed) ---
    X_train, y_train = drop_split(df, split["train"], FEATURES_TO_DROP)
    dtrain = DMatrix(X_train, y_train, enable_categorical=True)
    del X_train, y_train

    booster = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        verbose_eval=True,
    )
    del dtrain

    # --- persist the model and a description of how it was trained ---
    model_path = OPTUNA_MODELS_DIR / f"optuna_split_{i}.ubj"
    meta_path = OPTUNA_MODELS_DIR / f"optuna_split_{i}_meta.json"
    train_start, train_end = split["train"][0], split["train"][1]
    test_start, test_end = split["test"][0], split["test"][1]
    meta = {
        "train_start": str(train_start),
        "train_end": str(train_end),
        "test_start": str(test_start),
        "test_end": str(test_end),
        "params": best_params,
        "num_boost_round": num_boost_round,
    }

    booster.save_model(model_path)
    meta_path.write_text(
        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # --- evaluate on the held-out window ---
    X_test, y_test = drop_split(df, split["test"], FEATURES_TO_DROP)
    dtest = DMatrix(X_test, enable_categorical=True)
    del X_test
    preds = booster.predict(dtest)
    mae_tests.append(MAE(y_test, preds))

In [None]:
print(mae_tests)

[71.47074890136719]
