# 1. Setup

In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path.
# Presumably this is what lets the project-local `utils.*` imports in a later
# cell resolve when the notebook runs from its own folder — confirm.
sys.path.append(str(Path().resolve().parent))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series, Timedelta
from sklearn.metrics import mean_absolute_error as MAE
from xgboost import XGBRegressor

In [None]:
from utils.loading import load_all_raw_data
from utils.preprocessing import process_all_dfs
from utils.merging import merge_all_dfs
# NOTE(review): the star import hides where names come from. The helpers used
# later without any other visible definition (add_dst_flag,
# add_cyclic_datetime_features, get_lag, get_moving_average,
# split_by_equal_days) presumably come from this module — confirm and consider
# importing them explicitly.
from utils.feature_engineering import *

In [None]:
def _format_float(value):
    """Format floats for display: tiny non-zero magnitudes in scientific notation, the rest with 2 decimals."""
    use_scientific = abs(value) < 0.01 and value != 0
    return f"{value:.2e}" if use_scientific else f"{value:.2f}"


pd.set_option("display.float_format", _format_float)

# Show up to 100 columns and 100 rows before pandas truncates the output.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, 100)

In [None]:
# Input data locations, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw_data/"
ADDITIONAL_DATA_PATH = "../data/additional_data/"

# Column groups; each one extends the previous group by one column.
SEGMENT_C = ["county", "product_type", "is_business"]
CATEGORICAL_C = [*SEGMENT_C, "is_consumption"]
# Key columns used when merging lag / moving-average features back into `df`.
TARGET_C = [*CATEGORICAL_C, "datetime"]

# Seed passed to the model as `random_state`.
RAND = 10

In [None]:
# Load every raw input and run the project preprocessing over it.
# `processed_dfs` is indexed by name further down (e.g. processed_dfs["train"]).
processed_dfs = process_all_dfs(
    load_all_raw_data(RAW_DATA_PATH, ADDITIONAL_DATA_PATH)
)

# Debug aid: uncomment to list the available tables.
# processed_dfs.keys()

# Build the single modelling frame, then add calendar features.
# NOTE(review): `df` is rebound three times in this cell; the exact columns
# added or removed by the helpers (including what `drop_raw=True` removes) are
# defined in `utils` and not visible here — confirm against the helper source.
df = merge_all_dfs(processed_dfs, how="left")
df = add_dst_flag(df)
df = add_cyclic_datetime_features(df, drop_raw=True)

In [None]:
# Left-join one lagged-target feature per lag value onto `df`, keyed on TARGET_C.
# Presumably `lag` is expressed in days and each call yields a column such as
# `2d_lag_target` (that name is used by the ratio features below) — confirm
# against `get_lag`.
# NOTE(review): this cell rebinds `df`, so re-running it without re-running the
# cells above will merge the lag columns a second time.
# Wider sweep kept for reference: for lag in range(2, 15):
for lag in [2, 3, 7]:
    df = df.merge(
        get_lag(processed_dfs["train"][TARGET_C + ["target"]], lag=lag),
        how="left",
        on=TARGET_C,
    )

In [None]:
# Left-join one moving-average-of-target feature per window onto `df`.
# Windows are 24, 72, 168 and 336 — presumably hours, i.e. 1, 3, 7 and 14 days
# for hourly data — confirm.
# NOTE(review): verify that `get_moving_average` offsets its window so the
# current row's own target is not part of the average; otherwise these
# features leak the label.
for window in [24, 24 * 3, 24 * 7, 24 * 14]:
    # Wider sweep kept for reference: for window in [24 * i for i in range(1, 15)]:
    df = df.merge(
        get_moving_average(
            # The helper receives a groupby object: the train table indexed and
            # sorted by datetime, grouped per CATEGORICAL_C combination.
            processed_dfs["train"]
            .set_index("datetime")
            .sort_index()
            .groupby(CATEGORICAL_C, observed=True, as_index=False),
            columns=["target"],
            window=window,
            # Disabled variant: `).dropna(),` applied to the helper's result.
        ),
        how="left",
        on=TARGET_C,
    )

In [None]:
# Ratio features, each stored as float32: new column -> (numerator, denominator).
# NOTE(review): a zero denominator yields inf here — confirm that
# `installed_capacity` / `eic_count` cannot be 0.
ratio_specs = {
    "t_over_cap": ("2d_lag_target", "installed_capacity"),
    "t_over_eic": ("2d_lag_target", "eic_count"),
    "cap_per_eic": ("installed_capacity", "eic_count"),
}
for ratio_name, (numerator_col, denominator_col) in ratio_specs.items():
    ratio = df[numerator_col] / df[denominator_col]
    df[ratio_name] = ratio.astype("float32")

In [None]:
# Columns removed from the train / validation frames before fitting
# (see `objective`); the model never sees them as features.
FEATURES_TO_DROP = ["datetime", "data_block_id", "date"]

In [None]:
# Number of validation folds requested from `split_by_equal_days`.
VAL_SPLITS = 2
# Forecast horizon passed as `fh` to `split_by_equal_days`
# (presumably in days — confirm against the helper).
FH = 7 # weekly retraining

In [None]:
# Chronological split of the covered datetime range:
# train 64% - validation 16% - test 20%

In [None]:
# Validation starts 64% and test starts 80% of the way through the covered
# datetime span; both cut points are normalized to midnight.
dt_first = df["datetime"].min()
dt_span = df["datetime"].max() - dt_first

val_dt_start = (dt_first + dt_span * 0.64).normalize()
test_dt_start = (dt_first + dt_span * 0.8).normalize()

print(f"Validation period starts: {val_dt_start}")
print(f"Test period starts: {test_dt_start}")

Validation period starts: 2022-10-14 00:00:00
Test period starts: 2023-01-24 00:00:00


In [None]:
# Whole days between the first timestamp and the last hour before validation.
last_train_hour = val_dt_start - Timedelta(hours=1)
train_days_range = (last_train_hour - df["datetime"].min()).days
train_days_range

407

In [None]:
# Only timestamps before the test period may take part in the CV splits.
pre_test_dt = df.loc[df["datetime"] < test_dt_start, "datetime"]

# Non-expanding splits: every fold requests `train_days_range` training days
# and an `FH` horizon, as passed to the project splitter below.
splits_fixed = split_by_equal_days(
    dt=pre_test_dt,
    train_days=train_days_range,
    fh=FH,
    n_splits=VAL_SPLITS,
    expanding=False,
)

# new_splits = []

# for i, d in enumerate(splits_fixed):
#     count = len(
#         df.loc[
#             (df["datetime"] >= d["val"][0]) & (df["datetime"] <= d["val"][1])
#         ]
#     )
#     if count < 24:
#         print(f"Split {i} only has {count} rows, removed")
#     else:
#         new_splits.append(d)

# splits_fixed = new_splits
# for i, d in enumerate(splits_fixed):
#     print(i, "train", d["train"])
#     print(i, "valid", d["val"])

In [None]:
# xgb_mae_history = []

# xgb_p = {
#     "n_estimators": 100,
#     "learning_rate": 0.1,
#     "max_depth": 7,
#     "random_state": RAND,
#     "subsample": 0.8,
#     "colsample_bytree": 0.8,
#     "objective": "reg:absoluteerror",
#     "enable_categorical": True,
#     "early_stopping_rounds": 20,
#     "eval_metric": "mae",
#     "n_jobs": -1,
# }
# xgbr = XGBRegressor(**xgb_p)

# for split in splits_fixed:
#     df_train = df[
#         (df["datetime"] >= split["train"][0])
#         & (df["datetime"] <= split["train"][1])
#     ].drop(FEATURES_TO_DROP, axis=1)
#     df_val = df[
#         (df["datetime"] >= split["val"][0])
#         & (df["datetime"] <= split["val"][1])
#     ].drop(FEATURES_TO_DROP, axis=1)

#     X_train, y_train = (
#         df_train.drop(["target"], axis=1),
#         df_train["target"],
#     )
#     X_val, y_val = df_val.drop(["target"], axis=1), df_val["target"]
#     # X_test, y_test = df_test.drop(["target"], axis=1), df_test["target"]

#     eval_set = [
#         (X_train, y_train),
#         (X_val, y_val),
#         # (X_test, y_test)
#     ]
#     # print(f"{i+1} split")

#     # Naive baseline
#     naive_mae.append(
#         {
#             "validation_0": MAE(
#                 eval_set[0][1].loc[X_train["2d_lag_target"].notna()],
#                 eval_set[0][0]["2d_lag_target"].loc[
#                     X_train["2d_lag_target"].notna()
#                 ],
#             ),
#             "validation_1": MAE(
#                 eval_set[1][1].loc[X_val["2d_lag_target"].notna()],
#                 eval_set[1][0]["2d_lag_target"].loc[
#                     X_val["2d_lag_target"].notna()
#                 ],
#             ),
#         }
#     )

#     XGBRegressor.fit(
#         X_train,
#         y_train,
#         eval_set=eval_set,
#         verbose=0,
#         # verbose=25,
#     )
#     xgb_mae_history.append(model.evals_result())

In [None]:
# NOTE(review): OPTUNA_ESTIMATORS is not referenced by any visible cell —
# `objective` samples `n_estimators` from the trial instead. Wire it in or
# remove it.
OPTUNA_ESTIMATORS = 1000
# Early-stopping patience handed to XGBRegressor as `early_stopping_rounds`.
OPTUNA_ESR = 50
# Passed as `verbose` to `model.fit` inside `objective`.
VERBOSE = 100

In [None]:
def _fold_xy(start, end):
    """Return ``(X, y)`` for the rows of ``df`` whose datetime lies in ``[start, end]``.

    Both bounds are inclusive. The columns in ``FEATURES_TO_DROP`` are removed,
    and ``target`` is split off as ``y``.
    """
    in_window = (df["datetime"] >= start) & (df["datetime"] <= end)
    fold = df[in_window].drop(FEATURES_TO_DROP, axis=1)
    return fold.drop(["target"], axis=1), fold["target"]


def objective(trial):
    """Optuna objective: mean validation MAE of an XGBoost regressor across CV folds.

    Samples ``n_estimators``, ``learning_rate`` and ``max_depth`` from the
    trial; every other hyperparameter is fixed. The model is refit on each
    split produced by ``split_by_equal_days`` over the pre-test period and
    scored with MAE on that split's validation window.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: The mean of the per-fold validation MAE values.

    Raises:
        ValueError: If the splitter yields no folds, since there is nothing
            to score in that case.

    Notes:
        * The validation fold is the last entry of ``eval_set``, so it also
          drives early stopping; the reported MAE is therefore somewhat
          optimistic.
        * This function never calls ``trial.report`` / ``trial.should_prune``,
          so a pruner configured on the study has no effect.
    """
    params = {
        # Tuned hyperparameters.
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000, step=500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        # Fixed settings.
        "early_stopping_rounds": OPTUNA_ESR,
        "random_state": RAND,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "objective": "reg:absoluteerror",
        "eval_metric": "mae",
        "enable_categorical": True,
        "n_jobs": -1,
    }
    model = XGBRegressor(**params)

    # Use the shared FH constant so these folds match the notebook-level
    # `splits_fixed` configuration instead of a second hard-coded horizon.
    splits = split_by_equal_days(
        dt=df.loc[df["datetime"] < test_dt_start, "datetime"],
        train_days=train_days_range,
        fh=FH,
        n_splits=VAL_SPLITS,
        expanding=False,
    )

    # Collect scores in a list: a pre-sized `np.empty(VAL_SPLITS)` would average
    # uninitialised memory if fewer folds come back, and overflow if more do.
    fold_mae = []
    for split in splits:
        X_train, y_train = _fold_xy(split["train"][0], split["train"][1])
        X_val, y_val = _fold_xy(split["val"][0], split["val"][1])

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=VERBOSE,
        )
        fold_mae.append(MAE(y_val, model.predict(X_val)))

    if not fold_mae:
        raise ValueError("split_by_equal_days produced no splits to evaluate")

    return float(np.mean(fold_mae))

In [None]:
# Seed the sampler with RAND so the sequence of suggested hyperparameters is
# reproducible across kernel restarts. TPESampler is Optuna's default sampler
# for single-objective studies, so only the seeding changes.
# NOTE(review): the pruner below is inert as long as `objective` never calls
# `trial.report()` / `trial.should_prune()` — either report per-fold scores
# from the objective or drop the pruner.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RAND),
    pruner=optuna.pruners.SuccessiveHalvingPruner(),
    study_name="xgb_optuna",
)

# n_jobs=1 keeps trials sequential, which seeded sampling needs to stay
# reproducible.
study.optimize(objective, n_trials=5, show_progress_bar=True, n_jobs=1)
# Original timing note: ">1h 10 n_jobs=-1" (presumably an earlier run took
# over an hour with n_jobs=-1 — confirm).

In [None]:
# Parallel-coordinate view of the study's trials; as the cell's last
# expression, the returned figure is what gets displayed.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# for split in [[xgb_mae_history[i]["validation_1"]["mae"][-1] for i in range(split * 5, split * 5 + 5)] for split in range(4)]:
#     print(np.round(np.mean(split), 3))

# for split in [[lgbm_mae_history[i]["valid_1"]["l1"][-1] for i in range(split * 5, split * 5 + 5)] for split in range(4)]:
#     print(np.round(np.mean(split), 3))

# for split in [[cb_mae_history[i]["validation_1"]["MAE"][-1] for i in range(split * 5, split * 5 + 5)] for split in range(4)]:
#     print(np.round(np.mean(split), 3))

# [[i for i in range(split * 5, split * 5 + 5)] for split in range(4)]

# residuals = y_test - y_pred

# plt.scatter(y_pred, residuals, alpha=0.5)
# plt.axhline(0, color="red", linestyle="--")
# plt.xlabel("Predicted")
# plt.ylabel("Residuals")
# plt.title("Residuals vs Predicted")
# plt.show()

# fi = pd.Series(xgbr.feature_importances_, index=X_train.columns)
# fi = fi.sort_values(ascending=False).head(20)

# fi.plot.barh(figsize=(8,6))
# plt.xlabel("Feature Importance")
# plt.gca().invert_yaxis()
# plt.title("Top 20 Important Features")
# plt.show()