In [1]:
import warnings
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor
from tqdm.auto import tqdm

import optuna

# Notebook-wide plotting theme; applied first so the rcParams overrides below
# are set after the seaborn call.
sns.set(style="ticks", palette="muted", color_codes=True)

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Larger axis labels and titles for all figures.
plt.rcParams.update({
    'axes.labelsize': 15,
    'axes.titlesize': 20,
})

# Seed value intended for the stochastic steps of the notebook.
seed = 42

In [2]:
# Read the training frame from the gzip-named parquet file in the working directory.
train = pd.read_parquet(path="train_temporal_regression.parquet.gz")

In [3]:
# Load the pickled encoders object. The context manager guarantees the file
# handle is closed as soon as unpickling finishes (or fails).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load encoders.pkl when it comes from a trusted source.
with open("encoders.pkl", "rb") as unpkl_encoders:
    encoders = pickle.load(unpkl_encoders)

In [4]:
# Mean CAQI_idx per (voivodship, powiat_voivod) pair, then the single largest
# mean within each voivodship. The per-group nlargest result carries a
# three-level index; position 2 is read back as the powiat_voivod labels.
selected_powiats = (
    train
    .groupby(["voivodship", "powiat_voivod"])['CAQI_idx']
    .mean()
    .groupby("voivodship")
    .nlargest(1)
    .index
    .get_level_values(2)
)

# Restrict the training frame to rows belonging to the selected powiats.
df_subset = train.query("powiat_voivod in @selected_powiats")

In [5]:
# Fit the loaded encoders on the selected-powiat subset and transform it.
# (To work on the full training frame instead, pass `train` here.)
df = encoders.fit_transform(df_subset)

# Separate the CAQI_idx target from the remaining predictor columns.
y = df["CAQI_idx"]
X = df.drop(columns="CAQI_idx")

In [6]:
# Number of distinct powiat_voivod values in the encoded frame.
LEN = df["powiat_voivod"].unique().shape[0]
print(LEN)

16


In [7]:
# Ten expanding-window folds; max_train_size and test_size are left at their
# None defaults. The gap leaves LEN * 15 rows between each train and test fold
# (presumably 15 time steps for each of the LEN powiats, which assumes rows
# are time-ordered with one row per powiat per step: TODO confirm).
tss = TimeSeriesSplit(n_splits=10, gap=LEN * 15)

In [8]:
# Objective function minimized by Optuna
def objective(trial):
    """Return the mean cross-validated RMSE of an XGBoost model for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean root-mean-squared error over the ``tss`` folds (lower is better).
    """
    # Search space. `step` is passed by keyword and the learning rate is
    # sampled with suggest_float(log=True); the positional-step form and
    # suggest_loguniform are both deprecated in Optuna.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2, log=True),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'alpha': trial.suggest_float('alpha', 0, 10),
        'lambda': trial.suggest_float('lambda', 0, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        # Tie row/column subsampling to the notebook-level seed.
        'random_state': seed,
    }

    # XGBoost regressor configured with the sampled hyperparameters
    model = XGBRegressor(**params)

    # Score with TimeSeriesSplit cross-validation; the scorer returns
    # negative RMSE, one value per fold.
    scores = cross_val_score(model, X=X, y=y, cv=tss, scoring='neg_root_mean_squared_error')

    # Flip the sign so Optuna minimizes the mean (positive) RMSE
    return -scores.mean()

# Create an Optuna study and optimize the objective function.
# The sampler is seeded so the sequence of trials is reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=seed, warn_independent_sampling=False),
)

study.optimize(objective, n_trials=100, show_progress_bar=True)

# Print the best hyperparameters found by Optuna
print()
print('Best hyperparameters: {}'.format(study.best_params))

[32m[I 2023-03-28 13:20:25,963][0m A new study created in memory with name: no-name-01705bc3-7f4b-4b41-99de-c69b0ccf4a99[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2023-03-28 13:20:57,169][0m Trial 0 finished with value: 11.532062373612508 and parameters: {'n_estimators': 250, 'max_depth': 6, 'learning_rate': 0.007593236651807639, 'gamma': 7.3102612944746115, 'alpha': 7.980877826093808, 'lambda': 9.425851942497323, 'subsample': 0.5406880515174297, 'colsample_bytree': 0.5444078568393409, 'min_child_weight': 5}. Best is trial 0 with value: 11.532062373612508.[0m
[32m[I 2023-03-28 13:21:08,898][0m Trial 1 finished with value: 9.938962768627842 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.14465829983854817, 'gamma': 1.9226599479427042, 'alpha': 9.46131163427754, 'lambda': 0.12611600056453232, 'subsample': 0.6236650435732408, 'colsample_bytree': 0.5995389572245535, 'min_child_weight': 3}. Best is trial 1 with value: 9.938962768627842.[0m
[32m[I 2023-03-28 13:21:26,659][0m Trial 2 finished with value: 9.993587996926077 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.13828046063264504, 

[32m[I 2023-03-28 13:35:50,011][0m Trial 20 finished with value: 9.725623010642035 and parameters: {'n_estimators': 150, 'max_depth': 7, 'learning_rate': 0.055900602068540854, 'gamma': 4.495758120366551, 'alpha': 9.919496525886373, 'lambda': 6.386180593476539, 'subsample': 0.7028543788055065, 'colsample_bytree': 0.7661231507894555, 'min_child_weight': 7}. Best is trial 18 with value: 9.714091899446114.[0m
[32m[I 2023-03-28 13:36:28,313][0m Trial 21 finished with value: 9.803817106872682 and parameters: {'n_estimators': 150, 'max_depth': 8, 'learning_rate': 0.06115140952553455, 'gamma': 4.5560237809755435, 'alpha': 8.197573388375748, 'lambda': 5.878811424528164, 'subsample': 0.7068520793311688, 'colsample_bytree': 0.7700344577824996, 'min_child_weight': 7}. Best is trial 18 with value: 9.714091899446114.[0m
[32m[I 2023-03-28 13:37:12,779][0m Trial 22 finished with value: 9.721533374218687 and parameters: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.04873006052684402,

[32m[I 2023-03-28 13:54:55,261][0m Trial 40 finished with value: 9.685373832449514 and parameters: {'n_estimators': 250, 'max_depth': 7, 'learning_rate': 0.017523641311981704, 'gamma': 1.81148021049186, 'alpha': 9.33624532577643, 'lambda': 3.6118675741908293, 'subsample': 0.9414934682475861, 'colsample_bytree': 0.5706328561667104, 'min_child_weight': 9}. Best is trial 32 with value: 9.667613696739684.[0m
[32m[I 2023-03-28 13:55:54,061][0m Trial 41 finished with value: 9.707128798395868 and parameters: {'n_estimators': 250, 'max_depth': 7, 'learning_rate': 0.015153677001493828, 'gamma': 1.7948175519769003, 'alpha': 9.400931454919473, 'lambda': 3.384054171371988, 'subsample': 0.9441725888023299, 'colsample_bytree': 0.5851902640042984, 'min_child_weight': 9}. Best is trial 32 with value: 9.667613696739684.[0m
[32m[I 2023-03-28 13:56:59,467][0m Trial 42 finished with value: 9.713247829851605 and parameters: {'n_estimators': 250, 'max_depth': 8, 'learning_rate': 0.019529310488428424

[32m[I 2023-03-28 14:19:28,717][0m Trial 60 finished with value: 9.654789075336966 and parameters: {'n_estimators': 350, 'max_depth': 6, 'learning_rate': 0.01391074733064928, 'gamma': 2.791459402390517, 'alpha': 6.508246760578731, 'lambda': 3.868154113224536, 'subsample': 0.7686405634445621, 'colsample_bytree': 0.7222765806858807, 'min_child_weight': 7}. Best is trial 57 with value: 9.654057655233215.[0m
[32m[I 2023-03-28 14:20:41,704][0m Trial 61 finished with value: 10.049369390887632 and parameters: {'n_estimators': 350, 'max_depth': 6, 'learning_rate': 0.007448566887866156, 'gamma': 2.7932553271674325, 'alpha': 5.725369875198998, 'lambda': 3.9432086936746664, 'subsample': 0.7580247767450796, 'colsample_bytree': 0.7232150562482328, 'min_child_weight': 7}. Best is trial 57 with value: 9.654057655233215.[0m
[32m[I 2023-03-28 14:21:54,932][0m Trial 62 finished with value: 9.687437153248784 and parameters: {'n_estimators': 350, 'max_depth': 6, 'learning_rate': 0.0134149065324480

[32m[I 2023-03-28 14:43:51,536][0m Trial 80 finished with value: 9.648169185017691 and parameters: {'n_estimators': 450, 'max_depth': 5, 'learning_rate': 0.01639159204094288, 'gamma': 4.602027167366717, 'alpha': 7.896655772987791, 'lambda': 2.8215722287597744, 'subsample': 0.7881763097864617, 'colsample_bytree': 0.6375971701389493, 'min_child_weight': 4}. Best is trial 77 with value: 9.620122131833506.[0m
[32m[I 2023-03-28 14:45:05,739][0m Trial 81 finished with value: 9.627631339004262 and parameters: {'n_estimators': 450, 'max_depth': 5, 'learning_rate': 0.017904914196997312, 'gamma': 4.552187741225728, 'alpha': 7.951570521780045, 'lambda': 2.9649863326186043, 'subsample': 0.7879717445759209, 'colsample_bytree': 0.634271599384877, 'min_child_weight': 2}. Best is trial 77 with value: 9.620122131833506.[0m
[32m[I 2023-03-28 14:46:17,973][0m Trial 82 finished with value: 9.628215941600851 and parameters: {'n_estimators': 450, 'max_depth': 5, 'learning_rate': 0.01764856577731028,