In [1]:
import warnings
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor
from tqdm.auto import tqdm

import optuna

# Apply the seaborn theme first so that the rcParams overrides further down
# are not overwritten by it.
sns.set(style="ticks", palette="muted", color_codes=True)

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Bigger axis labels and titles for all figures drawn afterwards.
plt.rcParams.update({
    'axes.labelsize': 15,
    'axes.titlesize': 20,
})

# Fixed random seed made available to the cells below.
seed = 42

In [2]:
# Read the training table from the gzip-named parquet file in the working directory.
train = pd.read_parquet(path="train_temporal_regression.parquet.gz")

In [3]:
# Load the pickled `encoders` object (used below via `fit_transform`).
# The context manager guarantees the file handle is closed once loading is
# done; previously the handle was opened and never closed.
# NOTE(review): pickle.load can execute arbitrary code -- only load
# "encoders.pkl" if it comes from a trusted source.
with open("encoders.pkl", "rb") as unpkl_encoders:
    encoders = pickle.load(unpkl_encoders)

In [4]:
# Mean CAQI_idx of every (voivodship, powiat_voivod) pair.
mean_caqi_by_powiat = train.groupby(["voivodship", "powiat_voivod"])['CAQI_idx'].mean()

# Within each voivodship keep the single powiat with the largest mean.
# After the second groupby the group key is prepended to the index, so the
# powiat_voivod labels sit at index level 2.
selected_powiats = (
    mean_caqi_by_powiat
    .groupby("voivodship")
    .nlargest(1)
    .index
    .get_level_values(2)
)

# Restrict the training data to the rows of those powiats.
df_subset = train.query("powiat_voivod in @selected_powiats")

In [5]:
# Fit the encoders on the selected-powiat subset and transform it.
# (To work on the full training set instead, pass `train` rather than `df_subset`.)
df = encoders.fit_transform(df_subset)

# Split the encoded frame into the target column and the remaining predictors.
y = df["CAQI_idx"]
X = df.drop(columns="CAQI_idx")

In [6]:
# Number of distinct powiat_voivod values in the encoded frame
# (missing values, if any, are counted as one value, as with len(unique())).
LEN = df['powiat_voivod'].nunique(dropna=False)
print(LEN)

16


In [7]:
# 10-fold expanding-window splitter; train-set and test-set sizes are left at
# their defaults. The gap between train and test is 15 rows per distinct powiat.
# NOTE(review): presumably this equals 15 time steps if the frame holds one row
# per powiat per time step, sorted by time -- confirm against the data.
tss = TimeSeriesSplit(n_splits=10, gap=15 * LEN)

In [8]:
# Define the objective function for Optuna
def objective(trial):
    """Score one Optuna trial of a linear-booster XGBoost regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``n_estimators`` (200 to 500 in steps of
        50) and the learning rate (log-uniform between 0.01 and 0.5, recorded
        under the Optuna name ``'learning_rate'`` and passed to XGBoost as
        ``eta``).

    Returns
    -------
    float
        Mean root-mean-squared error over the folds of the module-level
        ``tss`` splitter, evaluated on the module-level ``X`` and ``y``.
        Lower is better.
    """
    # Hyperparameters to optimize. `suggest_float(..., log=True)` replaces the
    # deprecated `suggest_loguniform`, and `step` is passed by keyword because
    # passing it positionally to `suggest_int` is deprecated.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 500, step=50),
        'eta': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'booster': "gblinear",
        'eval_metric': "rmse",
    }

    # XGBoost regressor configured with the sampled hyperparameters.
    model = XGBRegressor(**params)

    # Cross-validate with the time-series splitter; the scorer returns the
    # NEGATIVE root-mean-squared error of each fold.
    scores = cross_val_score(model, X=X, y=y, cv=tss, scoring='neg_root_mean_squared_error')

    # Flip the sign so the study minimises a positive mean RMSE.
    return -scores.mean()

# Create an Optuna study and optimize the objective function.
# The TPE sampler is seeded with the notebook-wide `seed` so that the sequence
# of suggested hyperparameters is reproducible across runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(
        seed=seed,
        warn_independent_sampling=False,
    ),
)

# Run 100 trials; each trial is a full cross-validation (see `objective`).
study.optimize(objective, n_trials=100, show_progress_bar=True)

# Print the best hyperparameters found by Optuna
print()
print('Best hyperparameters: {}'.format(study.best_params))

[32m[I 2023-03-27 22:15:25,103][0m A new study created in memory with name: no-name-4af4558d-5284-4bd8-ae03-a176b00efa09[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2023-03-27 22:15:41,365][0m Trial 0 finished with value: 11.454465752799829 and parameters: {'n_estimators': 500, 'learning_rate': 0.014503729526887586}. Best is trial 0 with value: 11.454465752799829.[0m
[32m[I 2023-03-27 22:15:54,400][0m Trial 1 finished with value: 10.718747138166682 and parameters: {'n_estimators': 300, 'learning_rate': 0.0970859847093839}. Best is trial 1 with value: 10.718747138166682.[0m
[32m[I 2023-03-27 22:16:16,091][0m Trial 2 finished with value: 10.739501932460731 and parameters: {'n_estimators': 500, 'learning_rate': 0.11413649403157394}. Best is trial 1 with value: 10.718747138166682.[0m
[32m[I 2023-03-27 22:16:26,947][0m Trial 3 finished with value: 10.802541438424294 and parameters: {'n_estimators': 300, 'learning_rate': 0.06254992081813109}. Best is trial 1 with value: 10.718747138166682.[0m
[32m[I 2023-03-27 22:16:38,030][0m Trial 4 finished with value: 10.733071322420178 and parameters: {'n_estimators': 300, 'learning_rate': 0.30

[32m[I 2023-03-27 22:24:03,870][0m Trial 39 finished with value: 10.727600010894355 and parameters: {'n_estimators': 250, 'learning_rate': 0.31124251122410157}. Best is trial 31 with value: 10.685227263801762.[0m
[32m[I 2023-03-27 22:24:11,649][0m Trial 40 finished with value: 10.722027122740842 and parameters: {'n_estimators': 200, 'learning_rate': 0.12664551913213418}. Best is trial 31 with value: 10.685227263801762.[0m
[32m[I 2023-03-27 22:24:19,257][0m Trial 41 finished with value: 10.706010266273971 and parameters: {'n_estimators': 200, 'learning_rate': 0.3998759489426337}. Best is trial 31 with value: 10.685227263801762.[0m
[32m[I 2023-03-27 22:24:27,053][0m Trial 42 finished with value: 10.715562034913889 and parameters: {'n_estimators': 200, 'learning_rate': 0.4108777745224247}. Best is trial 31 with value: 10.685227263801762.[0m
[32m[I 2023-03-27 22:24:34,727][0m Trial 43 finished with value: 10.729333656794378 and parameters: {'n_estimators': 200, 'learning_rate

[32m[I 2023-03-27 22:30:49,632][0m Trial 78 finished with value: 10.71956465616486 and parameters: {'n_estimators': 250, 'learning_rate': 0.33497666905828993}. Best is trial 67 with value: 10.674119139725658.[0m
[32m[I 2023-03-27 22:30:57,363][0m Trial 79 finished with value: 10.721941363072613 and parameters: {'n_estimators': 200, 'learning_rate': 0.38115533450480643}. Best is trial 67 with value: 10.674119139725658.[0m
[32m[I 2023-03-27 22:31:05,414][0m Trial 80 finished with value: 10.697336411017412 and parameters: {'n_estimators': 200, 'learning_rate': 0.449619544040034}. Best is trial 67 with value: 10.674119139725658.[0m
[32m[I 2023-03-27 22:31:14,552][0m Trial 81 finished with value: 10.712542133962675 and parameters: {'n_estimators': 200, 'learning_rate': 0.30607071659527596}. Best is trial 67 with value: 10.674119139725658.[0m
[32m[I 2023-03-27 22:31:24,466][0m Trial 82 finished with value: 10.737441833803452 and parameters: {'n_estimators': 200, 'learning_rate'