In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append(os.path.abspath('../src'))

In [2]:
import optuna
from optimize_params import xgb_objective, lgbm_objective, cat_objective

In [3]:
from data_loader import DiabetesLoader
from preprocessing import get_preprocessor, add_important_interaction
from config import OPTIMAL_COLS

loader = DiabetesLoader('../data/raw/train.csv')
X_train, y_train, X_val, y_val = loader.get_data()
pipeline = get_preprocessor()
X_train_processed = pipeline.fit_transform(X_train)
important_cols = ['family_history_diabetes_1', 'Relative_Activity', 'Age_BMI_Risk', 'Chronic_Metabolic_Load', 'physical_activity_minutes_per_week', 'Diet_Activity_Score', 'age', 'Age_WHR_Risk', 'Lipid_Accumulation', 'triglycerides']
X_train_full = add_important_interaction(X_train_processed, important_cols)
X_train_optimal = X_train_full[OPTIMAL_COLS]
print(f"optimal X: {X_train_optimal.shape}")

Loading data from /app/data/raw/train.csv
Data Loaded. Shape: (700000, 25)
Train X: (560000, 24)  Train y: (560000,)
Val X:   (140000, 24)    Val y:   (140000,)
combining 9 cols
added 36 cols as interactions
optimal X: (560000, 20)


In [17]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: xgb_objective(trial, ratio=0.6049, X=X_train_optimal, y=y_train), n_trials=30)

[I 2025-12-24 17:14:03,185] A new study created in memory with name: no-name-d5df61e3-b3ce-4edb-a115-0dd12ddede7a
[I 2025-12-24 17:14:21,891] Trial 0 finished with value: 0.7132098624655804 and parameters: {'xgb_n_estimators': 485, 'xgb_max_depth': 3, 'xgb_lr': 0.03307690878597566, 'xgb_min_child': 3, 'xgb_gamma': 0.47374708997630516, 'xgb_subsample': 0.6558152655441026}. Best is trial 0 with value: 0.7132098624655804.
[I 2025-12-24 17:14:37,249] Trial 1 finished with value: 0.7147269641938628 and parameters: {'xgb_n_estimators': 344, 'xgb_max_depth': 4, 'xgb_lr': 0.03487598987371712, 'xgb_min_child': 3, 'xgb_gamma': 0.16230735027018955, 'xgb_subsample': 0.6814755569049262}. Best is trial 1 with value: 0.7147269641938628.
[I 2025-12-24 17:14:50,002] Trial 2 finished with value: 0.7025479126984795 and parameters: {'xgb_n_estimators': 359, 'xgb_max_depth': 4, 'xgb_lr': 0.01193353986403808, 'xgb_min_child': 4, 'xgb_gamma': 0.008233596265394327, 'xgb_subsample': 0.7567139354545847}. Best i

In [18]:
print("Best xgb Found:")
print(study.best_params)

Best xgb Found:
{'xgb_n_estimators': 546, 'xgb_max_depth': 4, 'xgb_lr': 0.09522141307403631, 'xgb_min_child': 5, 'xgb_gamma': 0.2829267954406464, 'xgb_subsample': 0.8714391222089948}


In [4]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: lgbm_objective(trial, ratio=0.6049, X=X_train_optimal, y=y_train), n_trials=30)

[I 2025-12-25 03:56:36,319] A new study created in memory with name: no-name-8654d4d9-d4fd-4d7b-a18d-e53312e0c7b0
[I 2025-12-25 03:56:53,238] Trial 0 finished with value: 0.6944387309746943 and parameters: {'lgbm_n_estimators': 442, 'lgbm_lr': 0.012437625215308933, 'lgbm_depth': 12, 'lgbm_num_leaves': 1640, 'lgbm_leaf_size': 44714}. Best is trial 0 with value: 0.6944387309746943.
[I 2025-12-25 03:57:03,947] Trial 1 finished with value: 0.6510846269196988 and parameters: {'lgbm_n_estimators': 387, 'lgbm_lr': 0.06487972465079908, 'lgbm_depth': 9, 'lgbm_num_leaves': 680, 'lgbm_leaf_size': 96694}. Best is trial 0 with value: 0.6944387309746943.
[I 2025-12-25 03:57:16,556] Trial 2 finished with value: 0.7187629520975886 and parameters: {'lgbm_n_estimators': 214, 'lgbm_lr': 0.08055327033889308, 'lgbm_depth': 8, 'lgbm_num_leaves': 1694, 'lgbm_leaf_size': 4076}. Best is trial 2 with value: 0.7187629520975886.
[I 2025-12-25 03:57:35,317] Trial 3 finished with value: 0.6528057749947027 and param

In [5]:
print("Best lgbm Found:")
print(study.best_params)

Best lgbm Found:
{'lgbm_n_estimators': 241, 'lgbm_lr': 0.05186210130983924, 'lgbm_depth': 10, 'lgbm_num_leaves': 925, 'lgbm_leaf_size': 650}


In [4]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: cat_objective(trial, ratio=0.6049, X=X_train_optimal, y=y_train), n_trials=30)

[I 2025-12-25 07:39:43,551] A new study created in memory with name: no-name-715ce6aa-ce42-47b0-bad3-e2c8c41e2a29
[I 2025-12-25 07:40:38,031] Trial 0 finished with value: 0.7059356508768407 and parameters: {'iterations': 451, 'depth': 5, 'learning_rate': 0.018538132998428825, 'l2_leaf_reg': 5, 'subsample': 0.5488760001735922}. Best is trial 0 with value: 0.7059356508768407.
[I 2025-12-25 07:41:58,892] Trial 1 finished with value: 0.722160041012919 and parameters: {'iterations': 595, 'depth': 7, 'learning_rate': 0.06820964361162837, 'l2_leaf_reg': 6, 'subsample': 0.7599214009138101}. Best is trial 1 with value: 0.722160041012919.
[I 2025-12-25 07:45:04,529] Trial 2 finished with value: 0.7211389374495528 and parameters: {'iterations': 633, 'depth': 10, 'learning_rate': 0.04632039989719817, 'l2_leaf_reg': 9, 'subsample': 0.7094148753173131}. Best is trial 1 with value: 0.722160041012919.
[I 2025-12-25 07:47:02,690] Trial 3 finished with value: 0.7215154735729105 and parameters: {'iterati

In [5]:
print("Best lgbm Found:")
print(study.best_params)

Best lgbm Found:
{'iterations': 762, 'depth': 6, 'learning_rate': 0.07494446029772849, 'l2_leaf_reg': 10, 'subsample': 0.8406007197850853}
