In [None]:
!pip install lightgbm optuna
!pip install optuna-integration[lightgbm]
import json
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from optuna.integration import LightGBMPruningCallback
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore", message=".*step.*is already reported.*")



In [None]:
def _take_rows(data, idx):
    """Return the rows of *data* at the integer positions *idx*."""
    # pandas objects must go through .iloc: plain [] on a Series is
    # label-based and selects the wrong rows (or raises KeyError) when the
    # index is not the default 0..n-1 range.
    if hasattr(data, 'iloc'):
        return data.iloc[idx]
    return data[idx]


def _row_count(data):
    """Return the number of rows of *data*, preferring ``shape[0]`` over ``len``."""
    shape = getattr(data, 'shape', None)
    if shape is not None and len(shape) > 0:
        return shape[0]
    return len(data)


def parameter_optimization(
    X,
    y,
    save_path: str,
    n_trials: int = 100,
    n_splits: int = 5,
    random_state: int = 2025,
    max_boost_round: int = 500,
    timeout: float = None,
    categorical_features: list = None
):
    """Tune LightGBM regression hyperparameters with Optuna and save the best set.

    Each trial samples a parameter set, trains one LightGBM model per K-fold
    split (with early stopping on the held-out fold) and is scored by the mean
    validation RMSE across folds. The best parameters found by the study are
    written to ``save_path`` as JSON.

    Args:
        X: Feature matrix. Objects exposing ``.iloc`` are sliced positionally
            through it; anything else is sliced with ``X[idx]``.
        y: Target values, sliced the same way as ``X`` but independently of
            ``X``'s type, so pandas and array inputs can be mixed.
        save_path: Path of the JSON file receiving the best parameters.
        n_trials: Number of Optuna trials passed to ``study.optimize``.
        n_splits: Number of folds for the shuffled ``KFold``.
        random_state: Seed used for the fold shuffle, the TPE sampler and the
            LightGBM ``random_state`` parameter.
        max_boost_round: Upper bound on boosting rounds per fold.
        timeout: Optional time limit passed to ``study.optimize``.
        categorical_features: Forwarded to ``lgb.Dataset`` only when ``X``
            exposes ``.iloc``; otherwise ``None`` is passed instead.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """
    n_rows_x, n_rows_y = _row_count(X), _row_count(y)
    if n_rows_x != n_rows_y:
        # Without this check a longer y would be silently truncated to the
        # positions produced by splitting X.
        raise ValueError(
            f"X and y must have the same number of rows, "
            f"got {n_rows_x} and {n_rows_y}"
        )

    # Loop-invariant: the categorical spec only applies to pandas-like input.
    cat_features = categorical_features if hasattr(X, 'iloc') else None

    def objective(trial):
        # NOTE: the order of the suggest_* calls is kept stable so that a
        # seeded sampler yields the same sequence of trials.
        param = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'verbosity': -1,
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 500),
            'random_state': random_state
        }

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        rmses = []

        for train_idx, valid_idx in kf.split(X):
            X_tr, X_va = _take_rows(X, train_idx), _take_rows(X, valid_idx)
            y_tr, y_va = _take_rows(y, train_idx), _take_rows(y, valid_idx)

            dtrain = lgb.Dataset(X_tr, y_tr, categorical_feature=cat_features)
            # reference=dtrain makes the validation set reuse the training
            # set's dataset construction.
            dvalid = lgb.Dataset(
                X_va, y_va,
                reference=dtrain,
                categorical_feature=cat_features
            )

            # NOTE(review): every fold passes the same trial to the pruning
            # callback; presumably later folds re-report step numbers the
            # first fold already used -- confirm how Optuna treats those.
            gbm = lgb.train(
                param,
                dtrain,
                num_boost_round=max_boost_round,
                valid_sets=[dvalid],
                valid_names=['valid'],
                callbacks=[
                    lgb.log_evaluation(period=0),
                    LightGBMPruningCallback(trial, 'rmse', valid_name='valid'),
                    lgb.early_stopping(stopping_rounds=50)
                ]
            )
            # Score the fold at the early-stopped iteration, not the last one.
            preds = gbm.predict(X_va, num_iteration=gbm.best_iteration)
            rmses.append(np.sqrt(mean_squared_error(y_va, preds)))

        return float(np.mean(rmses))

    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(
        direction='minimize', sampler=sampler,
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10,
                                           n_warmup_steps=50),
        study_name='lgbm_regression_opt'
    )

    study.optimize(objective, n_trials=n_trials, timeout=timeout)

    best_params = study.best_params
    # Best mean cross-validated RMSE; not used in the visible part of this
    # function.
    best_rmse = study.best_value

    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(best_params, f, ensure_ascii=False, indent=4)
