In [None]:
import json
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
def training(
    X,
    y,
    params_json_path: str,
    test_size: float = 0.1,
    valid_size: float = 0.2,
    random_state: int = 2025,
    categorical_features: list = None,
    save_model_path: str = None,
    save_log_filename: str = "training_log.csv"
):
    """
    Train a LightGBM regressor with early stopping and evaluate it on a hold-out set.

    The data is split twice: first a test set is carved off the full data,
    then the remainder is split into training and validation sets. The
    validation set drives early stopping; the test set is used only for the
    final RMSE. The trained booster is written to ``save_model_path`` and the
    per-iteration validation RMSE is written as CSV into the "log" folder.

    Args:
      X, y                   : Features and target (DataFrame/ndarray).
      params_json_path       : Path to a JSON file with the hyperparameters.
                               Its 'n_estimators' entry, if any, is
                               overwritten with 10000.
      test_size              : Fraction of the full data held out for testing.
      valid_size             : Fraction of the remaining (train+valid) data
                               used for validation.
      random_state           : Seed used for both splits and for the model.
      categorical_features   : Column names of categorical features, or None.
      save_model_path        : Destination path for the trained model (txt).
                               Required despite the None default.
      save_log_filename      : File name of the training log (inside the
                               "log" folder).

    Returns:
      (model, test_rmse): the fitted ``lgb.LGBMRegressor`` and the RMSE of
      its predictions on the test split.

    Raises:
      ValueError: if ``params_json_path`` is None or not an existing file,
                  or if ``save_model_path`` is None.
    """

    # Validate before any side effect (directory creation, training).
    if params_json_path is None or not os.path.isfile(params_json_path):
        raise ValueError(f"params_json_path is not valid: {params_json_path}")
    if save_model_path is None:
        raise ValueError("save_model_path is not valid")

    log_dir = "log"
    os.makedirs(log_dir, exist_ok=True)

    with open(params_json_path, 'r', encoding='utf-8') as f:
        params = json.load(f)

    # Large upper bound on boosting rounds; early stopping below decides
    # where training actually ends.
    params['n_estimators'] = 10000

    # Two-stage split: hold out the test set, then split the rest into
    # train/validation. valid_size is therefore relative to the remainder.
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_trainval, y_trainval, test_size=valid_size, random_state=random_state)

    model = lgb.LGBMRegressor(random_state=random_state, **params)
    fit_kwargs = {
        'eval_set': [(X_valid, y_valid)],
        # The name 'valid' is the key used to read evals_result_ below.
        'eval_names': ['valid'],
        'eval_metric': 'rmse',
        'callbacks': [lgb.early_stopping(stopping_rounds=50),
                      lgb.log_evaluation(period=50)]
    }
    # Only pass categorical_feature when the caller supplied one, so the
    # library default applies otherwise.
    if categorical_features is not None:
        fit_kwargs['categorical_feature'] = categorical_features

    model.fit(X_train, y_train, **fit_kwargs)

    y_pred = model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # dirname is '' for a bare file name; makedirs('') would raise.
    dirpath = os.path.dirname(save_model_path)
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)
    model.booster_.save_model(save_model_path)

    # Persist the validation RMSE curve, one row per boosting iteration.
    log_data = model.evals_result_['valid']['rmse']
    log_df = pd.DataFrame({ 'rmse': log_data })
    log_df.index.name = 'iteration'
    log_path = os.path.join(log_dir, save_log_filename)
    log_df.to_csv(log_path)

    return model, test_rmse