In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib_fontja
from autofeat import AutoFeatClassifier
from openfe import OpenFE, transform
os.chdir('../')
from src.feature import *
from src.model import *

  from .autonotebook import tqdm as notebook_tqdm


データ準備

In [2]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submit = pd.read_csv("data/sample_submit.csv", header=None)

In [34]:
# Xとyの分割
target_col = '購入フラグ'
X_train_df = train.drop(columns=[target_col])
y_train_df = train[target_col]
X_test_df = test.copy()

特徴量エンジニアリング

In [35]:
# 学習・テストデータの結合
X_train_df.index = "train_" + X_train_df.index.astype(str)
X_test_df.index = "test_" + X_test_df.index.astype(str)
X_df = pd.concat([X_train_df, X_test_df], axis=0)

# 不要な列の削除
drop_cols = ['企業ID', '企業名']
X_df = X_df.drop(columns=drop_cols)

# 欠損値の補完
X_df = impute_missing_values(X_df)

# 指標の組み合わせによる新規指標の作成
#X_df = feature_engineering(X_df)

# カテゴリ変数を数値にエンコード
categorical_cols = ['業界', '上場種別', '特徴']
X_df = onehot_encode_categorical(X_df, categorical_cols)

# ２択（はい/いいえ）変数のエンコード
binary_cols = ['アンケート６']
X_df = encode_binary(X_df, binary_cols, true=1)

# アンケートデータの特徴量化
X_df = survey_features(X_df, None)

# 組織図の特徴量化
# 特定部門の有無
org_df = org_chart_features(X_df['組織図'])
# Transformer+PCA
embedding_df = pd.read_csv("data/組織図_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'組織図_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, org_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['組織図'])

# 今後のDX展望テキストデータの特徴量化
# 文章量
length_df = text_length_features(X_df['今後のDX展望'])
# Transformer+PCA
embedding_df = pd.read_csv("data/今後のDX展望_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'今後のDX展望_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, length_df, transform_pca_df], axis=1)
X_df = pd.concat([X_df, length_df], axis=1)
X_df = X_df.drop(columns=['今後のDX展望'])

# 企業概要テキストデータの特徴量化
# Transformer+PCA
embedding_df = pd.read_csv("data/企業概要_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'企業概要_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['企業概要'])

# 学習・テストデータの再分割
X_train_df = X_df.loc[X_train_df.index].reset_index(drop=True)
X_test_df = X_df.loc[X_test_df.index].reset_index(drop=True)

In [38]:
from sklearn.impute import SimpleImputer
X_train_df_base = X_train_df.iloc[:, :34]

# 1. Imputerの定義
# strategyは 'mean' (平均), 'median' (中央値), 'constant' (固定値, fill_value=0など) から選択
imputer = SimpleImputer(strategy='median')

# 2. Imputerの実行
X_train_df_base_imp = imputer.fit_transform(X_train_df_base)

# 3. 正しいカラム名でDataFrameを再構築
X_train_df_base_imp = pd.DataFrame(
    X_train_df_base_imp,
    columns=imputer.get_feature_names_out(),  # ★ 修正点：imputerが保持している列名を使う
    index=X_train_df_base.index
)

auto_feat = AutoFeatClassifier(verbose=1)
X_train_df_base_imp_transformed = auto_feat.fit_transform(X_train_df_base_imp, y_train_df)

2025-11-13 10:35:00,822 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 28441 features.
2025-11-13 10:35:00,823 INFO: [AutoFeat] With 742 data points this new feature matrix would use about 0.08 gb of space.
2025-11-13 10:35:00,826 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             34 features transformed

2025-11-13 10:35:02,338 INFO: [feateng] Generated 119 transformed features from 34 original features - done.
2025-11-13 10:35:02,340 INFO: [feateng] Step 2: first combination of features


[feateng]             800/          11628 feature tuples combined

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[feateng]           10700/          11628 feature tuples combined

2025-11-13 10:35:05,432 INFO: [feateng] Generated 11489 feature combinations from 11628 original feature tuples - done.


[feateng]           11600/          11628 feature tuples combined

2025-11-13 10:35:05,458 INFO: [feateng] Generated altogether 11690 new features in 2 steps
2025-11-13 10:35:05,459 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-11-13 10:35:05,675 INFO: [feateng] Generated a total of 8758 additional features
2025-11-13 10:35:05,746 INFO: [featsel] Feature selection run 1/5


[featsel] Scaling data...done.


2025-11-13 10:42:53,865 INFO: [featsel] Feature selection run 2/5
2025-11-13 10:51:49,826 INFO: [featsel] Feature selection run 3/5
2025-11-13 11:02:05,734 INFO: [featsel] Feature selection run 4/5
2025-11-13 11:33:01,481 INFO: [featsel] Feature selection run 5/5
2025-11-13 11:39:53,203 INFO: [featsel] 36 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-11-13 11:39:53,249 INFO: [featsel] 22 features after correlation filtering
2025-11-13 11:39:54,578 INFO: [featsel] 11 features after noise filtering
2025-11-13 11:39:54,589 INFO: [AutoFeat] Computing 11 new features.


[AutoFeat]     9/   11 new features

2025-11-13 11:39:57,286 INFO: [AutoFeat]    11/   11 new features ...done.


[AutoFeat]    10/   11 new features

2025-11-13 11:39:57,300 INFO: [AutoFeat] Final dataframe with 45 feature columns (11 new).
2025-11-13 11:39:57,302 INFO: [AutoFeat] Training final classification model.
2025-11-13 11:39:59,050 INFO: [AutoFeat] Trained model: largest coefficients:
2025-11-13 11:39:59,053 INFO: [1.10825075]
2025-11-13 11:39:59,057 INFO: 2.985120 * 営業CF/総資産
2025-11-13 11:39:59,057 INFO: 2.630542 * 長期借入金/流動資産
2025-11-13 11:39:59,058 INFO: 2.233158 * 営業利益/総資産
2025-11-13 11:39:59,058 INFO: 0.731469 * sqrt(アンケート１０)*sqrt(アンケート７)
2025-11-13 11:39:59,059 INFO: 0.509369 * 無形固定資産変動ソフトウェア関連/売上
2025-11-13 11:39:59,059 INFO: 0.449800 * 純資産/売上
2025-11-13 11:39:59,060 INFO: 0.375087 * sqrt(店舗数)/事業所数
2025-11-13 11:39:59,060 INFO: 0.039329 * アンケート７/従業員数
2025-11-13 11:39:59,061 INFO: 0.027523 * アンケート５*log(減価償却費)
2025-11-13 11:39:59,062 INFO: 0.002454 * アンケート４*exp(アンケート８)
2025-11-13 11:39:59,069 INFO: [AutoFeat] Final score: 0.7022


In [39]:
list(X_train_df_base_imp_transformed.columns)

['従業員数',
 '事業所数',
 '工場数',
 '店舗数',
 '資本金',
 '総資産',
 '流動資産',
 '固定資産',
 '負債',
 '短期借入金',
 '長期借入金',
 '純資産',
 '自己資本',
 '売上',
 '営業利益',
 '経常利益',
 '当期純利益',
 '営業CF',
 '減価償却費',
 '運転資本変動',
 '投資CF',
 '有形固定資産変動',
 '無形固定資産変動(ソフトウェア関連)',
 'アンケート１',
 'アンケート２',
 'アンケート３',
 'アンケート４',
 'アンケート５',
 'アンケート６',
 'アンケート７',
 'アンケート８',
 'アンケート９',
 'アンケート１０',
 'アンケート１１',
 '無形固定資産変動ソフトウェア関連/売上',
 'アンケート７/従業員数',
 'sqrt(店舗数)/事業所数',
 '純資産/売上',
 '営業利益/総資産',
 '営業CF/総資産',
 '長期借入金/流動資産',
 '営業利益*log(アンケート１１)',
 'アンケート５*log(減価償却費)',
 'アンケート４*exp(アンケート８)',
 'sqrt(アンケート１０)*sqrt(アンケート７)']

In [29]:
# 1. OpenFEの定義
# importance_score='auto': 特徴量重要度の計算方法（通常はそのままでOK）
# n_jobs: 並列処理数（CPUコア数に合わせる）
ofe = OpenFE()

# 2. 特徴量の探索と選別 (fit)
# 注意: ここで内部的にGBDTを回して「本当に使える特徴量」を探します。
# 数分〜数十分かかることがあります。
features = ofe.fit(
    data=X_train_df.iloc[:, :23],  
    label=y_train_df,      # 目的変数
    n_jobs=4,              # 並列数
    task='classification' # 自動判定されますが、明示してもOK
)

print(f"発見された有効な特徴量数: {len(features)}")

The number of candidate features is 2407
Start stage I selection.


100%|██████████| 16/16 [00:13<00:00,  1.16it/s]


943 same features have been deleted.
Meet early-stopping in successive feature-wise halving.


100%|██████████| 16/16 [00:11<00:00,  1.41it/s]


The number of remaining candidate features is 1254
Start stage II selection.


100%|██████████| 16/16 [00:08<00:00,  1.92it/s]


Finish data processing.
発見された有効な特徴量数: 1254


In [31]:
features[0]

<openfe.FeatureGenerator.Node at 0x24a73187140>

In [33]:
# 3. データの変換 (transform)
# 訓練データとテストデータに、発見された特徴量を追加します
X_train_df_ofe, X_test_df_ofe = transform(
    X_train_df.iloc[:, :23], 
    X_test_df.iloc[:, :23], # テストデータも同様にImputeしておく必要があります
    features, 
    n_jobs=4
)

SystemExit: None

In [9]:
X_train_df_ofe

NameError: name 'X_train_df_ofe' is not defined

In [None]:
def remove_correlated_features(df, threshold=0.95):
    """
    相関係数が threshold 以上の特徴量のうち、片方を削除する関数
    """
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # 相関が閾値を超えているカラムを特定
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    
    print(f"削除対象の重複特徴量数: {len(to_drop)}")
    return df.drop(columns=to_drop), to_drop

# 実行（計算に少し時間がかかります）
# X_train_ofe は OpenFEで変換済みのデータ
print("相関チェック中...")
X_train_filtered, dropped_cols = remove_correlated_features(X_train_ofe, threshold=0.95)

# テストデータからも同じ列を削除
X_test_filtered = X_test_ofe.drop(columns=dropped_cols)

[<openfe.FeatureGenerator.Node at 0x1d38948edb0>,
 <openfe.FeatureGenerator.Node at 0x1d38adfc830>,
 <openfe.FeatureGenerator.Node at 0x1d3807f8ef0>,
 <openfe.FeatureGenerator.Node at 0x1d38af9c140>,
 <openfe.FeatureGenerator.Node at 0x1d387c10ce0>,
 <openfe.FeatureGenerator.Node at 0x1d386bcbe00>,
 <openfe.FeatureGenerator.Node at 0x1d38003d070>,
 <openfe.FeatureGenerator.Node at 0x1d3882b2f90>,
 <openfe.FeatureGenerator.Node at 0x1d386bcbbc0>,
 <openfe.FeatureGenerator.Node at 0x1d3fe8b76b0>,
 <openfe.FeatureGenerator.Node at 0x1d3861cbe00>,
 <openfe.FeatureGenerator.Node at 0x1d38003e5a0>,
 <openfe.FeatureGenerator.Node at 0x1d3861cacf0>,
 <openfe.FeatureGenerator.Node at 0x1d386bb48c0>,
 <openfe.FeatureGenerator.Node at 0x1d3fe8bab70>,
 <openfe.FeatureGenerator.Node at 0x1d3fc5abfe0>,
 <openfe.FeatureGenerator.Node at 0x1d3813ed1c0>,
 <openfe.FeatureGenerator.Node at 0x1d387b7b0b0>,
 <openfe.FeatureGenerator.Node at 0x1d386b99d00>,
 <openfe.FeatureGenerator.Node at 0x1d3869f5e80>,


In [5]:
list(X_df.columns)

['従業員数',
 '事業所数',
 '工場数',
 '店舗数',
 '資本金',
 '総資産',
 '流動資産',
 '固定資産',
 '負債',
 '短期借入金',
 '長期借入金',
 '純資産',
 '自己資本',
 '売上',
 '営業利益',
 '経常利益',
 '当期純利益',
 '営業CF',
 '減価償却費',
 '運転資本変動',
 '投資CF',
 '有形固定資産変動',
 '無形固定資産変動(ソフトウェア関連)',
 'アンケート１',
 'アンケート２',
 'アンケート３',
 'アンケート４',
 'アンケート５',
 'アンケート６',
 'アンケート７',
 'アンケート８',
 'アンケート９',
 'アンケート１０',
 'アンケート１１',
 '営業利益率',
 '総資産回転率',
 'ROA',
 '従業員一人当たり営業利益',
 '流動比率',
 '無形固定資産変動率',
 '営業CFマージン',
 '投資効率対CF',
 '自己資本比率',
 '負債比率',
 '規模収益性',
 '従業員一人当たり売上',
 '負債純資産倍率',
 '投資効率対利益',
 '新規投資度合い',
 '業界_IT',
 '業界_その他',
 '業界_その他サービス',
 '業界_アパレル・美容',
 '業界_エネルギー',
 '業界_エンタメ',
 '業界_ゲーム',
 '業界_コンサルティング',
 '業界_マスコミ',
 '業界_不動産',
 '業界_人材',
 '業界_化学',
 '業界_医療・福祉',
 '業界_商社',
 '業界_外食',
 '業界_専門サービス',
 '業界_小売',
 '業界_広告',
 '業界_建設・工事',
 '業界_教育',
 '業界_機械',
 '業界_機械関連サービス',
 '業界_生活用品',
 '業界_自動車・乗り物',
 '業界_製造',
 '業界_通信',
 '業界_通信機器',
 '業界_運輸・物流',
 '業界_金融',
 '業界_電気製品',
 '業界_食品',
 '上場種別_GR',
 '上場種別_PR',
 '上場種別_ST',
 '特徴_BtoB',
 '特徴_BtoC',
 '特徴_CtoC',
 'アンケート_pos_mean',
 'アンケート_neg_mean',
 'アン

予測モデルのパラメータ最適化

In [26]:
# 基本パラメータ
lgbm_params_base = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 1000,
    'is_unbalance': True,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
    }

# 探索するパラメータを定義する「関数」
def define_lgbm_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# LightGBMモデルの最適化実行
lgbm_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="lightgbm",
    params_base=lgbm_params_base,
    define_params_func=define_lgbm_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-13 09:51:40,559] A new study created in memory with name: no-name-ad283421-08b8-4291-9a44-75b49dc3e047
[W 2025-11-13 09:51:40,601] Trial 0 failed with parameters: {'learning_rate': 0.07246912952833807, 'num_leaves': 37, 'max_depth': 6, 'min_child_samples': 21, 'subsample': 0.9008025363481637, 'colsample_bytree': 0.9230975973699538, 'reg_alpha': 3.6286083614056945, 'reg_lambda': 1.2557811308578888} because of the following error: LightGBMError('Feature (今後のDX展望_length) appears more than one time.').
Traceback (most recent call last):
  File "c:\Users\1631278\project\manufacturing-dx-purchase-prediction\venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "c:\Users\1631278\project\manufacturing-dx-purchase-prediction\src\model.py", line 67, in objective
    model.fit(X_train_fold, y_train_fold,
  File "c:\Users\1631278\project\manufacturing-dx-purchase-prediction\venv\Lib\site-pack

--- Optimizing lightgbm ---


LightGBMError: Feature (今後のDX展望_length) appears more than one time.

In [28]:
# 基本パラメータ
xgb_params_base = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',       
    'n_estimators': 1000,
    'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 'is_unbalance' の代わり
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,                 # 'verbose': -1 -> 'verbosity': 0
    'booster': 'gbtree'             # デフォルトだが明記
    }

# 探索するパラメータを定義する「関数」
def define_xgb_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# XGBoostモデルの最適化実行
xgb_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="xgboost",
    params_base=xgb_params_base,
    define_params_func=define_xgb_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-13 09:52:38,086] A new study created in memory with name: no-name-18118595-ec00-4563-a635-8c71a79342b0
[W 2025-11-13 09:52:38,150] Trial 0 failed with parameters: {'learning_rate': 0.03351417499891992, 'max_depth': 11, 'min_child_weight': 6, 'subsample': 0.7651290060007709, 'colsample_bytree': 0.9267120428066175, 'reg_alpha': 7.285702262667401e-05, 'reg_lambda': 3.2167055395128123e-06} because of the following error: AttributeError("'DataFrame' object has no attribute 'dtype'").
Traceback (most recent call last):
  File "c:\Users\1631278\project\manufacturing-dx-purchase-prediction\venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "c:\Users\1631278\project\manufacturing-dx-purchase-prediction\src\model.py", line 72, in objective
    model.fit(X_train_fold, y_train_fold,
  File "c:\Users\1631278\project\manufacturing-dx-purchase-prediction\venv\Lib\site-packages\xgboost\core.py

--- Optimizing xgboost ---


AttributeError: 'DataFrame' object has no attribute 'dtype'

In [13]:
# 基本パラメータ
cat_params_base = {
    'objective': 'Logloss', # CatBoostでは'Logloss'を指定
        'iterations': 1000,
        'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 不均衡データへの対処
        'random_seed': 42,
        'verbose': 0, # 学習ログを非表示
        'early_stopping_rounds': 10 # 早期停止
    }

# 探索するパラメータを定義する「関数」
def define_cat_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True), # L2正則化
    }

# CatBoostモデルの最適化実行
cat_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="catboost",
    params_base=cat_params_base,
    define_params_func=define_cat_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-09 03:56:18,330] A new study created in memory with name: no-name-c884303b-4f8e-4469-bc8b-6d317d928e0e


--- Optimizing catboost ---


[I 2025-11-09 03:56:23,427] Trial 0 finished with value: 0.6691326390193811 and parameters: {'learning_rate': 0.01054795785623793, 'depth': 6, 'min_data_in_leaf': 10, 'subsample': 0.7436630566839998, 'l2_leaf_reg': 0.01833593206035085}. Best is trial 0 with value: 0.6691326390193811.
[I 2025-11-09 03:56:25,494] Trial 1 finished with value: 0.6699714402284782 and parameters: {'learning_rate': 0.019843299730670023, 'depth': 4, 'min_data_in_leaf': 66, 'subsample': 0.7905692553011006, 'l2_leaf_reg': 7.72263637152063e-08}. Best is trial 1 with value: 0.6699714402284782.
[I 2025-11-09 03:56:27,576] Trial 2 finished with value: 0.7279202279202279 and parameters: {'learning_rate': 0.06322721567411647, 'depth': 3, 'min_data_in_leaf': 82, 'subsample': 0.9973031418816185, 'l2_leaf_reg': 0.27159707655587373}. Best is trial 2 with value: 0.7279202279202279.
[I 2025-11-09 03:56:30,790] Trial 3 finished with value: 0.6703269170368376 and parameters: {'learning_rate': 0.010495922769357959, 'depth': 5,


--- Optimization Finished ---
Best trial for catboost:
  Value (Best F1 Score): 0.73141
  Best Params:
    learning_rate: 0.03155510447217814
    depth: 4
    min_data_in_leaf: 16
    subsample: 0.7133641906582164
    l2_leaf_reg: 1.63545550895305

--- Confusion Matrix (Best Trial at Threshold: 0.4900) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 503          | 60           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 41           | 138          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.69697
  Recall:    0.77095
  F1 Score:  0.73210 (F1 score on total OOF predictions)
  (Note: 'Best F1 Score' above is the CV mean optimized by Optuna.)

-

In [14]:
# アンサンブルモデルの学習
ensemble_result = train_ensemble_models(
    X_train_df=X_train_df,
    y_train_df=y_train_df,
    X_test_df=X_test_df,
    lgb_best_params = lgbm_results['best_params'],
    xgb_best_params = xgb_results['best_params'],
    cat_best_params = cat_results['best_params'],
    sample_submit=sample_submit,
    n_folds=5,
    early_stopping_rounds=10,
    thresholds=np.arange(0.1, 0.5, 0.01),
    random_state=42
)

--- Start Ensemble Training ---
Fold 1/5 started...
Fold 2/5 started...
Fold 3/5 started...
Fold 4/5 started...
Fold 5/5 started...
--- Ensemble Training Finished ---
Optimal Weights - LGB: 0.3333, XGB: 0.3333, CAT: 0.3333

--- Evaluation ---
Ensemble Best F1: 0.73737 (Threshold: 0.44)
LightGBM: 0.72165 (Threshold: 0.46)
XGBoost:  0.73600 (Threshold: 0.47)
CatBoost: 0.73210 (Threshold: 0.49)

--- Fold-wise Stability Evaluation (using global thresholds) ---
  Fold 1 F1 -> Ensemble: 0.78049 | LGB: 0.75000 | XGB: 0.77500 | CAT: 0.77778
  Fold 2 F1 -> Ensemble: 0.77500 | LGB: 0.61538 | XGB: 0.72222 | CAT: 0.77922
  Fold 3 F1 -> Ensemble: 0.75325 | LGB: 0.76923 | XGB: 0.72727 | CAT: 0.72000
  Fold 4 F1 -> Ensemble: 0.72941 | LGB: 0.75610 | XGB: 0.74074 | CAT: 0.72289
  Fold 5 F1 -> Ensemble: 0.63889 | LGB: 0.71429 | XGB: 0.70769 | CAT: 0.65714
Ensemble Fold F1 Mean: 0.73541, Std: 0.05152
