In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import optuna
from catboost import CatBoostClassifier

  from .autonotebook import tqdm as notebook_tqdm


データの読み込み

In [2]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submit = pd.read_csv("data/sample_submit.csv", header=None)

データの前処理

In [3]:
# 特徴量エンジニアリング関数
def feature_engineering(df):
    """財務指標などを追加する関数"""
    # 財務指標の作成 (分母が0になる可能性を考慮)
    df['自己資本比率'] = df['自己資本'] / (df['総資産'] + 1e-6)
    df['売上高営業利益率'] = df['営業利益'] / (df['売上'] + 1e-6)
    df['総資産回転率'] = df['売上'] / (df['総資産'] + 1e-6)
    df['負債比率'] = df['負債'] / (df['自己資本'] + 1e-6)
    # 新しい特徴量を追加
    df['従業員数_x_売上高営業利益率'] = df['従業員数'] * df['売上高営業利益率']
    # アンケートの平均と標準偏差を追加
    zenkaku_table = str.maketrans('0123456789', '０１２３４５６７８９')
    survey_cols = [f"アンケート{str(i).translate(zenkaku_table)}" for i in range(1, 12)]
    df['アンケート_平均'] = df[survey_cols].mean(axis=1)
    df['アンケート_標準偏差'] = df[survey_cols].std(axis=1)
    # 欠損値を-9999で埋める (LightGBMは欠損値を扱えるが、比率計算でのNaN/infに対応)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(-9999, inplace=True)
    return df

# train/testに特徴量エンジニアリングを適用
train_df = feature_engineering(train.copy())
test_df = feature_engineering(test.copy())

# カテゴリ変数のエンコーディング
categorical_features = ['業界', '上場種別', '特徴']
for col in categorical_features:
    # trainとtestを結合して語彙を作成し、未知のカテゴリに対応
    combined_data = pd.concat([train_df[col], test_df[col]]).astype(str)
    le = LabelEncoder()
    le.fit(combined_data)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))

# 特徴量とターゲットの定義
target = '購入フラグ'
# テキストデータやIDなど、学習から除外するカラム
drop_cols = ['企業ID', '企業名', '企業概要', '組織図', '今後のDX展望', '購入フラグ']
# trainとtestで共通して存在するカラムのみを特徴量として使用
common_cols = list(set(train_df.columns) & set(test_df.columns))
features = [col for col in common_cols if col not in drop_cols]

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]

print(f"学習に使用する特徴量の数: {len(features)}")
print(f"学習データのサイズ: {X_train.shape}")
print(f"テストデータのサイズ: {X_test.shape}")

# 学習データの購入フラグ1と0の数と割合を表示
y_train_positive = (y_train==1).sum()
y_train_negative = (y_train==0).sum()
print(f"学習データの購入フラグ1の数: {y_train_positive}")
print(f"学習データの購入フラグ0の数: {y_train_negative}")
print(f"購入フラグ1の割合: {y_train_positive / len(y_train):.4f}")

# 学習時のpositiveデータの重要度を引き上げる
scale_pos_weight = y_train_negative / y_train_positive

学習に使用する特徴量の数: 44
学習データのサイズ: (742, 44)
テストデータのサイズ: (800, 44)
学習データの購入フラグ1の数: 179
学習データの購入フラグ0の数: 563
購入フラグ1の割合: 0.2412


In [4]:
# テキストデータの特徴量化

# TF-IDFの適用 (今後のDX展望)
#tfidf = TfidfVectorizer(max_features=50) # まずは50個の重要単語に絞る
#train_text_features = tfidf.fit_transform(train_df['今後のDX展望']).toarray()
#test_text_features = tfidf.transform(test_df['今後のDX展望']).toarray()
tfidf_vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2)) # 100個の重要単語・バイグラムも考慮
all_text = pd.concat([train_df['今後のDX展望'], test_df['今後のDX展望']])
tfidf_vectorizer.fit(all_text)
train_tfidf_features = tfidf_vectorizer.transform(train_df['今後のDX展望']).toarray()
test_tfidf_features = tfidf_vectorizer.transform(test_df['今後のDX展望']).toarray()

# 特徴量データフレームに変換
#train_text_df = pd.DataFrame(train_text_features, columns=[f'tfidf_{i}' for i in range(train_text_features.shape[1])])
#test_text_df = pd.DataFrame(test_text_features, columns=[f'tfidf_{i}' for i in range(test_text_features.shape[1])])
train_tfidf_df = pd.DataFrame(train_tfidf_features, columns=[f'tfidf_outlook_{i}' for i in range(train_tfidf_features.shape[1])])
test_tfidf_df = pd.DataFrame(test_tfidf_features, columns=[f'tfidf_outlook_{i}' for i in range(test_tfidf_features.shape[1])])

# 元のデータと結合
#X_train = pd.concat([X_train, train_text_df], axis=1)
#X_test = pd.concat([X_test, test_text_df], axis=1)
X_train = pd.concat([X_train, train_tfidf_df], axis=1)
X_test = pd.concat([X_test, test_tfidf_df], axis=1)

# featureリストも更新
features = list(X_train.columns)

print(f"学習に使用する特徴量の数: {len(features)}")
print(f"学習データのサイズ: {X_train.shape}")
print(f"テストデータのサイズ: {X_test.shape}")

学習に使用する特徴量の数: 144
学習データのサイズ: (742, 144)
テストデータのサイズ: (800, 144)


In [5]:
# --- 企業概要に対するTF-IDFの追加 ---
tfidf_desc_vectorizer = TfidfVectorizer(max_features=50, ngram_range=(1, 2)) # 特徴量は50個に絞る
all_desc_text = pd.concat([train_df['企業概要'], test_df['企業概要']])
tfidf_desc_vectorizer.fit(all_desc_text)

train_tfidf_desc = tfidf_desc_vectorizer.transform(train_df['企業概要']).toarray()
test_tfidf_desc = tfidf_desc_vectorizer.transform(test_df['企業概要']).toarray()

train_tfidf_desc_df = pd.DataFrame(train_tfidf_desc, columns=[f'tfidf_desc_{i}' for i in range(train_tfidf_desc.shape[1])])
test_tfidf_desc_df = pd.DataFrame(test_tfidf_desc, columns=[f'tfidf_desc_{i}' for i in range(test_tfidf_desc.shape[1])])

# 既存のデータフレームと結合
X_train = pd.concat([X_train, train_tfidf_desc_df], axis=1)
X_test = pd.concat([X_test, test_tfidf_desc_df], axis=1)


# --- DX展望の簡易的な感情分析特徴量の追加 ---
# ポジティブな単語とネガティブな単語を定義
positive_words = ['積極', '強化', '推進', '投資', '拡大', '創出']
negative_words = ['慎重', '課題', '懸念', '限定的', '検討']

# 各単語の出現回数をカウント
for word in positive_words:
    train_df[f'word_{word}'] = train_df['今後のDX展望'].str.count(word)
    test_df[f'word_{word}'] = test_df['今後のDX展望'].str.count(word)

for word in negative_words:
    train_df[f'word_{word}'] = train_df['今後のDX展望'].str.count(word)
    test_df[f'word_{word}'] = test_df['今後のDX展望'].str.count(word)

# ポジティブ/ネガティブスコアを作成
X_train['positive_score'] = train_df[[f'word_{w}' for w in positive_words]].sum(axis=1)
X_test['positive_score'] = test_df[[f'word_{w}' for w in positive_words]].sum(axis=1)
X_train['negative_score'] = train_df[[f'word_{w}' for w in negative_words]].sum(axis=1)
X_test['negative_score'] = test_df[[f'word_{w}' for w in negative_words]].sum(axis=1)

# featureリストも更新
features = list(X_train.columns)

print(f"学習に使用する特徴量の数: {len(features)}")
print(f"学習データのサイズ: {X_train.shape}")
print(f"テストデータのサイズ: {X_test.shape}")

学習に使用する特徴量の数: 196
学習データのサイズ: (742, 196)
テストデータのサイズ: (800, 196)


モデルの構築

In [7]:
# LightGBMモデルの学習（StratifiedKFoldによる交差検証）
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
feature_importance_df = pd.DataFrame()

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # LightGBMのパラメータ設定
    # is_unbalance=True または scale_pos_weight を追加
    model = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        n_estimators=2000,
        learning_rate=0.01,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
        colsample_bytree=0.8,
        subsample=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        # === ここを追加 ===
        is_unbalance=True,
        #scale_pos_weight=scale_pos_weight  # (陰性サンプル数 / 陽性サンプル数)
        min_child_samples=10,
    )

    model.fit(X_train_fold, y_train_fold,
              eval_set=[(X_valid_fold, y_valid_fold)],
              eval_metric='logloss',
              callbacks=[lgb.early_stopping(100, verbose=False)])

    oof_preds[valid_idx] = model.predict_proba(X_valid_fold)[:, 1]
    sub_preds += model.predict_proba(X_test)[:, 1] / folds.n_splits
    
    # 特徴量重要度の保存
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = model.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)


print("-" * 30)
# --- 最適な閾値の探索 ---
thresholds = np.arange(0.1, 0.5, 0.01)
f1_scores = [f1_score(y_train, (oof_preds > t).astype(int)) for t in thresholds]
best_threshold = thresholds[np.argmax(f1_scores)]
best_f1 = np.max(f1_scores)

print(f"OOF（Out-of-Fold）予測でのベストF1スコア: {best_f1:.4f}")
print(f"ベストスコアを達成した閾値: {best_threshold:.2f}")

[LightGBM] [Info] Number of positive: 143, number of negative: 450
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7032
[LightGBM] [Info] Number of data points in the train set: 593, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.241147 -> initscore=-1.146403
[LightGBM] [Info] Start training from score -1.146403
[LightGBM] [Info] Number of positive: 143, number of negative: 450
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6931
[LightGBM] [Info] Number of data points in the train set: 593, number of used features: 177
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.241147 -> initscore=-1.146403
[LightGBM] [Info] Start training from score -1.146403
[LightGBM] [Info] Numb

In [None]:
OOF（Out-of-Fold）予測でのベストF1スコア: 0.6109
ベストスコアを達成した閾値: 0.22

In [18]:
NFOLDS = 5

params_base = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': 1000,
        'is_unbalance': True,  # 成功した設定は維持
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1
    }

def objective_lightgbm(trial):
    # パラメータの探索範囲を定義
    params_opt = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    params = params_base | params_opt

    # 交差検証ループ
    oof_preds_trial = np.zeros(train_df.shape[0])
    folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_valid_fold, y_valid_fold = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_valid_fold, y_valid_fold)],
                  callbacks=[lgb.early_stopping(50, verbose=False)])

        oof_preds_trial[valid_idx] = model.predict_proba(X_valid_fold)[:, 1]
    
    # このトライアルでの最適なF1スコアを閾値探索で見つける
    thresholds = np.arange(0.1, 0.5, 0.01)
    f1_scores = [f1_score(y_train, (oof_preds_trial > t).astype(int)) for t in thresholds]
    
    return np.max(f1_scores)

# # --- 最適化を実行する場合 (時間がかかることがあります) ---
study_lightgbm = optuna.create_study(direction='maximize')
study_lightgbm.optimize(objective_lightgbm, n_trials=50) # 50回の試行で探索

print('Best trial for LightGBM:')
trial_lightgbm = study_lightgbm.best_trial
print(f'  Value (Best F1 Score): {trial_lightgbm.value}')
print('  Best Params: ')
for key, value in trial_lightgbm.params.items():
    print(f'    {key}: {value}')

best_params_lightgbm = params_base | trial_lightgbm.params

[I 2025-10-17 00:49:56,340] A new study created in memory with name: no-name-e45d2f0f-69c0-4921-ba85-e027a07c6ddd
[I 2025-10-17 00:49:58,878] Trial 0 finished with value: 0.6558139534883721 and parameters: {'learning_rate': 0.010098354336778663, 'num_leaves': 66, 'max_depth': 6, 'min_child_samples': 40, 'subsample': 0.9462852032422089, 'colsample_bytree': 0.9858667034847988, 'reg_alpha': 0.005710096579921643, 'reg_lambda': 0.007618058451519296}. Best is trial 0 with value: 0.6558139534883721.
[I 2025-10-17 00:50:00,079] Trial 1 finished with value: 0.6617647058823529 and parameters: {'learning_rate': 0.028382926903424865, 'num_leaves': 49, 'max_depth': 11, 'min_child_samples': 40, 'subsample': 0.6068137866664104, 'colsample_bytree': 0.8905093443075198, 'reg_alpha': 7.300082402069525e-06, 'reg_lambda': 3.010627941031705e-06}. Best is trial 1 with value: 0.6617647058823529.
[I 2025-10-17 00:50:00,919] Trial 2 finished with value: 0.6618004866180048 and parameters: {'learning_rate': 0.059

Best trial for LightGBM:
  Value (Best F1 Score): 0.6873385012919897
  Best Params: 
    learning_rate: 0.07695074516306853
    num_leaves: 39
    max_depth: 3
    min_child_samples: 30
    subsample: 0.8915856859820709
    colsample_bytree: 0.6497439382521284
    reg_alpha: 0.018342465397323954
    reg_lambda: 0.0014512535195673494


In [19]:
best_params_lightgbm

{'objective': 'binary',
 'metric': 'binary_logloss',
 'n_estimators': 1000,
 'is_unbalance': True,
 'random_state': 42,
 'n_jobs': -1,
 'verbose': -1,
 'learning_rate': 0.07695074516306853,
 'num_leaves': 39,
 'max_depth': 3,
 'min_child_samples': 30,
 'subsample': 0.8915856859820709,
 'colsample_bytree': 0.6497439382521284,
 'reg_alpha': 0.018342465397323954,
 'reg_lambda': 0.0014512535195673494}

In [None]:
Best trial:
  Value (Best F1 Score): 0.6852791878172588
  Best Params: 
    learning_rate: 0.049226097491479354
    num_leaves: 24
    max_depth: 4
    min_child_samples: 35
    subsample: 0.9766010179115259
    colsample_bytree: 0.9743416839508543
    reg_alpha: 4.6044002895687174e-08
    reg_lambda: 0.0006790440644320187

FrozenTrial(number=43, state=1, values=[0.6852791878172588], datetime_start=datetime.datetime(2025, 10, 17, 0, 23, 5, 156096), datetime_complete=datetime.datetime(2025, 10, 17, 0, 23, 5, 997304), params={'learning_rate': 0.049226097491479354, 'num_leaves': 24, 'max_depth': 4, 'min_child_samples': 35, 'subsample': 0.9766010179115259, 'colsample_bytree': 0.9743416839508543, 'reg_alpha': 4.6044002895687174e-08, 'reg_lambda': 0.0006790440644320187}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=True, low=0.01, step=None), 'num_leaves': IntDistribution(high=80, log=False, low=20, step=1), 'max_depth': IntDistribution(high=12, log=False, low=3, step=1), 'min_child_samples': IntDistribution(high=100, log=False, low=5, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'reg_alpha': FloatDistribution(high=10.0, l

In [20]:
NFOLDS = 5

params_base = {
        'objective': 'Logloss', # CatBoostでは'Logloss'を指定
        'iterations': 1000,
        'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(), # 不均衡データへの対処
        'random_seed': 42,
        'verbose': 0, # 学習ログを非表示
        'early_stopping_rounds': 50 # 早期停止
    }

def objective_catboost(trial):
    # 不均衡データ対策のための重みを計算

    # パラメータの探索範囲を定義
    params_opt = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True), # L2正則化
    }

    params = params_base | params_opt

    # 交差検証ループ
    oof_preds_trial = np.zeros(train_df.shape[0])
    folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_valid_fold, y_valid_fold = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_valid_fold, y_valid_fold)],
                  use_best_model=True)

        oof_preds_trial[valid_idx] = model.predict_proba(X_valid_fold)[:, 1]
    
    # このトライアルでの最適なF1スコアを閾値探索で見つける
    thresholds = np.arange(0.1, 0.5, 0.01)
    f1_scores = [f1_score(y_train, (oof_preds_trial > t).astype(int)) for t in thresholds]
    
    return np.max(f1_scores)

study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(objective_catboost, n_trials=50) # 50回の試行で探索

print('Best trial for CatBoost:')
trial_catboost = study_catboost.best_trial
print(f'  Value (Best F1 Score): {trial_catboost.value}')
print('  Best Params: ')
for key, value in trial_catboost.params.items():
    print(f'    {key}: {value}')

best_params_catboost = params_base | trial_catboost.params

[I 2025-10-17 00:52:14,062] A new study created in memory with name: no-name-946674c9-376d-4c76-b6b8-757f624bbdda
[I 2025-10-17 00:52:25,075] Trial 0 finished with value: 0.5747126436781609 and parameters: {'learning_rate': 0.018423847690538526, 'depth': 9, 'min_data_in_leaf': 66, 'subsample': 0.9521487048759372, 'l2_leaf_reg': 2.4858460314250773e-08}. Best is trial 0 with value: 0.5747126436781609.
[I 2025-10-17 00:52:29,042] Trial 1 finished with value: 0.6582278481012658 and parameters: {'learning_rate': 0.039858000909430626, 'depth': 4, 'min_data_in_leaf': 67, 'subsample': 0.8513048227639659, 'l2_leaf_reg': 6.014034306844379}. Best is trial 1 with value: 0.6582278481012658.
[I 2025-10-17 00:52:43,902] Trial 2 finished with value: 0.6368159203980099 and parameters: {'learning_rate': 0.05792108705590046, 'depth': 9, 'min_data_in_leaf': 88, 'subsample': 0.6815510882022893, 'l2_leaf_reg': 0.4130198022189027}. Best is trial 1 with value: 0.6582278481012658.
[I 2025-10-17 00:53:05,949] T

Best trial for CatBoost:
  Value (Best F1 Score): 0.6737967914438503
  Best Params: 
    learning_rate: 0.012893567859358891
    depth: 5
    min_data_in_leaf: 83
    subsample: 0.9700324489092532
    l2_leaf_reg: 0.004589013293714057


In [None]:
Best trial for CatBoost:
  Value (Best F1 Score): 0.685
  Best Params: 
    learning_rate: 0.01506366663492991
    depth: 4
    min_data_in_leaf: 93
    subsample: 0.7518354099061272
    l2_leaf_reg: 0.07096168454986888

{'learning_rate': 0.049226097491479354,
 'num_leaves': 24,
 'max_depth': 4,
 'min_child_samples': 35,
 'subsample': 0.9766010179115259,
 'colsample_bytree': 0.9743416839508543,
 'reg_alpha': 4.6044002895687174e-08,
 'reg_lambda': 0.0006790440644320187}

In [21]:


# --- 交差検証の準備 ---
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# OOF予測値とテストデータ予測値を格納する配列を初期化
oof_preds_lgbm = np.zeros(train_df.shape[0])
sub_preds_lgbm = np.zeros(test_df.shape[0])
oof_preds_cat = np.zeros(train_df.shape[0])
sub_preds_cat = np.zeros(test_df.shape[0])


# --- モデル学習ループ ---
print("--- Starting LightGBM and CatBoost Training ---")
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # --- LightGBMの学習 ---
    lgbm = lgb.LGBMClassifier(**best_params_lightgbm)
    lgbm.fit(X_train_fold, y_train_fold,
             eval_set=[(X_valid_fold, y_valid_fold)],
             callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_preds_lgbm[valid_idx] = lgbm.predict_proba(X_valid_fold)[:, 1]
    sub_preds_lgbm += lgbm.predict_proba(X_test)[:, 1] / folds.n_splits

    # --- CatBoostの学習 ---
    cat = CatBoostClassifier(**best_params_catboost)
    cat.fit(X_train_fold, y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            use_best_model=True,
            verbose=0)
    oof_preds_cat[valid_idx] = cat.predict_proba(X_valid_fold)[:, 1]
    sub_preds_cat += cat.predict_proba(X_test)[:, 1] / folds.n_splits
    
    print(f"Fold {n_fold+1} finished.")

print("\n--- Training Finished ---")


# --- アンサンブルと閾値最適化 ---
# 2つのモデルのOOF予測値を単純平均
ensemble_oof_preds = (oof_preds_lgbm + oof_preds_cat) / 2

# 最適な閾値を探索
thresholds = np.arange(0.1, 0.5, 0.01)
f1_scores = [f1_score(y_train, (ensemble_oof_preds > t).astype(int)) for t in thresholds]
best_threshold_ensemble = thresholds[np.argmax(f1_scores)]
best_f1_ensemble = np.max(f1_scores)

print("-" * 40)
print(f"LightGBM単体のOOF F1スコア (参考): {f1_score(y_train, (oof_preds_lgbm > best_threshold_ensemble).astype(int)):.4f}")
print(f"CatBoost単体のOOF F1スコア (参考): {f1_score(y_train, (oof_preds_cat > best_threshold_ensemble).astype(int)):.4f}")
print("-" * 40)
print(f"アンサンブルモデルのベストOOF F1スコア: {best_f1_ensemble:.4f}")
print(f"アンサンブルモデルの最適閾値: {best_threshold_ensemble:.2f}")
print("-" * 40)

--- Starting LightGBM and CatBoost Training ---
Fold 1 finished.
Fold 2 finished.
Fold 3 finished.
Fold 4 finished.
Fold 5 finished.

--- Training Finished ---
----------------------------------------
LightGBM単体のOOF F1スコア (参考): 0.6793
CatBoost単体のOOF F1スコア (参考): 0.6590
----------------------------------------
アンサンブルモデルのベストOOF F1スコア: 0.6856
アンサンブルモデルの最適閾値: 0.49
----------------------------------------


In [None]:
# 提出ファイルの作成
# テストデータに対する予測値も同様に平均
ensemble_sub_preds = (sub_preds_lgbm + sub_preds_cat) / 2

# 最適化された閾値を使って最終的な予測を決定
predictions_ensemble = (ensemble_sub_preds > best_threshold_ensemble).astype(int)

# 提出用DataFrameを作成
submit_df_ensemble = sample_submit.copy()
submit_df_ensemble[1] = predictions_ensemble
submit_df_ensemble.to_csv('submission_ensemble.csv', index=False, header=False)

print("\nアンサンブルモデルの提出ファイル 'submission_ensemble.csv' を作成しました。")
print(f"提出ファイルでの購入予測数: {np.sum(predictions_ensemble)}")


アンサンブルモデルの提出ファイル 'submission_ensemble.csv' を作成しました。
提出ファイルでの購入予測数: 174
