In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib_fontja
os.chdir('../')
from src.feature import *
from src.model import *

  from .autonotebook import tqdm as notebook_tqdm


データ準備

In [2]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submit = pd.read_csv("data/sample_submit.csv", header=None)

In [3]:
# Xとyの分割
target_col = '購入フラグ'
X_train_df = train.drop(columns=[target_col])
y_train_df = train[target_col]
X_test_df = test.copy()

特徴量エンジニアリング

In [4]:
# 学習・テストデータの結合
X_train_df.index = "train_" + X_train_df.index.astype(str)
X_test_df.index = "test_" + X_test_df.index.astype(str)
X_df = pd.concat([X_train_df, X_test_df], axis=0)

# 不要な列の削除
drop_cols = ['企業ID', '企業名']
X_df = X_df.drop(columns=drop_cols)

# 欠損値の補完
X_df = impute_missing_values(X_df)

# 指標の組み合わせによる新規指標の作成
X_df = feature_engineering(X_df)

# カテゴリ変数を数値にエンコード
categorical_cols = ['業界', '上場種別', '特徴']
X_df = onehot_encode_categorical(X_df, categorical_cols)

# ２択（はい/いいえ）変数のエンコード
binary_cols = ['アンケート６']
X_df = encode_binary(X_df, binary_cols, true=1)

# アンケートデータの特徴量化
X_df = survey_features(X_df, None)

# 組織図の特徴量化
# 特定部門の有無
org_df = org_chart_features(X_df['組織図'])
# Transformer+PCA
embedding_df = pd.read_csv("data/組織図_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'組織図_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, org_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['組織図'])

# 今後のDX展望テキストデータの特徴量化
# 文章量
length_df = text_length_features(X_df['今後のDX展望'])
# Transformer+PCA
embedding_df = pd.read_csv("data/今後のDX展望_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'今後のDX展望_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, length_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['今後のDX展望'])

# 企業概要テキストデータの特徴量化
# Transformer+PCA
embedding_df = pd.read_csv("data/企業概要_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'企業概要_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['企業概要'])

# 学習・テストデータの再分割
X_train_df = X_df.loc[X_train_df.index].reset_index(drop=True)
X_test_df = X_df.loc[X_test_df.index].reset_index(drop=True)

予測モデルのパラメータ最適化

In [5]:
# 基本パラメータ
lgbm_params_base = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 1000,
    'is_unbalance': True,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
    }

# 探索するパラメータを定義する「関数」
def define_lgbm_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# LightGBMモデルの最適化実行
lgbm_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="lightgbm",
    params_base=lgbm_params_base,
    define_params_func=define_lgbm_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-12 23:41:59,718] A new study created in memory with name: no-name-68cdec36-c42a-46bd-a28c-2b23a569670d


--- Optimizing lightgbm ---


[I 2025-11-12 23:42:02,530] Trial 0 finished with value: 0.699306408492455 and parameters: {'learning_rate': 0.02436105826754386, 'num_leaves': 53, 'max_depth': 4, 'min_child_samples': 63, 'subsample': 0.9082973976207168, 'colsample_bytree': 0.7974655475844776, 'reg_alpha': 0.10990465286840828, 'reg_lambda': 0.0034082106906830646}. Best is trial 0 with value: 0.699306408492455.
[I 2025-11-12 23:42:04,349] Trial 1 finished with value: 0.6902206435539768 and parameters: {'learning_rate': 0.046469009131287584, 'num_leaves': 71, 'max_depth': 9, 'min_child_samples': 65, 'subsample': 0.9985303013779231, 'colsample_bytree': 0.9356025032304722, 'reg_alpha': 3.305477415606281e-06, 'reg_lambda': 0.0004408616002239865}. Best is trial 0 with value: 0.699306408492455.
[I 2025-11-12 23:42:07,812] Trial 2 finished with value: 0.6666052425665102 and parameters: {'learning_rate': 0.01802643846649726, 'num_leaves': 22, 'max_depth': 12, 'min_child_samples': 18, 'subsample': 0.9853422095219707, 'colsample


--- Optimization Finished ---
Best trial for lightgbm:
  Value (Best F1 Score): 0.73764
  Best Params:
    learning_rate: 0.08429724986941373
    num_leaves: 61
    max_depth: 3
    min_child_samples: 38
    subsample: 0.9339268897596533
    colsample_bytree: 0.7614675021402157
    reg_alpha: 5.491449933213162e-05
    reg_lambda: 0.06093704295503494

--- Confusion Matrix (Best Trial at Threshold: 0.4000) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 494          | 69           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 34           | 145          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.67757
  Recall:    0.81006
  F1 Score:  0.73791 (F

In [6]:
# 基本パラメータ
xgb_params_base = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',       
    'n_estimators': 1000,
    'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 'is_unbalance' の代わり
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,                 # 'verbose': -1 -> 'verbosity': 0
    'booster': 'gbtree'             # デフォルトだが明記
    }

# 探索するパラメータを定義する「関数」
def define_xgb_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# XGBoostモデルの最適化実行
xgb_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="xgboost",
    params_base=xgb_params_base,
    define_params_func=define_xgb_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-12 23:43:42,131] A new study created in memory with name: no-name-25afeb7c-f21f-43ac-bad6-f86ae5081476


--- Optimizing xgboost ---


[I 2025-11-12 23:43:44,922] Trial 0 finished with value: 0.5814808192780658 and parameters: {'learning_rate': 0.06078055622541551, 'max_depth': 9, 'min_child_weight': 82, 'subsample': 0.8167762214917137, 'colsample_bytree': 0.7472382746179268, 'reg_alpha': 0.008588228528373229, 'reg_lambda': 0.000501142458401231}. Best is trial 0 with value: 0.5814808192780658.
[I 2025-11-12 23:43:49,884] Trial 1 finished with value: 0.6117720508596484 and parameters: {'learning_rate': 0.04711414721806807, 'max_depth': 7, 'min_child_weight': 59, 'subsample': 0.715153973496131, 'colsample_bytree': 0.8692970830835837, 'reg_alpha': 0.004232021657054478, 'reg_lambda': 2.288210506225864e-06}. Best is trial 1 with value: 0.6117720508596484.
[I 2025-11-12 23:43:53,747] Trial 2 finished with value: 0.6059702031069352 and parameters: {'learning_rate': 0.077843722278861, 'max_depth': 5, 'min_child_weight': 76, 'subsample': 0.9444492826401792, 'colsample_bytree': 0.9216185613865379, 'reg_alpha': 7.393176363035951


--- Optimization Finished ---
Best trial for xgboost:
  Value (Best F1 Score): 0.73772
  Best Params:
    learning_rate: 0.010977503833937455
    max_depth: 3
    min_child_weight: 5
    subsample: 0.8357898827121102
    colsample_bytree: 0.6778869205927853
    reg_alpha: 1.155342938102474e-08
    reg_lambda: 3.171340125356516

--- Confusion Matrix (Best Trial at Threshold: 0.4600) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 506          | 57           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 41           | 138          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.70769
  Recall:    0.77095
  F1 Score:  0.73797 (F1 score on total OOF pr

In [7]:
# 基本パラメータ
cat_params_base = {
    'objective': 'Logloss', # CatBoostでは'Logloss'を指定
        'iterations': 1000,
        'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 不均衡データへの対処
        'random_seed': 42,
        'verbose': 0, # 学習ログを非表示
        'early_stopping_rounds': 10 # 早期停止
    }

# 探索するパラメータを定義する「関数」
def define_cat_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True), # L2正則化
    }

# CatBoostモデルの最適化実行
cat_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="catboost",
    params_base=cat_params_base,
    define_params_func=define_cat_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-12 23:55:32,658] A new study created in memory with name: no-name-9d9af4dd-11eb-4063-a343-608bb088ddd1


--- Optimizing catboost ---


[I 2025-11-12 23:55:39,914] Trial 0 finished with value: 0.6021846445182651 and parameters: {'learning_rate': 0.016927380811615815, 'depth': 9, 'min_data_in_leaf': 48, 'subsample': 0.6009927676215964, 'l2_leaf_reg': 6.261048297218274e-07}. Best is trial 0 with value: 0.6021846445182651.
[I 2025-11-12 23:55:41,744] Trial 1 finished with value: 0.5809157355389802 and parameters: {'learning_rate': 0.08158144150958244, 'depth': 6, 'min_data_in_leaf': 77, 'subsample': 0.7038347048215046, 'l2_leaf_reg': 2.2521667835729668e-08}. Best is trial 0 with value: 0.6021846445182651.
[I 2025-11-12 23:55:45,237] Trial 2 finished with value: 0.6234709390102442 and parameters: {'learning_rate': 0.06775316350904638, 'depth': 8, 'min_data_in_leaf': 83, 'subsample': 0.8670639412324189, 'l2_leaf_reg': 0.011956558252699945}. Best is trial 2 with value: 0.6234709390102442.
[I 2025-11-12 23:55:47,299] Trial 3 finished with value: 0.5330665804761616 and parameters: {'learning_rate': 0.09459877595395552, 'depth'


--- Optimization Finished ---
Best trial for catboost:
  Value (Best F1 Score): 0.73194
  Best Params:
    learning_rate: 0.03606332831708327
    depth: 3
    min_data_in_leaf: 61
    subsample: 0.7319054035471461
    l2_leaf_reg: 0.7034017540040058

--- Confusion Matrix (Best Trial at Threshold: 0.4900) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 498          | 65           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 38           | 141          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.68447
  Recall:    0.78771
  F1 Score:  0.73247 (F1 score on total OOF predictions)
  (Note: 'Best F1 Score' above is the CV mean optimized by Optuna.)


In [9]:
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score
from typing import Dict, Any, Callable
from datetime import datetime
from scipy.optimize import minimize
from sklearn.linear_model import LogisticRegression
# 改良版アンサンブル学習関数（ブレンディング＋スタッキング）
def train_ensemble_models_(
    X_train_df: pd.DataFrame,
    y_train_df: pd.Series,
    X_test_df: pd.DataFrame,
    lgb_best_params: dict,
    xgb_best_params: dict,
    cat_best_params: dict,
    sample_submit: pd.DataFrame,
    n_folds: int = 5,
    early_stopping_rounds: int = 50,
    thresholds: np.ndarray = np.arange(0.1, 0.5, 0.01),
    random_state: int = 42
) -> dict:
    """
    LightGBM, XGBoost, CatBoostの最適化済みパラメータを用いてアンサンブル学習（Blending & Stacking）を行う関数。
    """
    
    # --- 1. Base Models Training (Level 1) ---
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    n_trains = X_train_df.shape[0]
    n_tests = X_test_df.shape[0]

    # OOF予測値とテスト予測値の初期化
    oof_preds_lgb = np.zeros(n_trains)
    oof_preds_xgb = np.zeros(n_trains)
    oof_preds_cat = np.zeros(n_trains)
    
    test_preds_lgb = np.zeros(n_tests)
    test_preds_xgb = np.zeros(n_tests)
    test_preds_cat = np.zeros(n_tests)

    print("--- Start Base Models Training ---")
    all_valid_indices = []
    
    for fold, (train_idx, valid_idx) in enumerate(folds.split(X_train_df, y_train_df)):
        print(f"Fold {fold+1}/{folds.n_splits} started...")
        X_train_fold, y_train_fold = X_train_df.iloc[train_idx], y_train_df.iloc[train_idx]
        X_valid_fold, y_valid_fold = X_train_df.iloc[valid_idx], y_train_df.iloc[valid_idx]
        all_valid_indices.append(valid_idx)

        # LightGBM
        lgb_model = lgb.LGBMClassifier(**lgb_best_params)
        lgb_model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            callbacks=[lgb.early_stopping(early_stopping_rounds, verbose=False)]
        )
        oof_preds_lgb[valid_idx] = lgb_model.predict_proba(X_valid_fold)[:, 1]
        test_preds_lgb += lgb_model.predict_proba(X_test_df)[:, 1] / folds.n_splits

        # XGBoost
        xgb_cp_params = xgb_best_params.copy()
        xgb_cp_params['early_stopping_rounds'] = early_stopping_rounds
        xgb_model = xgb.XGBClassifier(**xgb_cp_params)
        xgb_model.fit(X_train_fold, y_train_fold,
                      eval_set=[(X_valid_fold, y_valid_fold)],
                      verbose=False)
        oof_preds_xgb[valid_idx] = xgb_model.predict_proba(X_valid_fold)[:, 1]
        test_preds_xgb += xgb_model.predict_proba(X_test_df)[:, 1] / folds.n_splits

        # CatBoost
        cat_model = CatBoostClassifier(**cat_best_params)
        cat_model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            use_best_model=True,
            early_stopping_rounds=early_stopping_rounds,
            verbose=False
        )
        oof_preds_cat[valid_idx] = cat_model.predict_proba(X_valid_fold)[:, 1]
        test_preds_cat += cat_model.predict_proba(X_test_df)[:, 1] / folds.n_splits

    print("--- Base Models Training Finished ---")

    # メタ特徴量の作成（Level 1の予測値を結合）
    train_meta = np.column_stack([oof_preds_lgb, oof_preds_xgb, oof_preds_cat])
    test_meta = np.column_stack([test_preds_lgb, test_preds_xgb, test_preds_cat])

    # --- 2. Blending Implementation (Optimization) ---
    print("\n--- Start Blending Optimization ---")
    def objective_function(weights):
        weighted_oof_preds = np.dot(train_meta, weights)
        f1_scores_list = [
            f1_score(y_train_df, (weighted_oof_preds > t).astype(int)) for t in thresholds
        ]
        return -np.max(f1_scores_list)

    constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
    bounds = [(0, 1) for _ in range(3)]
    initial_weights = np.array([1/3, 1/3, 1/3])
    
    result = minimize(objective_function, initial_weights, method='SLSQP', bounds=bounds, constraints=constraints)
    
    best_weights = result.x if result.success else initial_weights
    print(f"Blending Weights - LGB: {best_weights[0]:.4f}, XGB: {best_weights[1]:.4f}, CAT: {best_weights[2]:.4f}")
    
    oof_preds_blending = np.dot(train_meta, best_weights)
    test_preds_blending = np.dot(test_meta, best_weights)

    # --- 3. Stacking Implementation (Meta Model) ---
    print("\n--- Start Stacking (Logistic Regression) ---")
    # メタモデルにはロジスティック回帰を使用（過学習しにくいため）
    meta_model = LogisticRegression(random_state=random_state)
    meta_model.fit(train_meta, y_train_df)
    
    oof_preds_stacking = meta_model.predict_proba(train_meta)[:, 1]
    test_preds_stacking = meta_model.predict_proba(test_meta)[:, 1]
    
    # スタッキングモデルの係数を確認（どのモデルを重要視したか）
    print(f"Stacking Coefs - LGB: {meta_model.coef_[0][0]:.4f}, XGB: {meta_model.coef_[0][1]:.4f}, CAT: {meta_model.coef_[0][2]:.4f}")

    # --- 4. Threshold Optimization & Scoring ---
    def get_best_metrics(y_true, y_pred_proba, thresholds):
        scores = [f1_score(y_true, (y_pred_proba > t).astype(int)) for t in thresholds]
        best_idx = np.argmax(scores)
        return np.max(scores), thresholds[best_idx]

    # 各モデルのスコア算出
    best_f1_blend, best_th_blend = get_best_metrics(y_train_df, oof_preds_blending, thresholds)
    best_f1_stack, best_th_stack = get_best_metrics(y_train_df, oof_preds_stacking, thresholds)
    best_f1_lgb, best_th_lgb = get_best_metrics(y_train_df, oof_preds_lgb, thresholds)
    best_f1_xgb, best_th_xgb = get_best_metrics(y_train_df, oof_preds_xgb, thresholds)
    best_f1_cat, best_th_cat = get_best_metrics(y_train_df, oof_preds_cat, thresholds)

    print("\n--- Evaluation Results ---")
    print(f"Blending F1: {best_f1_blend:.5f} (Th: {best_th_blend:.2f})")
    print(f"Stacking F1: {best_f1_stack:.5f} (Th: {best_th_stack:.2f})")
    print(f"LightGBM F1: {best_f1_lgb:.5f}")
    print(f"XGBoost  F1: {best_f1_xgb:.5f}")
    print(f"CatBoost F1: {best_f1_cat:.5f}")

    # --- 5. Submission Files Generation ---
    now = datetime.now()
    timestamp = now.strftime("%m%d%H%M")
    submission_files = []

    def save_submission(preds, threshold, suffix):
        final_preds = (preds > threshold).astype(int)
        submit_df = sample_submit.copy()
        submit_df[1] = final_preds
        filename = f'submission_{suffix}_{timestamp}.csv'
        submit_df.to_csv(filename, index=False, header=False)
        submission_files.append(filename)

    # Blending Submission
    save_submission(test_preds_blending, best_th_blend, "blending")
    
    # Stacking Submission
    save_submission(test_preds_stacking, best_th_stack, "stacking")

    # 単体モデル（Blendingより良ければ出力）
    global_best_f1 = max(best_f1_blend, best_f1_stack)
    if best_f1_lgb > global_best_f1: save_submission(test_preds_lgb, best_th_lgb, "lgb")
    if best_f1_xgb > global_best_f1: save_submission(test_preds_xgb, best_th_xgb, "xgb")
    if best_f1_cat > global_best_f1: save_submission(test_preds_cat, best_th_cat, "cat")

    return {
        'scores': {
            'Blending': best_f1_blend,
            'Stacking': best_f1_stack,
            'LGB': best_f1_lgb,
            'XGB': best_f1_xgb,
            'CAT': best_f1_cat
        },
        'thresholds': {
            'Blending': best_th_blend,
            'Stacking': best_th_stack
        },
        'blending_weights': best_weights.tolist(),
        'stacking_model': meta_model,
        'submission_files': submission_files
    }


In [10]:
# アンサンブルモデルの学習
ensemble_result = train_ensemble_models_(
    X_train_df=X_train_df,
    y_train_df=y_train_df,
    X_test_df=X_test_df,
    lgb_best_params = lgbm_results['best_params'],
    xgb_best_params = xgb_results['best_params'],
    cat_best_params = cat_results['best_params'],
    sample_submit=sample_submit,
    n_folds=5,
    early_stopping_rounds=10,
    thresholds=np.arange(0.1, 0.5, 0.01),
    random_state=42
)

--- Start Base Models Training ---
Fold 1/5 started...
Fold 2/5 started...
Fold 3/5 started...
Fold 4/5 started...
Fold 5/5 started...
--- Base Models Training Finished ---

--- Start Blending Optimization ---
Blending Weights - LGB: 0.3333, XGB: 0.3333, CAT: 0.3333

--- Start Stacking (Logistic Regression) ---
Stacking Coefs - LGB: 1.5350, XGB: 2.5784, CAT: 2.9940

--- Evaluation Results ---
Blending F1: 0.73791 (Th: 0.43)
Stacking F1: 0.73846 (Th: 0.31)
LightGBM F1: 0.73791
XGBoost  F1: 0.73797
CatBoost F1: 0.73247
