In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib_fontja
os.chdir('../')
from src.feature import *
from src.model import *

  from .autonotebook import tqdm as notebook_tqdm


データ準備

In [2]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submit = pd.read_csv("data/sample_submit.csv", header=None)

In [3]:
# Xとyの分割
target_col = '購入フラグ'
X_train_df = train.drop(columns=[target_col])
y_train_df = train[target_col]
X_test_df = test.copy()

特徴量エンジニアリング

In [4]:
# 学習・テストデータの結合
X_train_df.index = "train_" + X_train_df.index.astype(str)
X_test_df.index = "test_" + X_test_df.index.astype(str)
X_df = pd.concat([X_train_df, X_test_df], axis=0)

# 不要な列の削除
drop_cols = ['企業ID', '企業名']
X_df = X_df.drop(columns=drop_cols)

# 欠損値の補完
X_df = impute_missing_values(X_df)

# 指標の組み合わせによる新規指標の作成
X_df = feature_engineering(X_df)

# カテゴリ変数を数値にエンコード
categorical_cols = ['業界', '上場種別', '特徴']
X_df = onehot_encode_categorical(X_df, categorical_cols)

# ２択（はい/いいえ）変数のエンコード
binary_cols = ['アンケート６']
X_df = encode_binary(X_df, binary_cols, true=1)

# アンケートデータの特徴量化
X_df = survey_features(X_df, None)

# 組織図の特徴量化
# 特定部門の有無
org_df = org_chart_features(X_df['組織図'])
# Transformer+PCA
embedding_df = pd.read_csv("data/組織図_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'組織図_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, org_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['組織図'])

# 今後のDX展望テキストデータの特徴量化
# 文章量
length_df = text_length_features(X_df['今後のDX展望'])
# Transformer+PCA
embedding_df = pd.read_csv("data/今後のDX展望_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'今後のDX展望_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, length_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['今後のDX展望'])

# 企業概要テキストデータの特徴量化
# Transformer+PCA
embedding_df = pd.read_csv("data/企業概要_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'企業概要_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['企業概要'])

# 学習・テストデータの再分割
X_train_df = X_df.loc[X_train_df.index].reset_index(drop=True)
X_test_df = X_df.loc[X_test_df.index].reset_index(drop=True)

予測モデルのパラメータ最適化

In [5]:
# 基本パラメータ
lgbm_params_base = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 1000,
    'is_unbalance': True,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
    }

# 探索するパラメータを定義する「関数」
def define_lgbm_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# LightGBMモデルの最適化実行
lgbm_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="lightgbm",
    params_base=lgbm_params_base,
    define_params_func=define_lgbm_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-13 10:18:20,011] A new study created in memory with name: no-name-cd49b3c8-cbb2-436a-9f6c-3fedc327d545


--- Optimizing lightgbm ---


[I 2025-11-13 10:18:20,693] Trial 0 finished with value: 0.7130065161960479 and parameters: {'learning_rate': 0.09436751794377071, 'num_leaves': 75, 'max_depth': 3, 'min_child_samples': 34, 'subsample': 0.891548098693977, 'colsample_bytree': 0.7589273779837218, 'reg_alpha': 2.488563854626132e-07, 'reg_lambda': 1.04187513739438e-05}. Best is trial 0 with value: 0.7130065161960479.
[I 2025-11-13 10:18:22,658] Trial 1 finished with value: 0.6993388677698953 and parameters: {'learning_rate': 0.02856393919210095, 'num_leaves': 80, 'max_depth': 11, 'min_child_samples': 29, 'subsample': 0.7335311440225406, 'colsample_bytree': 0.9651776942411479, 'reg_alpha': 1.9119079564255332e-07, 'reg_lambda': 0.008433065869771833}. Best is trial 0 with value: 0.7130065161960479.
[I 2025-11-13 10:18:24,041] Trial 2 finished with value: 0.6861152658411724 and parameters: {'learning_rate': 0.018097045608683318, 'num_leaves': 25, 'max_depth': 4, 'min_child_samples': 20, 'subsample': 0.8845364772776485, 'colsam


--- Optimization Finished ---
Best trial for lightgbm:
  Value (Best F1 Score): 0.74546
  Best Params:
    learning_rate: 0.06421888593122477
    num_leaves: 60
    max_depth: 9
    min_child_samples: 71
    subsample: 0.8539993653457615
    colsample_bytree: 0.8912694307242868
    reg_alpha: 0.02518853779883269
    reg_lambda: 1.0984932570184269e-08

--- Confusion Matrix (Best Trial at Threshold: 0.4200) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 505          | 58           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 38           | 141          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.70854
  Recall:    0.78771
  F1 Score:  0.74603 (

In [6]:
# 基本パラメータ
xgb_params_base = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',       
    'n_estimators': 1000,
    'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 'is_unbalance' の代わり
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,                 # 'verbose': -1 -> 'verbosity': 0
    'booster': 'gbtree'             # デフォルトだが明記
    }

# 探索するパラメータを定義する「関数」
def define_xgb_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# XGBoostモデルの最適化実行
xgb_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="xgboost",
    params_base=xgb_params_base,
    define_params_func=define_xgb_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-13 10:19:15,260] A new study created in memory with name: no-name-c3f9cd75-cc52-45ec-bc1a-cbe4d8a43c8f


--- Optimizing xgboost ---


[I 2025-11-13 10:19:18,622] Trial 0 finished with value: 0.6190080662929673 and parameters: {'learning_rate': 0.03877360448209994, 'max_depth': 12, 'min_child_weight': 60, 'subsample': 0.7627294161861108, 'colsample_bytree': 0.8865758290380865, 'reg_alpha': 0.0008208834055961972, 'reg_lambda': 0.005480593801530841}. Best is trial 0 with value: 0.6190080662929673.
[I 2025-11-13 10:19:21,449] Trial 1 finished with value: 0.7056395173453998 and parameters: {'learning_rate': 0.09752225208579593, 'max_depth': 4, 'min_child_weight': 20, 'subsample': 0.9613700440131332, 'colsample_bytree': 0.9321712832074821, 'reg_alpha': 3.263206676467805, 'reg_lambda': 1.3663218584124212e-06}. Best is trial 1 with value: 0.7056395173453998.
[I 2025-11-13 10:19:25,161] Trial 2 finished with value: 0.6454631833971862 and parameters: {'learning_rate': 0.04423518172553682, 'max_depth': 7, 'min_child_weight': 44, 'subsample': 0.7082927156516882, 'colsample_bytree': 0.9260449032115606, 'reg_alpha': 5.918425311257


--- Optimization Finished ---
Best trial for xgboost:
  Value (Best F1 Score): 0.74129
  Best Params:
    learning_rate: 0.03266438336760609
    max_depth: 3
    min_child_weight: 5
    subsample: 0.8020627558871282
    colsample_bytree: 0.774224467164935
    reg_alpha: 4.377078526205754e-07
    reg_lambda: 0.0001586007158161246

--- Confusion Matrix (Best Trial at Threshold: 0.4800) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 514          | 49           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 45           | 134          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.73224
  Recall:    0.74860
  F1 Score:  0.74033 (F1 score on total OOF 

In [7]:
# 基本パラメータ
cat_params_base = {
    'objective': 'Logloss', # CatBoostでは'Logloss'を指定
        'iterations': 1000,
        'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 不均衡データへの対処
        'random_seed': 42,
        'verbose': 0, # 学習ログを非表示
        'early_stopping_rounds': 10 # 早期停止
    }

# 探索するパラメータを定義する「関数」
def define_cat_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True), # L2正則化
    }

# CatBoostモデルの最適化実行
cat_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="catboost",
    params_base=cat_params_base,
    define_params_func=define_cat_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-13 10:24:33,692] A new study created in memory with name: no-name-165a3284-4ce4-4b57-a226-c97bc90e8969


--- Optimizing catboost ---


[I 2025-11-13 10:24:37,278] Trial 0 finished with value: 0.6831893927668575 and parameters: {'learning_rate': 0.03998088848730111, 'depth': 6, 'min_data_in_leaf': 96, 'subsample': 0.6378608122703554, 'l2_leaf_reg': 0.22380024505818671}. Best is trial 0 with value: 0.6831893927668575.
[I 2025-11-13 10:24:39,190] Trial 1 finished with value: 0.5474795881438987 and parameters: {'learning_rate': 0.07662239452152697, 'depth': 8, 'min_data_in_leaf': 79, 'subsample': 0.995349618858959, 'l2_leaf_reg': 0.003032850945715206}. Best is trial 0 with value: 0.6831893927668575.
[I 2025-11-13 10:24:41,851] Trial 2 finished with value: 0.650021902665224 and parameters: {'learning_rate': 0.014498133183898042, 'depth': 5, 'min_data_in_leaf': 77, 'subsample': 0.9467030546099436, 'l2_leaf_reg': 1.7995292443471474e-05}. Best is trial 0 with value: 0.6831893927668575.
[I 2025-11-13 10:24:45,027] Trial 3 finished with value: 0.6890256206764283 and parameters: {'learning_rate': 0.014884681539816678, 'depth': 3


--- Optimization Finished ---
Best trial for catboost:
  Value (Best F1 Score): 0.75248
  Best Params:
    learning_rate: 0.07224027841139621
    depth: 4
    min_data_in_leaf: 12
    subsample: 0.8378334913954077
    l2_leaf_reg: 3.135950164278423

--- Confusion Matrix (Best Trial at Threshold: 0.4700) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 496          | 67           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 31           | 148          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.68837
  Recall:    0.82682
  F1 Score:  0.75127 (F1 score on total OOF predictions)
  (Note: 'Best F1 Score' above is the CV mean optimized by Optuna.)



In [8]:
# アンサンブルモデルの学習
ensemble_result = train_ensemble_models_(
    X_train_df=X_train_df,
    y_train_df=y_train_df,
    X_test_df=X_test_df,
    lgb_best_params = lgbm_results['best_params'],
    xgb_best_params = xgb_results['best_params'],
    cat_best_params = cat_results['best_params'],
    sample_submit=sample_submit,
    n_folds=5,
    early_stopping_rounds=10,
    thresholds=np.arange(0.1, 0.5, 0.01),
    random_state=42
)

--- Start Base Models Training ---
Fold 1/5 started...
Fold 2/5 started...
Fold 3/5 started...
Fold 4/5 started...
Fold 5/5 started...
--- Base Models Training Finished ---

--- Start Blending Optimization ---
Blending Weights - LGB: 0.3333, XGB: 0.3333, CAT: 0.3333

--- Start Stacking (Logistic Regression) ---
Stacking Coefs - LGB: 1.8704, XGB: 2.2752, CAT: 2.8395

--- Evaluation Results ---
Blending F1: 0.75510 (Th: 0.43)
Stacking F1: 0.75275 (Th: 0.41)
LightGBM F1: 0.74603
XGBoost  F1: 0.74033
CatBoost F1: 0.75127
