In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib_fontja
os.chdir('../')
from src.feature import *
from src.model import *

  from .autonotebook import tqdm as notebook_tqdm


データ準備

In [None]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submit = pd.read_csv("data/sample_submit.csv", header=None)

In [None]:
# Xとyの分割
target_col = '購入フラグ'
X_train_df = train.drop(columns=[target_col])
y_train_df = train[target_col]
X_test_df = test.copy()

特徴量エンジニアリング

In [None]:
# 学習・テストデータの結合
X_train_df.index = "train_" + X_train_df.index.astype(str)
X_test_df.index = "test_" + X_test_df.index.astype(str)
X_df = pd.concat([X_train_df, X_test_df], axis=0)

# 不要な列の削除
drop_cols = ['企業ID', '企業名']
X_df = X_df.drop(columns=drop_cols)

# 欠損値の補完
X_df = impute_missing_values(X_df)

# 指標の組み合わせによる新規指標の作成
X_df = feature_engineering(X_df)

# カテゴリ変数を数値にエンコード
categorical_cols = ['業界', '上場種別', '特徴']
X_df = onehot_encode_categorical(X_df, categorical_cols)

# ２択（はい/いいえ）変数のエンコード
binary_cols = ['アンケート６']
X_df = encode_binary(X_df, binary_cols, true=1)

# アンケートデータの特徴量化
X_df = survey_features(X_df)

# 組織図の特徴量化
# 特定部門の有無
org_df = org_chart_features(X_df['組織図'])
# Transformer+PCA
embedding_df = pd.read_csv("data/組織図_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'組織図_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, org_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['組織図'])

# 今後のDX展望テキストデータの特徴量化
# 文章量
length_df = text_length_features(X_df['今後のDX展望'])
# Transformer+PCA
embedding_df = pd.read_csv("data/今後のDX展望_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'今後のDX展望_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, length_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['今後のDX展望'])

# 企業概要テキストデータの特徴量化
# Transformer+PCA
embedding_df = pd.read_csv("data/企業概要_embed.csv", index_col=0)
transform_pca_df = pca_reduction(embedding_df, n_components=5)
transform_pca_df.columns = [f'企業概要_emb_pca_{i}' for i in range(transform_pca_df.shape[1])]
X_df = pd.concat([X_df, transform_pca_df], axis=1)
X_df = X_df.drop(columns=['企業概要'])

# 学習・テストデータの再分割
X_train_df = X_df.loc[X_train_df.index].reset_index(drop=True)
X_test_df = X_df.loc[X_test_df.index].reset_index(drop=True)

In [5]:
list(X_df.columns)

['従業員数',
 '事業所数',
 '工場数',
 '店舗数',
 '資本金',
 '総資産',
 '流動資産',
 '固定資産',
 '負債',
 '短期借入金',
 '長期借入金',
 '純資産',
 '自己資本',
 '売上',
 '営業利益',
 '経常利益',
 '当期純利益',
 '営業CF',
 '減価償却費',
 '運転資本変動',
 '投資CF',
 '有形固定資産変動',
 '無形固定資産変動(ソフトウェア関連)',
 'アンケート１',
 'アンケート２',
 'アンケート３',
 'アンケート４',
 'アンケート５',
 'アンケート６',
 'アンケート７',
 'アンケート８',
 'アンケート９',
 'アンケート１０',
 'アンケート１１',
 '営業利益率',
 '総資産回転率',
 'ROA',
 '営業CFマージン',
 '流動比率',
 '無形固定資産変動',
 '投資効率対利益',
 '投資効率対CF',
 '新規投資度合い',
 '業界_IT',
 '業界_その他',
 '業界_その他サービス',
 '業界_アパレル・美容',
 '業界_エネルギー',
 '業界_エンタメ',
 '業界_ゲーム',
 '業界_コンサルティング',
 '業界_マスコミ',
 '業界_不動産',
 '業界_人材',
 '業界_化学',
 '業界_医療・福祉',
 '業界_商社',
 '業界_外食',
 '業界_専門サービス',
 '業界_小売',
 '業界_広告',
 '業界_建設・工事',
 '業界_教育',
 '業界_機械',
 '業界_機械関連サービス',
 '業界_生活用品',
 '業界_自動車・乗り物',
 '業界_製造',
 '業界_通信',
 '業界_通信機器',
 '業界_運輸・物流',
 '業界_金融',
 '業界_電気製品',
 '業界_食品',
 '上場種別_GR',
 '上場種別_PR',
 '上場種別_ST',
 '特徴_BtoB',
 '特徴_BtoC',
 '特徴_CtoC',
 '組織図_has_dx_it',
 '組織図_has_planning',
 '組織図_has_quality',
 '組織図_has_rd',
 '組織図_has_risk',
 '組織図_has_mfg',
 '組織図_has_

予測モデルのパラメータ最適化

In [6]:
# 基本パラメータ
lgbm_params_base = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 1000,
    'is_unbalance': True,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
    }

# 探索するパラメータを定義する「関数」
def define_lgbm_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# LightGBMモデルの最適化実行
lgbm_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="lightgbm",
    params_base=lgbm_params_base,
    define_params_func=define_lgbm_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-09 01:29:33,425] A new study created in memory with name: no-name-0b60b3ec-e85f-4f13-aa12-2883ee955ed1


--- Optimizing lightgbm ---


[I 2025-11-09 01:29:34,579] Trial 0 finished with value: 0.6731208607497268 and parameters: {'learning_rate': 0.02803122870432997, 'num_leaves': 37, 'max_depth': 3, 'min_child_samples': 24, 'subsample': 0.7840015836165994, 'colsample_bytree': 0.8518736630891488, 'reg_alpha': 0.0027210009516727826, 'reg_lambda': 2.6345037502091714e-08}. Best is trial 0 with value: 0.6731208607497268.
[I 2025-11-09 01:29:37,327] Trial 1 finished with value: 0.6708694717941925 and parameters: {'learning_rate': 0.010061482248400533, 'num_leaves': 25, 'max_depth': 7, 'min_child_samples': 32, 'subsample': 0.7663923468239127, 'colsample_bytree': 0.9147516336813085, 'reg_alpha': 0.01603008348749399, 'reg_lambda': 0.033856865310383365}. Best is trial 0 with value: 0.6731208607497268.
[I 2025-11-09 01:29:38,393] Trial 2 finished with value: 0.6821865665305251 and parameters: {'learning_rate': 0.05453512489670697, 'num_leaves': 40, 'max_depth': 9, 'min_child_samples': 20, 'subsample': 0.8120912978979616, 'colsamp


--- Optimization Finished ---
Best trial for lightgbm:
  Value (Best F1 Score): 0.71632
  Best Params:
    learning_rate: 0.09185601963084762
    num_leaves: 80
    max_depth: 7
    min_child_samples: 64
    subsample: 0.8935940071257527
    colsample_bytree: 0.6313134707347311
    reg_alpha: 0.00015652985396757415
    reg_lambda: 1.2926511611433944e-05

--- Confusion Matrix (Best Trial at Threshold: 0.4300) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 498          | 65           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 43           | 136          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.67662
  Recall:    0.75978
  F1 Score:  0.7157

In [7]:
# 基本パラメータ
xgb_params_base = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',       
    'n_estimators': 1000,
    'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 'is_unbalance' の代わり
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,                 # 'verbose': -1 -> 'verbosity': 0
    'booster': 'gbtree'             # デフォルトだが明記
    }

# 探索するパラメータを定義する「関数」
def define_xgb_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# XGBoostモデルの最適化実行
xgb_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="xgboost",
    params_base=xgb_params_base,
    define_params_func=define_xgb_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-09 01:30:59,537] A new study created in memory with name: no-name-37fe243c-e538-48fa-9df3-cd4c0e25697f


--- Optimizing xgboost ---


[I 2025-11-09 01:31:05,193] Trial 0 finished with value: 0.5989756350567232 and parameters: {'learning_rate': 0.014824724292385972, 'max_depth': 11, 'min_child_weight': 65, 'subsample': 0.7530711354141253, 'colsample_bytree': 0.6206647923944985, 'reg_alpha': 2.0388931480546835e-05, 'reg_lambda': 1.6720209625199131e-06}. Best is trial 0 with value: 0.5989756350567232.
[I 2025-11-09 01:31:06,693] Trial 1 finished with value: 0.5468305455720103 and parameters: {'learning_rate': 0.03839977335432059, 'max_depth': 4, 'min_child_weight': 93, 'subsample': 0.8391671795242978, 'colsample_bytree': 0.8945219122585233, 'reg_alpha': 3.278339038320222e-08, 'reg_lambda': 1.8088760764623746e-08}. Best is trial 0 with value: 0.5989756350567232.
[I 2025-11-09 01:31:12,275] Trial 2 finished with value: 0.5822166249281528 and parameters: {'learning_rate': 0.010747466412620022, 'max_depth': 11, 'min_child_weight': 80, 'subsample': 0.8081733169384757, 'colsample_bytree': 0.9194371166878067, 'reg_alpha': 0.00


--- Optimization Finished ---
Best trial for xgboost:
  Value (Best F1 Score): 0.72619
  Best Params:
    learning_rate: 0.07282289948746733
    max_depth: 7
    min_child_weight: 10
    subsample: 0.6017292072279093
    colsample_bytree: 0.9569452899506171
    reg_alpha: 0.24113966409763987
    reg_lambda: 0.00013928976266704493

--- Confusion Matrix (Best Trial at Threshold: 0.4800) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 502          | 61           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 42           | 137          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.69192
  Recall:    0.76536
  F1 Score:  0.72679 (F1 score on total OOF

In [8]:
# 基本パラメータ
cat_params_base = {
    'objective': 'Logloss', # CatBoostでは'Logloss'を指定
        'iterations': 1000,
        'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 不均衡データへの対処
        'random_seed': 42,
        'verbose': 0, # 学習ログを非表示
        'early_stopping_rounds': 10 # 早期停止
    }

# 探索するパラメータを定義する「関数」
def define_cat_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True), # L2正則化
    }

# CatBoostモデルの最適化実行
cat_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="catboost",
    params_base=cat_params_base,
    define_params_func=define_cat_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-09 01:34:07,897] A new study created in memory with name: no-name-9ebc0691-b589-4888-b52f-79ad841723f8


--- Optimizing catboost ---


[I 2025-11-09 01:34:10,579] Trial 0 finished with value: 0.5691455149788484 and parameters: {'learning_rate': 0.025771248659583162, 'depth': 8, 'min_data_in_leaf': 54, 'subsample': 0.7254287079587747, 'l2_leaf_reg': 1.7528277303287428e-08}. Best is trial 0 with value: 0.5691455149788484.
[I 2025-11-09 01:34:12,970] Trial 1 finished with value: 0.6235421731585291 and parameters: {'learning_rate': 0.013655724005944541, 'depth': 6, 'min_data_in_leaf': 46, 'subsample': 0.7828066872044912, 'l2_leaf_reg': 1.8372627861912708e-08}. Best is trial 1 with value: 0.6235421731585291.
[I 2025-11-09 01:34:14,215] Trial 2 finished with value: 0.5974293257144458 and parameters: {'learning_rate': 0.08163232102964803, 'depth': 6, 'min_data_in_leaf': 85, 'subsample': 0.7641339870488237, 'l2_leaf_reg': 0.002031724277230247}. Best is trial 1 with value: 0.6235421731585291.
[I 2025-11-09 01:34:26,653] Trial 3 finished with value: 0.614627552355014 and parameters: {'learning_rate': 0.048191088386142666, 'dept


--- Optimization Finished ---
Best trial for catboost:
  Value (Best F1 Score): 0.74620
  Best Params:
    learning_rate: 0.09182329168607842
    depth: 3
    min_data_in_leaf: 20
    subsample: 0.9973522156787301
    l2_leaf_reg: 8.355895288690249

--- Confusion Matrix (Best Trial at Threshold: 0.4700) ---
                          |Predicted Label              |
                          |-----------------------------|
                          | Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 490          | 73           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 29           | 150          | (FN, TP)
-----------|--------------|--------------|--------------|

--- OOF Scores (at Best Threshold) ---
  Precision: 0.67265
  Recall:    0.83799
  F1 Score:  0.74627 (F1 score on total OOF predictions)
  (Note: 'Best F1 Score' above is the CV mean optimized by Optuna.)



In [9]:
# アンサンブルモデルの学習
ensemble_result = train_ensemble_models(
    X_train_df=X_train_df,
    y_train_df=y_train_df,
    X_test_df=X_test_df,
    lgb_best_params = lgbm_results['best_params'],
    xgb_best_params = xgb_results['best_params'],
    cat_best_params = cat_results['best_params'],
    sample_submit=sample_submit,
    n_folds=5,
    early_stopping_rounds=10,
    thresholds=np.arange(0.1, 0.5, 0.01),
    random_state=42
)

--- Start Ensemble Training ---
Fold 1/5 started...
Fold 2/5 started...
Fold 3/5 started...
Fold 4/5 started...
Fold 5/5 started...
--- Ensemble Training Finished ---
Optimal Weights - LGB: 0.3333, XGB: 0.3333, CAT: 0.3333

--- Evaluation ---
Ensemble Best F1: 0.74500 (Threshold: 0.42)
LightGBM: 0.71579 (Threshold: 0.43)
XGBoost:  0.72679 (Threshold: 0.48)
CatBoost: 0.74627 (Threshold: 0.47)

--- Fold-wise Stability Evaluation (using global thresholds) ---
  Fold 1 F1 -> Ensemble: 0.77647 | LGB: 0.74699 | XGB: 0.77108 | CAT: 0.75610
  Fold 2 F1 -> Ensemble: 0.78947 | LGB: 0.76712 | XGB: 0.69333 | CAT: 0.76543
  Fold 3 F1 -> Ensemble: 0.75000 | LGB: 0.75676 | XGB: 0.70423 | CAT: 0.75000
  Fold 4 F1 -> Ensemble: 0.69767 | LGB: 0.62500 | XGB: 0.71605 | CAT: 0.70000
  Fold 5 F1 -> Ensemble: 0.71233 | LGB: 0.68571 | XGB: 0.74627 | CAT: 0.75949
Ensemble Fold F1 Mean: 0.74519, Std: 0.03550
