In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib_fontja
os.chdir('../')
from src.feature import *
from src.model import *

  from .autonotebook import tqdm as notebook_tqdm


データ準備

In [2]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submit = pd.read_csv("data/sample_submit.csv", header=None)

In [6]:
# Xとyの分割
target_col = '購入フラグ'
X_train_df = train.drop(columns=[target_col])
y_train_df = train[target_col]
X_test_df = test.copy()

特徴量エンジニアリング

In [7]:
# 学習・テストデータの結合
X_train_df.index = "train_" + X_train_df.index.astype(str)
X_test_df.index = "test_" + X_test_df.index.astype(str)
X_df = pd.concat([X_train_df, X_test_df], axis=0)

# 不要な列の削除
drop_cols = ['企業ID', '企業名']
X_df = X_df.drop(columns=drop_cols)

# 指標の組み合わせによる新規指標の作成
X_df = feature_engineering(X_df)

# カテゴリ変数を数値にエンコード
#categorical_cols = ['業界', '上場種別', '特徴']
#X_df = onehot_encode_categorical(X_df, categorical_cols)
drop_cols = ['業界', '上場種別', '特徴']
X_df = X_df.drop(columns=drop_cols)

# ２択（はい/いいえ）変数のエンコード
binary_cols = ['アンケート６']
X_df = encode_binary(X_df, binary_cols, true=1)

# 組織図の特徴量化
org_df = org_chart_features(X_df['組織図'])
X_df = pd.concat([X_df, org_df], axis=1)
X_df = X_df.drop(columns=['組織図'])

# 今後のDX展望テキストデータの特徴量化（文章量, ポジネガ出現率, TF-IDF）
length_df = text_length_features(X_df['今後のDX展望'])
tfidf_df = tfidf_vectorization(X_df['今後のDX展望'], max_features=30, ngram_range=(1, 2))
X_df = pd.concat([X_df, length_df, tfidf_df], axis=1)
X_df = X_df.drop(columns=['今後のDX展望'])

# 企業概要テキストデータの特徴量化（文章量, ポジネガ出現率, TF-IDF）
tfidf_df = tfidf_vectorization(X_df['企業概要'], max_features=30, ngram_range=(1, 2))
X_df = pd.concat([X_df, tfidf_df], axis=1)
X_df = X_df.drop(columns=['企業概要'])

# 学習・テストデータの再分割
X_train_df = X_df.loc[X_train_df.index].reset_index(drop=True)
X_test_df = X_df.loc[X_test_df.index].reset_index(drop=True)

In [8]:
list(X_train_df.columns)

['従業員数',
 '事業所数',
 '工場数',
 '店舗数',
 '資本金',
 '総資産',
 '流動資産',
 '固定資産',
 '負債',
 '短期借入金',
 '長期借入金',
 '純資産',
 '自己資本',
 '売上',
 '営業利益',
 '経常利益',
 '当期純利益',
 '営業CF',
 '減価償却費',
 '運転資本変動',
 '投資CF',
 '有形固定資産変動',
 '無形固定資産変動(ソフトウェア関連)',
 'アンケート１',
 'アンケート２',
 'アンケート３',
 'アンケート４',
 'アンケート５',
 'アンケート６',
 'アンケート７',
 'アンケート８',
 'アンケート９',
 'アンケート１０',
 'アンケート１１',
 '営業利益率',
 '総資産回転率',
 'ROA',
 '営業CFマージン',
 '流動比率',
 '無形固定資産変動',
 '投資効率対利益',
 '投資効率対CF',
 '新規投資度合い',
 '組織図_line_count',
 '組織図_unit_count',
 '組織図_has_dx_it',
 '組織図_has_planning',
 '組織図_has_quality',
 '組織図_has_rd',
 '組織図_has_risk',
 '組織図_has_mfg',
 '今後のDX展望_length',
 '今後のDX展望_ai',
 '今後のDX展望_ai iot',
 '今後のDX展望_dx',
 '今後のDX展望_iot',
 '今後のDX展望_このように',
 '今後のDX展望_これにより',
 '今後のDX展望_さらに',
 '今後のDX展望_しかし',
 '今後のDX展望_そのため',
 '今後のDX展望_その結果',
 '今後のDX展望_ただし',
 '今後のDX展望_また',
 '今後のDX展望_もっとも',
 '今後のDX展望_クラウド',
 '今後のDX展望_クラウド ai',
 '今後のDX展望_クラウドやai',
 '今後のDX展望_一方',
 '今後のDX展望_今後については',
 '今後のDX展望_今後についても',
 '今後のDX展望_今後は',
 '今後のDX展望_具体的には',
 '今後のDX展望_加えて',
 '今後のDX展望_弊社では

予測モデルのパラメータ最適化

In [9]:
# 基本パラメータ
lgbm_params_base = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 1000,
    'is_unbalance': True,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
    }

# 探索するパラメータを定義する「関数」
def define_lgbm_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# LightGBMモデルの最適化実行
lgbm_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="lightgbm",
    params_base=lgbm_params_base,
    define_params_func=define_lgbm_params,
    n_trials=50
)

[I 2025-11-07 01:13:25,678] A new study created in memory with name: no-name-d8d88583-80f9-436c-9136-99d29e32f826


--- Optimizing lightgbm ---


[I 2025-11-07 01:13:30,370] Trial 0 finished with value: 0.6542868353537404 and parameters: {'learning_rate': 0.025283992558709452, 'num_leaves': 75, 'max_depth': 5, 'min_child_samples': 43, 'subsample': 0.8474359625609011, 'colsample_bytree': 0.9790552472325648, 'reg_alpha': 7.871382749471019e-07, 'reg_lambda': 6.464516356126631e-08}. Best is trial 0 with value: 0.6542868353537404.
[I 2025-11-07 01:13:39,052] Trial 1 finished with value: 0.6352164108663791 and parameters: {'learning_rate': 0.011805158206264939, 'num_leaves': 68, 'max_depth': 7, 'min_child_samples': 10, 'subsample': 0.9929226962486444, 'colsample_bytree': 0.613107063873147, 'reg_alpha': 3.5557572378736676e-08, 'reg_lambda': 3.7390970266240493e-06}. Best is trial 0 with value: 0.6542868353537404.
[I 2025-11-07 01:13:42,672] Trial 2 finished with value: 0.6389529362537493 and parameters: {'learning_rate': 0.028685916011698055, 'num_leaves': 45, 'max_depth': 5, 'min_child_samples': 81, 'subsample': 0.8282919703049504, 'co


--- Optimization Finished ---
Best trial for lightgbm:
  Value (Best F1 Score): 0.67839
  Best Params:
    learning_rate: 0.08304205949097239
    num_leaves: 63
    max_depth: 3
    min_child_samples: 69
    subsample: 0.830716801487515
    colsample_bytree: 0.6403612911941047
    reg_alpha: 1.1604448067191962e-07
    reg_lambda: 0.02949395789446631

--- Confusion Matrix (Best Trial at Threshold: 0.3900) ---
--------------------------|Predicted Label              |
--------------------------|-----------------------------|
--------------------------| Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 475          | 88           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 42           | 137          | (FN, TP)
-----------|--------------|--------------|--------------|

--- Feature Importance (Best Model) ---
               Feature  Importance
0               アンケート７       

In [18]:
list(xgb_results["feature_importance"]["Feature"])

['上場種別_ST',
 '総資産',
 '店舗数',
 '今後のDX展望_今後は',
 '営業CFマージン',
 '当期純利益',
 '無形固定資産変動',
 '無形固定資産変動(ソフトウェア関連)',
 '新規投資度合い',
 '営業利益率',
 '流動比率',
 'ROA',
 '投資効率対CF',
 '長期借入金',
 '今後のDX展望_さらに',
 '短期借入金',
 '営業CF',
 'アンケート４',
 'アンケート８',
 '今後のDX展望_iot',
 '投資効率対利益',
 '今後のDX展望_クラウド',
 'アンケート２',
 '組織図_has_dx_it',
 'アンケート１０',
 '従業員数',
 'アンケート１',
 '自己資本',
 '流動資産',
 '経常利益',
 'アンケート１１',
 '純資産',
 'アンケート７',
 '企業概要_当社は',
 '今後のDX展望_ai',
 '特徴_BtoC',
 'アンケート３',
 'アンケート５',
 '事業所数',
 '減価償却費',
 '組織図_unit_count',
 '今後のDX展望_length',
 '投資CF',
 '工場数',
 '負債',
 '企業概要_また',
 '企業概要_さらに',
 '総資産回転率',
 '組織図_line_count',
 '今後のDX展望_具体的には',
 '固定資産',
 '資本金',
 '売上',
 '有形固定資産変動',
 '組織図_has_mfg',
 '運転資本変動',
 '営業利益',
 '業界_その他',
 '業界_IT',
 'アンケート９',
 'アンケート６',
 '業界_広告',
 '業界_その他サービス',
 '業界_エネルギー',
 '業界_アパレル・美容',
 '業界_ゲーム',
 '業界_コンサルティング',
 '業界_マスコミ',
 '業界_エンタメ',
 '業界_人材',
 '業界_化学',
 '業界_医療・福祉',
 '業界_商社',
 '業界_外食',
 '業界_専門サービス',
 '業界_小売',
 '業界_不動産',
 '上場種別_PR',
 '上場種別_GR',
 '業界_食品',
 '業界_電気製品',
 '業界_運輸・物流',
 '業界_金融',
 '業界_通信機器',
 '業界_通信',


In [10]:
# 基本パラメータ
xgb_params_base = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',       
    'n_estimators': 1000,
    'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 'is_unbalance' の代わり
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,                 # 'verbose': -1 -> 'verbosity': 0
    'booster': 'gbtree'             # デフォルトだが明記
    }

# 探索するパラメータを定義する「関数」
def define_xgb_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# XGBoostモデルの最適化実行
xgb_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="xgboost",
    params_base=xgb_params_base,
    define_params_func=define_xgb_params,
    n_trials=50
)

[I 2025-11-07 01:15:16,907] A new study created in memory with name: no-name-8cd6b862-f99e-4ec0-bb93-bb42f79b4f19


--- Optimizing xgboost ---


[I 2025-11-07 01:15:32,867] Trial 0 finished with value: 0.5944108725010773 and parameters: {'learning_rate': 0.02255025481100187, 'max_depth': 12, 'min_child_weight': 47, 'subsample': 0.6705762696366907, 'colsample_bytree': 0.6409576383114287, 'reg_alpha': 6.777311604844179, 'reg_lambda': 0.4199964381506101}. Best is trial 0 with value: 0.5944108725010773.
[I 2025-11-07 01:15:43,800] Trial 1 finished with value: 0.5689538466465915 and parameters: {'learning_rate': 0.0431623170028496, 'max_depth': 10, 'min_child_weight': 56, 'subsample': 0.7215705209975459, 'colsample_bytree': 0.82613490481403, 'reg_alpha': 5.85659399264374, 'reg_lambda': 0.00040930554609664224}. Best is trial 0 with value: 0.5944108725010773.
[I 2025-11-07 01:15:56,550] Trial 2 finished with value: 0.6553578478672111 and parameters: {'learning_rate': 0.026871076779408937, 'max_depth': 4, 'min_child_weight': 24, 'subsample': 0.6724822606624293, 'colsample_bytree': 0.9044371678208152, 'reg_alpha': 0.028294347570643898, 


--- Optimization Finished ---
Best trial for xgboost:
  Value (Best F1 Score): 0.67433
  Best Params:
    learning_rate: 0.025662881614008648
    max_depth: 4
    min_child_weight: 9
    subsample: 0.6929980920622789
    colsample_bytree: 0.7339814058387809
    reg_alpha: 1.0948185462253982e-06
    reg_lambda: 0.004859632844650183

--- Confusion Matrix (Best Trial at Threshold: 0.3200) ---
--------------------------|Predicted Label              |
--------------------------|-----------------------------|
--------------------------| Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 453          | 110          | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 32           | 147          | (FN, TP)
-----------|--------------|--------------|--------------|

--- Feature Importance (Best Model) ---
               Feature  Importance
0      今後のDX展望_弊社はこれまで    0.034642
1            

In [11]:
# 基本パラメータ
cat_params_base = {
    'objective': 'Logloss', # CatBoostでは'Logloss'を指定
        'iterations': 1000,
        'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 不均衡データへの対処
        'random_seed': 42,
        'verbose': 0, # 学習ログを非表示
        'early_stopping_rounds': 50 # 早期停止
    }

# 探索するパラメータを定義する「関数」
def define_cat_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True), # L2正則化
    }

# CatBoostモデルの最適化実行
cat_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="catboost",
    params_base=cat_params_base,
    define_params_func=define_cat_params,
    n_trials=50
)

[I 2025-11-07 01:25:31,757] A new study created in memory with name: no-name-c980e1cb-6800-41d7-a340-beb6065aea09


--- Optimizing catboost ---


[I 2025-11-07 01:25:33,673] Trial 0 finished with value: 0.6006216090069785 and parameters: {'learning_rate': 0.07510414595143826, 'depth': 3, 'min_data_in_leaf': 54, 'subsample': 0.9738012312611327, 'l2_leaf_reg': 1.0506298605246633e-06}. Best is trial 0 with value: 0.6006216090069785.
[I 2025-11-07 01:25:39,044] Trial 1 finished with value: 0.652107707910282 and parameters: {'learning_rate': 0.045753284210934754, 'depth': 7, 'min_data_in_leaf': 8, 'subsample': 0.7659398093297881, 'l2_leaf_reg': 0.08629872711092298}. Best is trial 1 with value: 0.652107707910282.
[I 2025-11-07 01:25:47,178] Trial 2 finished with value: 0.6335251592144566 and parameters: {'learning_rate': 0.06733497695207888, 'depth': 7, 'min_data_in_leaf': 85, 'subsample': 0.810761797987503, 'l2_leaf_reg': 9.845864898810246}. Best is trial 1 with value: 0.652107707910282.
[I 2025-11-07 01:25:49,872] Trial 3 finished with value: 0.5964609195418983 and parameters: {'learning_rate': 0.09885855674287342, 'depth': 6, 'min_


--- Optimization Finished ---
Best trial for catboost:
  Value (Best F1 Score): 0.65967
  Best Params:
    learning_rate: 0.02881943070420616
    depth: 4
    min_data_in_leaf: 25
    subsample: 0.7968419457322264
    l2_leaf_reg: 9.280634745353932

--- Confusion Matrix (Best Trial at Threshold: 0.4900) ---
--------------------------|Predicted Label              |
--------------------------|-----------------------------|
--------------------------| Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 493          | 70           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 57           | 122          | (FN, TP)
-----------|--------------|--------------|--------------|

--- Feature Importance (Best Model) ---
               Feature  Importance
0             営業CFマージン    8.408031
1             無形固定資産変動    6.502320
2                  ROA    4.357155
3              新規投資度合い    3

In [12]:
# アンサンブルモデルの学習
ensemble_result = train_ensemble_models(
    X_train_df=X_train_df,
    y_train_df=y_train_df,
    X_test_df=X_test_df,
    lgb_best_params = lgbm_results['best_params'],
    xgb_best_params = xgb_results['best_params'],
    cat_best_params = cat_results['best_params'],
    sample_submit=sample_submit,
    n_folds=5,
    early_stopping_rounds=50,
    thresholds=np.arange(0.1, 0.5, 0.01),
    random_state=42
)

--- Start Ensemble Training ---
Fold 1/5 started...
Fold 2/5 started...
Fold 3/5 started...
Fold 4/5 started...
Fold 5/5 started...
--- Ensemble Training Finished ---
Optimal Weights - LGB: 0.3333, XGB: 0.3333, CAT: 0.3333

--- Evaluation ---
Ensemble Best F1: 0.66667 (Threshold: 0.35)
LightGBM: 0.67822 (Threshold: 0.39)
XGBoost:  0.67431 (Threshold: 0.32)
CatBoost: 0.65768 (Threshold: 0.49)

--- Fold-wise Stability Evaluation (using global thresholds) ---
  Fold 1 F1 -> Ensemble: 0.68966 | LGB: 0.66667 | XGB: 0.68966 | CAT: 0.69231
  Fold 2 F1 -> Ensemble: 0.62791 | LGB: 0.65000 | XGB: 0.65116 | CAT: 0.56790
  Fold 3 F1 -> Ensemble: 0.68182 | LGB: 0.71795 | XGB: 0.68182 | CAT: 0.66667
  Fold 4 F1 -> Ensemble: 0.67442 | LGB: 0.65854 | XGB: 0.66667 | CAT: 0.68493
  Fold 5 F1 -> Ensemble: 0.65882 | LGB: 0.69880 | XGB: 0.68235 | CAT: 0.68657
Ensemble Fold F1 Mean: 0.66652, Std: 0.02183
