In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib_fontja
os.chdir('../')
from src.feature import *
from src.model import *

  from .autonotebook import tqdm as notebook_tqdm


データ準備

In [2]:
# データの読み込み
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submit = pd.read_csv("data/sample_submit.csv", header=None)

In [3]:
# Xとyの分割
target_col = '購入フラグ'
X_train_df = train.drop(columns=[target_col])
y_train_df = train[target_col]
X_test_df = test.copy()

特徴量エンジニアリング

In [None]:
# 学習・テストデータの結合
X_train_df.index = "train_" + X_train_df.index.astype(str)
X_test_df.index = "test_" + X_test_df.index.astype(str)
X_df = pd.concat([X_train_df, X_test_df], axis=0)

# 不要な列の削除
drop_cols = ['企業ID', '企業名']
X_df = X_df.drop(columns=drop_cols)

# 指標の組み合わせによる新規指標の作成
X_df = feature_engineering(X_df)

# カテゴリ変数を数値にエンコード
categorical_cols = ['業界', '上場種別', '特徴']
X_df = onehot_encode_categorical(X_df, categorical_cols)

# ２択（はい/いいえ）変数のエンコード
binary_cols = ['アンケート６']
X_df = encode_binary(X_df, binary_cols, true=1)

# 組織図の特徴量化
org_df = org_chart_features(X_df['組織図'])
X_df = pd.concat([X_df, org_df], axis=1)
X_df = X_df.drop(columns=['組織図'])

# 今後のDX展望テキストデータの特徴量化（文章量, ポジネガ出現率, TF-IDF）
length_df = text_length_features(X_df['今後のDX展望'])
posneg_df = pos_neg_ratio(X_df['今後のDX展望'],
                          positive_words = [
                                        '積極', '強化', '推進', '加速', '導入', '構築', '注力',
                                        '投資', '拡大', '新た', '創出',
                                        '最適化', '効率化', '自動化', '活用', '高度化',
                                        '図る', '目指す'
                                        ],
                           negative_words = [
                                        '慎重', '限定', '停滞', '見直',
                                        '課題', '懸念', '困難', '不足', '負担', 'コスト', '難し',
                                        '検討', '未定', '具体的ではない' 
                                        ],)
X_df = pd.concat([X_df, length_df, posneg_df], axis=1)
X_df = X_df.drop(columns=['今後のDX展望'])

# 企業概要テキストデータの特徴量化（文章量, ポジネガ出現率, TF-IDF）
tfidf_df = tfidf_vectorization(X_df['企業概要'], max_features=30, ngram_range=(1, 2))
X_df = pd.concat([X_df, tfidf_df], axis=1)
X_df = X_df.drop(columns=['企業概要'])

# 学習・テストデータの再分割
X_train_df = X_df.loc[X_train_df.index].reset_index(drop=True)
X_test_df = X_df.loc[X_test_df.index].reset_index(drop=True)

予測モデルのパラメータ最適化

In [5]:
# 基本パラメータ
lgbm_params_base = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 1000,
    'is_unbalance': True,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
    }

# 探索するパラメータを定義する「関数」
def define_lgbm_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# LightGBMモデルの最適化実行
lgbm_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="lightgbm",
    params_base=lgbm_params_base,
    define_params_func=define_lgbm_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-07 10:56:36,903] A new study created in memory with name: no-name-7da0f0c6-7c87-4553-91d0-19e781a732e3


--- Optimizing lightgbm ---


[I 2025-11-07 10:56:48,529] Trial 0 finished with value: 0.6781081402257872 and parameters: {'learning_rate': 0.02112582580981376, 'num_leaves': 57, 'max_depth': 10, 'min_child_samples': 73, 'subsample': 0.7002761939487059, 'colsample_bytree': 0.8530478288998371, 'reg_alpha': 1.9488608732986705, 'reg_lambda': 0.0023816240614222416}. Best is trial 0 with value: 0.6781081402257872.
[I 2025-11-07 10:56:53,516] Trial 1 finished with value: 0.6794964710353819 and parameters: {'learning_rate': 0.05562910077550609, 'num_leaves': 68, 'max_depth': 8, 'min_child_samples': 95, 'subsample': 0.8400165587157723, 'colsample_bytree': 0.6025226673189207, 'reg_alpha': 1.9280502556900796e-07, 'reg_lambda': 0.03557514533686008}. Best is trial 1 with value: 0.6794964710353819.
[I 2025-11-07 10:56:57,977] Trial 2 finished with value: 0.6918413876029529 and parameters: {'learning_rate': 0.06637562240792874, 'num_leaves': 61, 'max_depth': 4, 'min_child_samples': 66, 'subsample': 0.9671801129576039, 'colsample


--- Optimization Finished ---
Best trial for lightgbm:
  Value (Best F1 Score): 0.69184
  Best Params:
    learning_rate: 0.06637562240792874
    num_leaves: 61
    max_depth: 4
    min_child_samples: 66
    subsample: 0.9671801129576039
    colsample_bytree: 0.928748129222844
    reg_alpha: 0.40219249454535705
    reg_lambda: 0.0008395055150329392

--- Confusion Matrix (Best Trial at Threshold: 0.3300) ---
--------------------------|Predicted Label              |
--------------------------|-----------------------------|
--------------------------| Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 473          | 90           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 37           | 142          | (FN, TP)
-----------|--------------|--------------|--------------|

--- Feature Importance (Best Model) ---
               Feature  Importance
0               アンケート７        

In [6]:
# 基本パラメータ
xgb_params_base = {
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',       
    'n_estimators': 1000,
    'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 'is_unbalance' の代わり
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0,                 # 'verbose': -1 -> 'verbosity': 0
    'booster': 'gbtree'             # デフォルトだが明記
    }

# 探索するパラメータを定義する「関数」
def define_xgb_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

# XGBoostモデルの最適化実行
xgb_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="xgboost",
    params_base=xgb_params_base,
    define_params_func=define_xgb_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-07 11:03:05,910] A new study created in memory with name: no-name-d2cfd8d1-6102-44ab-a210-d0df0ea993b3


--- Optimizing xgboost ---


[I 2025-11-07 11:03:18,959] Trial 0 finished with value: 0.6545130106580561 and parameters: {'learning_rate': 0.025781239748838204, 'max_depth': 5, 'min_child_weight': 20, 'subsample': 0.8186766048022247, 'colsample_bytree': 0.9293693716452847, 'reg_alpha': 0.0023908132918120475, 'reg_lambda': 6.090390166652093e-06}. Best is trial 0 with value: 0.6545130106580561.
[I 2025-11-07 11:03:24,595] Trial 1 finished with value: 0.5552708859511771 and parameters: {'learning_rate': 0.0409306445867105, 'max_depth': 12, 'min_child_weight': 81, 'subsample': 0.7971454496757495, 'colsample_bytree': 0.6381665156256155, 'reg_alpha': 0.001559743074511445, 'reg_lambda': 1.699070213712978e-06}. Best is trial 0 with value: 0.6545130106580561.
[I 2025-11-07 11:03:37,771] Trial 2 finished with value: 0.6198309371547197 and parameters: {'learning_rate': 0.014753115997207402, 'max_depth': 8, 'min_child_weight': 44, 'subsample': 0.6837945339401791, 'colsample_bytree': 0.7339252983387689, 'reg_alpha': 1.15884707


--- Optimization Finished ---
Best trial for xgboost:
  Value (Best F1 Score): 0.67562
  Best Params:
    learning_rate: 0.014828202198489439
    max_depth: 3
    min_child_weight: 11
    subsample: 0.9659295775512261
    colsample_bytree: 0.6170699899154761
    reg_alpha: 1.505242824264109
    reg_lambda: 2.39015744229071e-08

--- Confusion Matrix (Best Trial at Threshold: 0.4500) ---
--------------------------|Predicted Label              |
--------------------------|-----------------------------|
--------------------------| Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 489          | 74           | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 50           | 129          | (FN, TP)
-----------|--------------|--------------|--------------|

--- Feature Importance (Best Model) ---
               Feature  Importance
0    今後のDX展望_neg_ratio    0.056388
1                

In [7]:
# 基本パラメータ
cat_params_base = {
    'objective': 'Logloss', # CatBoostでは'Logloss'を指定
        'iterations': 1000,
        'scale_pos_weight': (y_train_df == 0).sum() / (y_train_df == 1).sum(), # 不均衡データへの対処
        'random_seed': 42,
        'verbose': 0, # 学習ログを非表示
        'early_stopping_rounds': 10 # 早期停止
    }

# 探索するパラメータを定義する「関数」
def define_cat_params(trial):
    return {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True), # L2正則化
    }

# CatBoostモデルの最適化実行
cat_results = optimize_model(
    X_train_df,
    y_train_df,
    model_name="catboost",
    params_base=cat_params_base,
    define_params_func=define_cat_params,
    n_trials=50,
    early_stopping_rounds=10
)

[I 2025-11-07 11:16:49,908] A new study created in memory with name: no-name-e6f730ee-75a3-4977-a686-222b9565ca9a


--- Optimizing catboost ---


[I 2025-11-07 11:17:12,337] Trial 0 finished with value: 0.5860713616882125 and parameters: {'learning_rate': 0.0196265489452726, 'depth': 10, 'min_data_in_leaf': 60, 'subsample': 0.8092083307604437, 'l2_leaf_reg': 1.0197823760628435e-06}. Best is trial 0 with value: 0.5860713616882125.
[I 2025-11-07 11:17:18,745] Trial 1 finished with value: 0.5805551115551115 and parameters: {'learning_rate': 0.02179449537103032, 'depth': 8, 'min_data_in_leaf': 12, 'subsample': 0.8821916246018775, 'l2_leaf_reg': 4.2835109794676517e-05}. Best is trial 0 with value: 0.5860713616882125.
[I 2025-11-07 11:17:22,575] Trial 2 finished with value: 0.5848136504845365 and parameters: {'learning_rate': 0.07609361005268273, 'depth': 6, 'min_data_in_leaf': 42, 'subsample': 0.7285045150377365, 'l2_leaf_reg': 0.005303778736869338}. Best is trial 0 with value: 0.5860713616882125.
[I 2025-11-07 11:17:29,758] Trial 3 finished with value: 0.6476758724508535 and parameters: {'learning_rate': 0.012535544807467212, 'depth


--- Optimization Finished ---
Best trial for catboost:
  Value (Best F1 Score): 0.67916
  Best Params:
    learning_rate: 0.09806469439177379
    depth: 5
    min_data_in_leaf: 71
    subsample: 0.9932284512000729
    l2_leaf_reg: 1.4510326429900609

--- Confusion Matrix (Best Trial at Threshold: 0.3900) ---
--------------------------|Predicted Label              |
--------------------------|-----------------------------|
--------------------------| Negative (0) | Positive (1) |
-----------|--------------|--------------|--------------|
True Label | Negative (0) | 455          | 108          | (TN, FP)
           |--------------|--------------|--------------|
           | Positive (1) | 32           | 147          | (FN, TP)
-----------|--------------|--------------|--------------|

--- Feature Importance (Best Model) ---
               Feature  Importance
0    今後のDX展望_neg_ratio    8.725564
1             営業CFマージン    5.501669
2             無形固定資産変動    4.896917
3                 流動比率    

In [8]:
# アンサンブルモデルの学習
ensemble_result = train_ensemble_models(
    X_train_df=X_train_df,
    y_train_df=y_train_df,
    X_test_df=X_test_df,
    lgb_best_params = lgbm_results['best_params'],
    xgb_best_params = xgb_results['best_params'],
    cat_best_params = cat_results['best_params'],
    sample_submit=sample_submit,
    n_folds=5,
    early_stopping_rounds=10,
    thresholds=np.arange(0.1, 0.5, 0.01),
    random_state=42
)

--- Start Ensemble Training ---
Fold 1/5 started...
Fold 2/5 started...
Fold 3/5 started...
Fold 4/5 started...
Fold 5/5 started...
--- Ensemble Training Finished ---
Optimal Weights - LGB: 0.3333, XGB: 0.3333, CAT: 0.3333

--- Evaluation ---
Ensemble Best F1: 0.67482 (Threshold: 0.40)
LightGBM: 0.68281 (Threshold: 0.36)
XGBoost:  0.67539 (Threshold: 0.45)
CatBoost: 0.67742 (Threshold: 0.39)

--- Fold-wise Stability Evaluation (using global thresholds) ---
  Fold 1 F1 -> Ensemble: 0.72500 | LGB: 0.68354 | XGB: 0.66667 | CAT: 0.74419
  Fold 2 F1 -> Ensemble: 0.62791 | LGB: 0.67442 | XGB: 0.66667 | CAT: 0.63158
  Fold 3 F1 -> Ensemble: 0.70130 | LGB: 0.73418 | XGB: 0.69444 | CAT: 0.72500
  Fold 4 F1 -> Ensemble: 0.65116 | LGB: 0.66667 | XGB: 0.67500 | CAT: 0.64444
  Fold 5 F1 -> Ensemble: 0.67500 | LGB: 0.65854 | XGB: 0.67532 | CAT: 0.65060
Ensemble Fold F1 Mean: 0.67607, Std: 0.03456
