# スタッキング
・GBDT 2~3個：決定木の深さが「浅い」「普通」「深い」モデル

・Random Forest 1~2個：決定木の深さが「浅い」「深い」モデル

・Neural Net 1~2個：層の数が「少ない」「多い」モデル

・Linear 1個

In [16]:
import os
# os.chdir('../../')

In [17]:
import numpy as np
import pandas as pd
from scr.util import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scr.models.gbdt import Model1_CatBoost_1, Model1_CatBoost_2, Model1_CatBoost_3, Model1_XGBoost_1, Model1_XGBoost_2, Model1_XGBoost_3, Model1_LightGBM_1, Model1_LightGBM_2, Model1_LightGBM_3
from scr.models.random_forest import Model1_RandomForest_1, Model1_RandomForest_2, Model1_RandomForest_3
from scr.models.nn import Model1_NN_1, Model1_NN_2, Model1_NN_all_1, Model1_NN_all_2, Model1_TabNet_1, Model1_TabNet_2
from scr.models.linear import Model1_Logistic_1, Model1_Logistic_2, Model1_Logistic_3, Model1_Logistic_4, Model2_Logistic

In [18]:
# Directories holding the under-sampled training CSVs. The "gb"/"nn" suffixes
# presumably denote the tree-model and neural-net feature variants -- confirm
# against the data-preparation step.
path_gb = 'data/sampling/under_sampling/mean_gb'
path_nn = 'data/sampling/under_sampling/mean_nn'


def _list_data_files(directory):
    """Return the regular, non-hidden file names in *directory*, sorted by name."""
    # os.listdir yields entries in arbitrary order and also returns hidden
    # files and sub-directories (e.g. notebook checkpoint folders). The loaded
    # frames are later selected by position, so the order must be
    # deterministic and every entry must be a readable data file.
    return sorted(
        name
        for name in os.listdir(directory)
        if not name.startswith('.') and os.path.isfile(os.path.join(directory, name))
    )


files_gb = _list_data_files(path_gb)
files_nn = _list_data_files(path_nn)

# One DataFrame per file, in the same (sorted) order as the file-name lists.
trains_gb = [pd.read_csv(os.path.join(path_gb, name)) for name in files_gb]
trains_nn = [pd.read_csv(os.path.join(path_nn, name)) for name in files_nn]

In [19]:
# Use the under-sampled training set at position 4 of each list.
train_gb, train_nn = trains_gb[4], trains_nn[4]

# Test features for each variant (file names suggest mean-imputed nulls, and a
# scaled copy for the nn variant -- confirm against the feature-engineering step).
test_gb = pd.read_csv('data/feature_engineered/null_representative/test_null_mean.csv')
test_nn = pd.read_csv('data/feature_engineered/null_representative/test_null_mean_scaled.csv')

# Target column of the gb frame; later cells score every model against it.
y = train_gb['ProdTaken']

# Columns at positions 44..109 of the gb frame; these are the columns that get
# label-encoded further down.
combination_columns = train_gb.columns[slice(44, 110)]

In [20]:
# There are many features, so list the ones to drop.
drop_gb = [
    'EconomicSegment', 'ContractRate_FM',
    'ContractRate_G1', 'ContractRate_G2', 'ContractRate_G3',
    'ContractRate_G4', 'ContractRate_G5', 'ContractRate_G6',
]

## --------------------------------------------------------------------------------------------
## Label Encoding
# mapping_columns_if_exist is a project helper (presumably from scr.util);
# it is applied to the training frame first, then to the test frame.
train_gb, test_gb = (
    mapping_columns_if_exist(frame) for frame in (train_gb, test_gb)
)

def handle_unknown_label(train_series, test_series):
    """Integer-encode a column using only the labels seen in the training data.

    Every distinct value of ``train_series`` receives a code (0, 1, 2, ...) in
    the order returned by ``train_series.unique()``. ``test_series`` is encoded
    with the same table; a value that does not occur in ``train_series`` is
    encoded as ``-1``.

    Args:
        train_series: Series whose distinct values define the encoding.
        test_series: Series to encode with the training table.

    Returns:
        Tuple ``(train_encoded, test_encoded)`` of encoded Series.
    """
    seen_labels = train_series.unique()
    codes = dict(zip(seen_labels, range(len(seen_labels))))

    def encode_or_unknown(value):
        # Labels missing from the training table fall back to the sentinel -1.
        return codes.get(value, -1)

    return train_series.map(codes), test_series.map(encode_or_unknown)

# Encode every combination column with codes learned from the training frame;
# labels that only occur in the test frame come back as -1.
for col in combination_columns:
    encoded_train, encoded_test = handle_unknown_label(train_gb[col], test_gb[col])
    train_gb[col] = encoded_train
    test_gb[col] = encoded_test
## --------------------------------------------------------------------------------------------

# X_gb still contains 'ProdTaken' at this point: predict_cv reads it to build
# the per-fold target encoding and drops it before fitting.
y_gb = train_gb['ProdTaken']
X_gb = train_gb.drop(columns=drop_gb)

# Restrict the test frame to the training feature columns (target excluded),
# in the same order.
test_feature = X_gb.columns.drop('ProdTaken')
test_gb = test_gb[test_feature]

# Target encoding for the test frame: mean of 'ProdTaken' per
# (AgeGroup, ProductPitched) group, computed on the whole training frame.
group_keys = ['AgeGroup', 'ProductPitched']
tmp = (
    X_gb.groupby(by=group_keys, as_index=False)['ProdTaken']
    .mean()
    .rename(columns={'ProdTaken': 'ContractRate_G4'})
)
test_gb = test_gb.merge(tmp, on=group_keys, how='left')

In [21]:
# Columns removed from the nn feature set (same names as the gb drop list).
drop_nn = [
    'EconomicSegment', 'ContractRate_FM',
    'ContractRate_G1', 'ContractRate_G2', 'ContractRate_G3',
    'ContractRate_G4', 'ContractRate_G5', 'ContractRate_G6',
]

# Target first, then the feature matrix without the dropped columns and the target.
y_nn = train_nn['ProdTaken']
X_nn = train_nn.drop(columns=drop_nn + ['ProdTaken'])

test_nn = test_nn.drop(columns=drop_nn)

# Feature count passed to the NN / TabNet constructors; it is taken from the
# test frame, so this assumes test_nn and X_nn have the same columns.
num_features_nn = test_nn.shape[1]

In [22]:
# Stacking
def predict_cv(model, X, y, df_test, *, n_splits=5, random_state=0):
    """Return out-of-fold and fold-averaged test predictions for ``model``.

    The rows of ``X`` are split with a shuffled ``StratifiedKFold``. On every
    fold ``model.fit`` is called on the training part, and the fitted model
    then predicts both the held-out part and ``df_test``.

    Args:
        model: Object exposing ``fit(tr_x, tr_y, va_x, va_y)`` and
            ``predict(x)``. The same instance is fitted once per fold.
        X: Training features (DataFrame, indexed positionally). When it has an
            ``'AgeGroup'`` column it must also contain ``'ProductPitched'`` and
            the target column ``'ProdTaken'``: a per-fold ``'ContractRate_G4'``
            target encoding is built from them and ``'ProdTaken'`` is dropped
            before fitting.
        y: Target values (Series) aligned positionally with ``X``.
        df_test: Test features, passed unchanged to ``model.predict``.
        n_splits: Number of cross-validation folds.
        random_state: Seed used for the fold shuffle.

    Returns:
        Tuple ``(pred_train, preds_test)``: ``pred_train`` holds the validation
        predictions re-ordered to the row order of ``X``; ``preds_test`` is the
        mean over folds of the test predictions.
    """
    preds = []
    preds_test = []
    va_idxes = []

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for tr_idx, va_idx in skf.split(X, y):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

        # Target encoding. The presence of 'AgeGroup' is used as the signal
        # that X still carries the raw target column 'ProdTaken'.
        if 'AgeGroup' in tr_x.columns:
            # Group means are computed from the training part of the fold only,
            # so validation rows never see their own target. Groups absent
            # from the training part yield NaN in va_x after the left merge.
            tmp = tr_x.groupby(by=['AgeGroup', 'ProductPitched'], as_index=False)['ProdTaken'].mean()
            tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G4'})
            # NOTE(review): merge returns frames with a fresh 0..n-1 index, so
            # tr_x / va_x no longer share index labels with tr_y / va_y. Row
            # order is preserved; confirm the model wrappers do not align on index.
            tr_x = tr_x.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')
            va_x = va_x.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')

            # The target must not be fed to the model as a feature.
            tr_x = tr_x.drop(labels='ProdTaken', axis=1)
            va_x = va_x.drop(labels='ProdTaken', axis=1)

        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(df_test)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

    # Concatenate the validation predictions, then restore the original row order.
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # Average the test predictions over the folds.
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [23]:
# First-layer (base) models.
# Models in this list are trained on the gb feature set (X_gb / test_gb) by the
# training loop below. Commented-out entries are alternatives that are
# currently disabled.
models_gbdt = [
    Model1_CatBoost_1(),
    Model1_CatBoost_2(),
    Model1_CatBoost_3(),
    #Model1_XGBoost_1(),
    #Model1_XGBoost_2(),
    #Model1_XGBoost_3(),
    #Model1_LightGBM_1(),
    #Model1_LightGBM_2(),
    #Model1_LightGBM_3()
]

# Despite the name, this list also holds the random-forest and logistic
# models: everything here is trained on the nn feature set (X_nn / test_nn).
# The NN and TabNet wrappers are given the feature count at construction time.
models_nn = [
    Model1_RandomForest_1(),
    #Model1_RandomForest_2(),
    #Model1_RandomForest_3(),
    Model1_NN_1(input_shape=num_features_nn),
    Model1_NN_2(input_shape=num_features_nn),
    #Model1_NN_all_1(input_shape=num_features_nn),
    #Model1_NN_all_2(input_shape=num_features_nn),
    Model1_TabNet_1(input_dim=num_features_nn),
    Model1_TabNet_2(input_dim=num_features_nn),
    Model1_Logistic_1(),
    #Model1_Logistic_2(),
    #Model1_Logistic_3(),
    #Model1_Logistic_4()
]



In [24]:
# Out-of-fold and test predictions of every first-layer model, in training order
# (gb models first, then the models that use the nn feature set).
pred_train_list, pred_test_list = [], []

first_layer_runs = (
    (models_gbdt, X_gb, y_gb, test_gb),
    (models_nn, X_nn, y_nn, test_nn),
)
for model_group, features, target, test_features in first_layer_runs:
    for model in model_group:
        oof_pred, test_pred = predict_cv(model, features, target, test_features)
        pred_train_list.append(oof_pred)
        pred_test_list.append(test_pred)

0:	test: 0.7792017	best: 0.7792017 (0)	total: 2.72ms	remaining: 13.6s
200:	test: 0.8348739	best: 0.8352941 (158)	total: 388ms	remaining: 9.27s
400:	test: 0.8406723	best: 0.8409244 (398)	total: 732ms	remaining: 8.39s
600:	test: 0.8431092	best: 0.8438655 (579)	total: 1.1s	remaining: 8.06s
800:	test: 0.8435294	best: 0.8442017 (770)	total: 1.46s	remaining: 7.65s
1000:	test: 0.8436975	best: 0.8444538 (905)	total: 1.82s	remaining: 7.26s
1200:	test: 0.8415966	best: 0.8444538 (905)	total: 2.18s	remaining: 6.9s
1400:	test: 0.8399160	best: 0.8444538 (905)	total: 2.56s	remaining: 6.58s
1600:	test: 0.8384874	best: 0.8444538 (905)	total: 2.93s	remaining: 6.22s
1800:	test: 0.8367227	best: 0.8444538 (905)	total: 3.3s	remaining: 5.86s
2000:	test: 0.8367227	best: 0.8444538 (905)	total: 3.65s	remaining: 5.48s
2200:	test: 0.8350420	best: 0.8444538 (905)	total: 4.01s	remaining: 5.1s
2400:	test: 0.8342857	best: 0.8444538 (905)	total: 4.37s	remaining: 4.73s
2600:	test: 0.8344538	best: 0.8444538 (905)	total:



epoch 0  | loss: 1.06009 | val_0_auc: 0.57588 |  0:00:00s
epoch 1  | loss: 0.96528 | val_0_auc: 0.52244 |  0:00:00s
epoch 2  | loss: 0.92067 | val_0_auc: 0.49235 |  0:00:01s
epoch 3  | loss: 0.80547 | val_0_auc: 0.47252 |  0:00:01s
epoch 4  | loss: 0.81403 | val_0_auc: 0.48916 |  0:00:01s
epoch 5  | loss: 0.76075 | val_0_auc: 0.46697 |  0:00:01s
epoch 6  | loss: 0.76629 | val_0_auc: 0.45437 |  0:00:02s
epoch 7  | loss: 0.73143 | val_0_auc: 0.44042 |  0:00:02s
epoch 8  | loss: 0.73155 | val_0_auc: 0.45202 |  0:00:02s
epoch 9  | loss: 0.71703 | val_0_auc: 0.45899 |  0:00:03s
epoch 10 | loss: 0.71565 | val_0_auc: 0.47361 |  0:00:03s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_auc = 0.57588




epoch 0  | loss: 0.98305 | val_0_auc: 0.58897 |  0:00:00s
epoch 1  | loss: 0.92955 | val_0_auc: 0.57222 |  0:00:00s
epoch 2  | loss: 0.8048  | val_0_auc: 0.58923 |  0:00:00s
epoch 3  | loss: 0.86107 | val_0_auc: 0.62525 |  0:00:01s
epoch 4  | loss: 0.79538 | val_0_auc: 0.62298 |  0:00:01s
epoch 5  | loss: 0.76669 | val_0_auc: 0.55539 |  0:00:01s
epoch 6  | loss: 0.73415 | val_0_auc: 0.44327 |  0:00:02s
epoch 7  | loss: 0.72403 | val_0_auc: 0.43973 |  0:00:02s
epoch 8  | loss: 0.70196 | val_0_auc: 0.48047 |  0:00:02s
epoch 9  | loss: 0.67531 | val_0_auc: 0.5021  |  0:00:03s
epoch 10 | loss: 0.70639 | val_0_auc: 0.51633 |  0:00:03s
epoch 11 | loss: 0.68389 | val_0_auc: 0.56221 |  0:00:03s
epoch 12 | loss: 0.66002 | val_0_auc: 0.62551 |  0:00:04s
epoch 13 | loss: 0.65716 | val_0_auc: 0.68527 |  0:00:04s
epoch 14 | loss: 0.61209 | val_0_auc: 0.71751 |  0:00:04s
epoch 15 | loss: 0.62491 | val_0_auc: 0.73594 |  0:00:04s
epoch 16 | loss: 0.60008 | val_0_auc: 0.74714 |  0:00:05s
epoch 17 | los



epoch 0  | loss: 1.06872 | val_0_auc: 0.56532 |  0:00:00s
epoch 1  | loss: 0.86155 | val_0_auc: 0.47104 |  0:00:00s
epoch 2  | loss: 0.86396 | val_0_auc: 0.39646 |  0:00:00s
epoch 3  | loss: 0.80758 | val_0_auc: 0.43098 |  0:00:01s
epoch 4  | loss: 0.8297  | val_0_auc: 0.41448 |  0:00:01s
epoch 5  | loss: 0.75915 | val_0_auc: 0.42551 |  0:00:01s
epoch 6  | loss: 0.71951 | val_0_auc: 0.4649  |  0:00:02s
epoch 7  | loss: 0.72091 | val_0_auc: 0.50109 |  0:00:02s
epoch 8  | loss: 0.70079 | val_0_auc: 0.50152 |  0:00:02s
epoch 9  | loss: 0.70852 | val_0_auc: 0.4771  |  0:00:03s
epoch 10 | loss: 0.69765 | val_0_auc: 0.46481 |  0:00:03s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_auc = 0.56532




epoch 0  | loss: 1.12375 | val_0_auc: 0.62399 |  0:00:00s
epoch 1  | loss: 0.90457 | val_0_auc: 0.67584 |  0:00:00s
epoch 2  | loss: 0.85064 | val_0_auc: 0.6649  |  0:00:00s
epoch 3  | loss: 0.84215 | val_0_auc: 0.69419 |  0:00:01s
epoch 4  | loss: 0.77204 | val_0_auc: 0.71633 |  0:00:01s
epoch 5  | loss: 0.72607 | val_0_auc: 0.73085 |  0:00:01s
epoch 6  | loss: 0.79374 | val_0_auc: 0.68796 |  0:00:02s
epoch 7  | loss: 0.69727 | val_0_auc: 0.56557 |  0:00:02s
epoch 8  | loss: 0.72347 | val_0_auc: 0.49125 |  0:00:02s
epoch 9  | loss: 0.6887  | val_0_auc: 0.51524 |  0:00:03s
epoch 10 | loss: 0.70925 | val_0_auc: 0.61671 |  0:00:03s
epoch 11 | loss: 0.6813  | val_0_auc: 0.64537 |  0:00:03s
epoch 12 | loss: 0.66985 | val_0_auc: 0.7178  |  0:00:04s
epoch 13 | loss: 0.66776 | val_0_auc: 0.75859 |  0:00:04s
epoch 14 | loss: 0.6626  | val_0_auc: 0.76591 |  0:00:04s
epoch 15 | loss: 0.63234 | val_0_auc: 0.76052 |  0:00:04s
epoch 16 | loss: 0.6358  | val_0_auc: 0.76818 |  0:00:05s
epoch 17 | los



epoch 0  | loss: 1.451   | val_0_auc: 0.53782 |  0:00:00s
epoch 1  | loss: 1.07119 | val_0_auc: 0.49908 |  0:00:01s
epoch 2  | loss: 1.08662 | val_0_auc: 0.51622 |  0:00:01s
epoch 3  | loss: 1.08576 | val_0_auc: 0.55261 |  0:00:02s
epoch 4  | loss: 0.95112 | val_0_auc: 0.60571 |  0:00:02s
epoch 5  | loss: 1.00103 | val_0_auc: 0.61462 |  0:00:03s
epoch 6  | loss: 0.94682 | val_0_auc: 0.63143 |  0:00:03s
epoch 7  | loss: 0.96327 | val_0_auc: 0.65866 |  0:00:04s
epoch 8  | loss: 0.90557 | val_0_auc: 0.67059 |  0:00:04s
epoch 9  | loss: 0.85999 | val_0_auc: 0.67202 |  0:00:05s
epoch 10 | loss: 0.84285 | val_0_auc: 0.69958 |  0:00:05s
epoch 11 | loss: 0.81236 | val_0_auc: 0.70681 |  0:00:06s
epoch 12 | loss: 0.8142  | val_0_auc: 0.71328 |  0:00:07s
epoch 13 | loss: 0.79286 | val_0_auc: 0.7195  |  0:00:07s
epoch 14 | loss: 0.75688 | val_0_auc: 0.73294 |  0:00:08s
epoch 15 | loss: 0.74329 | val_0_auc: 0.73756 |  0:00:08s
epoch 16 | loss: 0.79078 | val_0_auc: 0.73134 |  0:00:09s
epoch 17 | los



epoch 0  | loss: 1.50363 | val_0_auc: 0.57277 |  0:00:00s
epoch 1  | loss: 1.08671 | val_0_auc: 0.54353 |  0:00:01s
epoch 2  | loss: 1.13354 | val_0_auc: 0.53924 |  0:00:01s
epoch 3  | loss: 1.11435 | val_0_auc: 0.52966 |  0:00:02s
epoch 4  | loss: 1.01055 | val_0_auc: 0.53269 |  0:00:02s
epoch 5  | loss: 0.95994 | val_0_auc: 0.45992 |  0:00:03s
epoch 6  | loss: 0.86227 | val_0_auc: 0.41311 |  0:00:03s
epoch 7  | loss: 0.90281 | val_0_auc: 0.39966 |  0:00:04s
epoch 8  | loss: 0.89785 | val_0_auc: 0.39513 |  0:00:04s
epoch 9  | loss: 0.81654 | val_0_auc: 0.43168 |  0:00:05s
epoch 10 | loss: 0.80219 | val_0_auc: 0.46328 |  0:00:06s
epoch 11 | loss: 0.79987 | val_0_auc: 0.47916 |  0:00:06s
epoch 12 | loss: 0.77328 | val_0_auc: 0.53176 |  0:00:07s
epoch 13 | loss: 0.78644 | val_0_auc: 0.55714 |  0:00:07s
epoch 14 | loss: 0.7264  | val_0_auc: 0.57622 |  0:00:08s
epoch 15 | loss: 0.74168 | val_0_auc: 0.59672 |  0:00:08s
epoch 16 | loss: 0.72164 | val_0_auc: 0.60479 |  0:00:09s
epoch 17 | los



epoch 0  | loss: 1.46773 | val_0_auc: 0.52054 |  0:00:00s
epoch 1  | loss: 1.01228 | val_0_auc: 0.50598 |  0:00:01s
epoch 2  | loss: 1.1155  | val_0_auc: 0.42593 |  0:00:01s
epoch 3  | loss: 1.20151 | val_0_auc: 0.51785 |  0:00:02s
epoch 4  | loss: 1.19623 | val_0_auc: 0.49537 |  0:00:02s
epoch 5  | loss: 1.04965 | val_0_auc: 0.57635 |  0:00:03s
epoch 6  | loss: 0.90242 | val_0_auc: 0.6     |  0:00:03s
epoch 7  | loss: 0.9861  | val_0_auc: 0.59276 |  0:00:04s
epoch 8  | loss: 0.95816 | val_0_auc: 0.55715 |  0:00:04s
epoch 9  | loss: 0.96055 | val_0_auc: 0.58855 |  0:00:05s
epoch 10 | loss: 0.86001 | val_0_auc: 0.61288 |  0:00:05s
epoch 11 | loss: 0.84892 | val_0_auc: 0.62433 |  0:00:06s
epoch 12 | loss: 0.82859 | val_0_auc: 0.62761 |  0:00:07s
epoch 13 | loss: 0.82606 | val_0_auc: 0.64209 |  0:00:07s
epoch 14 | loss: 0.74953 | val_0_auc: 0.65354 |  0:00:08s
epoch 15 | loss: 0.77677 | val_0_auc: 0.66785 |  0:00:08s
epoch 16 | loss: 0.74168 | val_0_auc: 0.67921 |  0:00:09s
epoch 17 | los



epoch 0  | loss: 1.49961 | val_0_auc: 0.42433 |  0:00:00s
epoch 1  | loss: 1.06774 | val_0_auc: 0.42348 |  0:00:01s
epoch 2  | loss: 1.13366 | val_0_auc: 0.42727 |  0:00:01s
epoch 3  | loss: 1.16391 | val_0_auc: 0.45892 |  0:00:02s
epoch 4  | loss: 1.11127 | val_0_auc: 0.45118 |  0:00:02s
epoch 5  | loss: 0.94758 | val_0_auc: 0.40227 |  0:00:03s
epoch 6  | loss: 0.91386 | val_0_auc: 0.49592 |  0:00:04s
epoch 7  | loss: 0.88935 | val_0_auc: 0.52247 |  0:00:04s
epoch 8  | loss: 0.96433 | val_0_auc: 0.52786 |  0:00:05s
epoch 9  | loss: 0.90879 | val_0_auc: 0.55185 |  0:00:05s
epoch 10 | loss: 0.84095 | val_0_auc: 0.56616 |  0:00:06s
epoch 11 | loss: 0.81074 | val_0_auc: 0.58316 |  0:00:07s
epoch 12 | loss: 0.80185 | val_0_auc: 0.61633 |  0:00:07s
epoch 13 | loss: 0.77312 | val_0_auc: 0.63645 |  0:00:08s
epoch 14 | loss: 0.73988 | val_0_auc: 0.68561 |  0:00:08s
epoch 15 | loss: 0.72868 | val_0_auc: 0.69116 |  0:00:09s
epoch 16 | loss: 0.73789 | val_0_auc: 0.7133  |  0:00:10s
epoch 17 | los



epoch 0  | loss: 1.53062 | val_0_auc: 0.52365 |  0:00:00s
epoch 1  | loss: 1.06227 | val_0_auc: 0.46431 |  0:00:01s
epoch 2  | loss: 1.16528 | val_0_auc: 0.41734 |  0:00:01s
epoch 3  | loss: 1.06904 | val_0_auc: 0.41229 |  0:00:02s
epoch 4  | loss: 1.03001 | val_0_auc: 0.46751 |  0:00:02s
epoch 5  | loss: 0.84605 | val_0_auc: 0.48788 |  0:00:03s
epoch 6  | loss: 0.93923 | val_0_auc: 0.50059 |  0:00:03s
epoch 7  | loss: 0.95077 | val_0_auc: 0.50816 |  0:00:04s
epoch 8  | loss: 0.83047 | val_0_auc: 0.51667 |  0:00:04s
epoch 9  | loss: 0.80489 | val_0_auc: 0.51557 |  0:00:05s
epoch 10 | loss: 0.80154 | val_0_auc: 0.52517 |  0:00:05s
epoch 11 | loss: 0.82751 | val_0_auc: 0.56069 |  0:00:06s
epoch 12 | loss: 0.83141 | val_0_auc: 0.58283 |  0:00:06s
epoch 13 | loss: 0.84503 | val_0_auc: 0.59411 |  0:00:07s
epoch 14 | loss: 0.76878 | val_0_auc: 0.60152 |  0:00:08s
epoch 15 | loss: 0.77239 | val_0_auc: 0.60354 |  0:00:08s
epoch 16 | loss: 0.72462 | val_0_auc: 0.61793 |  0:00:09s
epoch 17 | los



In [25]:
# Evaluate the first-layer models: AUC of each model's out-of-fold predictions.
for model_no, oof_pred in enumerate(pred_train_list, start=1):
    print(f'AUC for model {model_no}: {roc_auc_score(y, oof_pred)}')

AUC for model 1: 0.8152544026702019
AUC for model 2: 0.8347913568366723
AUC for model 3: 0.8317126841315451
AUC for model 4: 0.8217196153509687
AUC for model 5: 0.8203199127877634
AUC for model 6: 0.5428860790159015
AUC for model 7: 0.6303691715510453
AUC for model 8: 0.7726391795589592
AUC for model 9: 0.7868145326810361


In [26]:
# Column names for the stacked features: pred_1 ... pred_N, one per first-layer model.
column_names = [f'pred_{n}' for n in range(1, len(pred_train_list) + 1)]

# Build the second-layer feature frames from the first-layer predictions.
train_x_2 = pd.DataFrame(
    {f'pred_{n}': oof_pred for n, oof_pred in enumerate(pred_train_list, start=1)},
    columns=column_names,
)

test_x_2 = pd.DataFrame(
    {f'pred_{n}': test_pred for n, test_pred in enumerate(pred_test_list, start=1)},
    columns=column_names,
)

In [27]:
# Second-layer model.
# pred_train_2: cross-validated predictions of the second-layer model on the training data
# pred_test_2: second-layer predictions for the test data
model_2 = Model2_Logistic()
second_layer_out = predict_cv(model_2, train_x_2, y, test_x_2)
pred_train_2, pred_test_2 = second_layer_out

auc_2 = roc_auc_score(y, pred_train_2)
print(f'AUC: {auc_2}')

AUC: 0.832836483785657


In [28]:
# Row ids come from the raw test file; predictions from the second-layer model.
index = pd.read_csv('data/test.csv')['id'].values

df_submit = pd.DataFrame({"id": index})
df_submit["prediction"] = pred_test_2

In [29]:
# Destination of the submission CSV (relative to the working directory).
path = 'submission/submit_30_5.csv'

In [30]:
# Create the output directory if needed so the write cannot fail at the very
# end of the training run.
out_dir = os.path.dirname(path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write the rows without index and without a header line; header=False is the
# documented way to suppress the header.
df_submit.to_csv(path, index=False, header=False)