# スタッキング
・GBDT 2~3個：決定木の深さが「浅い」「普通」「深い」モデル

・Random Forest 1~2個：決定木の深さが「浅い」「深い」モデル

・Neural Net 1~2個：層の数が「少ない」「多い」モデル

・Linier 1個

In [1]:
import os
os.chdir('../../')

In [2]:
import numpy as np
import pandas as pd
from scr.util import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scr.models.gbdt import Model1_CatBoost_1, Model1_CatBoost_2, Model1_CatBoost_3, Model1_XGBoost_1, Model1_XGBoost_2, Model1_XGBoost_3, Model1_LightGBM_1, Model1_LightGBM_2, Model1_LightGBM_3
from scr.models.random_forest import Model1_RandomForest_1, Model1_RandomForest_2, Model1_RandomForest_3
from scr.models.nn import Model1_NN_1, Model1_NN_2, Model1_NN_all_1, Model1_NN_all_2, Model1_TabNet_1, Model1_TabNet_2
from scr.models.linear import Model1_Logistic_1, Model1_Logistic_2, Model1_Logistic_3, Model1_Logistic_4, Model2_Logistic

In [3]:
df_train_gb = pd.read_csv('data/sampling/over_sampling/smote/train_mean_gb_smote.csv')
df_test_gb = pd.read_csv('data/sampling/over_sampling/smote/test_mean_gb.csv')
combination_gb= df_train_gb.columns[44:110]

df_train_nn = pd.read_csv('data/sampling/over_sampling/smote/train_mean_nn_smote.csv')
df_test_nn = pd.read_csv('data/feature_engineered/null_representative/test_null_mean_scaled.csv')

y = df_train_gb['ProdTaken']

In [4]:
missing_values = df_test_gb.isnull().sum()
print(missing_values[missing_values > 0])

ContractRate_G1    3
ContractRate_G2    8
ContractRate_G3    6
ContractRate_G5    3
dtype: int64


In [5]:
# 特徴量が多いので、落とす特徴量を選択
drop_gb = [
    'EconomicSegment',
    'ContractRate_FM',
    'ContractRate_G1',
    'ContractRate_G2',
    'ContractRate_G3',
    'ContractRate_G4',
    'ContractRate_G5',
    'ContractRate_G6'
]

# ## --------------------------------------------------------------------------------------------
# ## Label Encoding
# df_train_gb = mapping_columns_if_exist(df_train_gb)
# df_test_gb = mapping_columns_if_exist(df_test_gb)

# def handle_unknown_label(train_series, test_series):
#     unique_labels = train_series.unique()
#     label_map = {label: idx for idx, label in enumerate(unique_labels)}
#     train_encoded = train_series.map(label_map)
#     test_encoded = test_series.map(lambda x: label_map.get(x, -1))
#     return train_encoded, test_encoded

# for col in combination_gb:
#     df_train_gb[col], df_test_gb[col] = handle_unknown_label(df_train_gb[col], df_test_gb[col])
# ## --------------------------------------------------------------------------------------------

X_gb = df_train_gb.drop(columns=drop_gb, axis=1)
y_gb = df_train_gb['ProdTaken']

test_feature = X_gb.columns.drop('ProdTaken')
df_test_gb = df_test_gb[test_feature]

tmp = X_gb.groupby(by=['AgeGroup', 'ProductPitched'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G4'})
df_test_gb = df_test_gb.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')

In [6]:
drop_nn = [
    'EconomicSegment',
    'ContractRate_FM',
    'ContractRate_G1',
    'ContractRate_G2',
    'ContractRate_G3',
    'ContractRate_G4',
    'ContractRate_G5',
    'ContractRate_G6'
]

X_nn = df_train_nn.drop(columns=drop_nn, axis=1)
y_nn = df_train_nn['ProdTaken']

df_test_nn = df_test_nn.drop(columns=drop_nn, axis=1)

num_features_nn = len(df_test_nn.columns)

In [7]:
# スタッキング
def predict_cv(model, X, y, df_test):
    preds = []
    preds_test = []
    va_idxes = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for i, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        
        # Target Encoding
        if 'AgeGroup' in tr_x.columns:
            tmp = tr_x.groupby(by=['AgeGroup', 'ProductPitched'], as_index=False)['ProdTaken'].mean()
            tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G4'})
            tr_x = tr_x.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')
            va_x = va_x.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')
            
            tr_x = tr_x.drop(labels='ProdTaken', axis=1)
            va_x = va_x.drop(labels='ProdTaken', axis=1)
        
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(df_test)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [8]:
# 1層目のモデル
models_gbdt = [
    Model1_CatBoost_1(),
    Model1_CatBoost_2(),
    Model1_CatBoost_3(),
    #Model1_XGBoost_1(),
    #Model1_XGBoost_2(),
    #Model1_XGBoost_3(),
    #Model1_LightGBM_1(),
    #Model1_LightGBM_2(),
    #Model1_LightGBM_3(),
    #Model1_RandomForest_1(),
    Model1_RandomForest_2(),
    #Model1_RandomForest_3()
]

models_nn = [
    Model1_NN_1(input_shape=num_features_nn),
    Model1_NN_2(input_shape=num_features_nn),
    Model1_NN_all_1(input_shape=num_features_nn),
    Model1_NN_all_2(input_shape=num_features_nn),
    #Model1_TabNet_1(input_dim=num_features_nn),
    #Model1_TabNet_2(input_dim=num_features_nn),
    Model1_Logistic_1(),
    Model1_Logistic_2(),
    #Model1_Logistic_3(),
    #Model1_Logistic_4()
]

In [9]:
pred_train_list = []
pred_test_list = []
for model in models_gbdt:
    pred_train, pred_test = predict_cv(model, X_gb, y_gb, df_test_gb)
    pred_train_list.append(pred_train)
    pred_test_list.append(pred_test)
for model in models_nn:
    pred_train, pred_test = predict_cv(model, X_nn, y_nn, df_test_nn)
    pred_train_list.append(pred_train)
    pred_test_list.append(pred_test)

0:	test: 0.8094036	best: 0.8094036 (0)	total: 133ms	remaining: 798ms
6:	test: 0.8557267	best: 0.8557267 (6)	total: 162ms	remaining: 0us

bestTest = 0.855726657
bestIteration = 6

0:	test: 0.8214192	best: 0.8214192 (0)	total: 5.55ms	remaining: 33.3ms
6:	test: 0.8939034	best: 0.8939034 (6)	total: 37.9ms	remaining: 0us

bestTest = 0.89390344
bestIteration = 6

0:	test: 0.8252564	best: 0.8252564 (0)	total: 5.9ms	remaining: 35.4ms
6:	test: 0.8999377	best: 0.9024531 (4)	total: 34.3ms	remaining: 0us

bestTest = 0.9024530851
bestIteration = 4

Shrink model to first 5 iterations.
0:	test: 0.8308399	best: 0.8308399 (0)	total: 2.94ms	remaining: 17.6ms
6:	test: 0.9103816	best: 0.9120748 (5)	total: 20ms	remaining: 0us

bestTest = 0.9120747511
bestIteration = 5

Shrink model to first 6 iterations.
0:	test: 0.8123161	best: 0.8123161 (0)	total: 4.78ms	remaining: 28.7ms
6:	test: 0.8924341	best: 0.8929318 (4)	total: 29.7ms	remaining: 0us

bestTest = 0.8929318464
bestIteration = 4

Shrink model to first 

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# 1層目のモデルの評価
for i, pred_train in enumerate(pred_train_list):
    auc_score = roc_auc_score(y, pred_train)
    print(f'AUC for model {i+1}: {auc_score}')

AUC for model 1: 0.8087502286445948
AUC for model 2: 0.8259819612864352
AUC for model 3: 0.8290901828080783
AUC for model 4: 0.8206585771312366
AUC for model 5: 0.8435089144492625
AUC for model 6: 0.8460125727627799
AUC for model 7: 0.910081478173856
AUC for model 8: 0.915961006681802
AUC for model 9: 0.8305629229925003
AUC for model 10: 0.8260270177213009


In [None]:
# 特徴量として使用する列名を作成
column_names = [f'pred_{i+1}' for i in range(len(pred_train_list))]

# 予測値を特徴量としてデータフレームを作成
train_x_2 = pd.DataFrame(
    {f'pred_{i+1}': pred_train_list[i] for i in range(len(pred_train_list))},
    columns=column_names
)

test_x_2 = pd.DataFrame(
    {f'pred_{i+1}': pred_test_list[i] for i in range(len(pred_test_list))},
    columns=column_names
)

In [None]:
# 2層目のモデル
# pred_train_2は、2層目のモデルの学習データのクロスバリデーションでの予測値
# pred_test_2は、2層目のモデルのテストデータの予測値
model_2 = Model2_Logistic()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, y, test_x_2)
print(f'AUC: {roc_auc_score(y, pred_train_2)}')

AUC: 0.9141412647005026


In [None]:
index = pd.read_csv('data/test.csv')['id'].values
df_submit = pd.DataFrame({
    "id": index,
    "prediction": pred_test_2
})

In [None]:
path = 'submission/submit_27.csv'

In [None]:
df_submit.to_csv(path, index=False, header=None)