# スタッキング
・GBDT 2~3個：決定木の深さが「浅い」「普通」「深い」モデル

・Random Forest 1~2個：決定木の深さが「浅い」「深い」モデル

・Neural Net 1~2個：層の数が「少ない」「多い」モデル

・Linier 1個

In [2]:
import os
os.chdir('../../')

In [3]:
import numpy as np
import pandas as pd
from scr.util import *
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scr.models.gbdt import Model1_CatBoost_1, Model1_CatBoost_2, Model1_CatBoost_3, Model1_XGBoost_1, Model1_XGBoost_2, Model1_XGBoost_3, Model1_LightGBM_1, Model1_LightGBM_2, Model1_LightGBM_3
from scr.models.random_forest import Model1_RandomForest_1, Model1_RandomForest_2, Model1_RandomForest_3
from scr.models.nn import Model1_NN_1, Model1_NN_2, Model1_NN_all_1, Model1_NN_all_2, Model1_TabNet_1, Model1_TabNet_2
from scr.models.linear import Model1_Logistic_1, Model1_Logistic_2, Model1_Logistic_3, Model1_Logistic_4, Model2_Logistic

In [5]:
df_train_gb = pd.read_csv('data/sampling/over_sampling/smote/train_mean_gb_smote.csv')
df_test_gb = pd.read_csv('data/sampling/over_sampling/smote/test_mean_gb.csv')
combination_gb= df_train_gb.columns[44:110]

df_train_nn = pd.read_csv('data/sampling/over_sampling/smote/train_mean_nn_smote.csv')
df_test_nn = pd.read_csv('data/feature_engineered/null_representative/test_null_mean_scaled.csv')

y = df_train_gb['ProdTaken']

In [6]:
# 特徴量が多いので、落とす特徴量を選択
drop_gb = [
    'EconomicSegment',
    'ContractRate_FM',
    'ContractRate_G1',
    'ContractRate_G2',
    'ContractRate_G3',
    'ContractRate_G4',
    'ContractRate_G5',
    'ContractRate_G6'
]

# ## --------------------------------------------------------------------------------------------
# ## Label Encoding
# df_train_gb = mapping_columns_if_exist(df_train_gb)
# df_test_gb = mapping_columns_if_exist(df_test_gb)

# def handle_unknown_label(train_series, test_series):
#     unique_labels = train_series.unique()
#     label_map = {label: idx for idx, label in enumerate(unique_labels)}
#     train_encoded = train_series.map(label_map)
#     test_encoded = test_series.map(lambda x: label_map.get(x, -1))
#     return train_encoded, test_encoded

# for col in combination_gb:
#     df_train_gb[col], df_test_gb[col] = handle_unknown_label(df_train_gb[col], df_test_gb[col])
# ## --------------------------------------------------------------------------------------------

X_gb = df_train_gb.drop(columns=drop_gb, axis=1)
y_gb = df_train_gb['ProdTaken']

df_test_gb = df_test_gb.drop(columns=drop_gb, axis=1)

num_features_gb = len(df_test_gb.columns)

In [7]:
drop_nn = [
    'EconomicSegment',
    'ContractRate_FM',
    'ContractRate_G1',
    'ContractRate_G2',
    'ContractRate_G3',
    'ContractRate_G4',
    'ContractRate_G5',
    'ContractRate_G6'
]

X_nn = df_train_nn.drop(columns=drop_nn, axis=1)
y_nn = df_train_nn['ProdTaken']

df_test_nn = df_test_nn.drop(columns=drop_nn, axis=1)

num_features_nn = len(df_test_nn.columns)

In [8]:
# スタッキング
def predict_cv(model, X, y, df_test):
    preds = []
    preds_test = []
    va_idxes = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for i, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        
        # Target Encoding
        
        
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(df_test)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)

    # バリデーションデータに対する予測値を連結し、その後元の順序に並べ直す
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # テストデータに対する予測値の平均をとる
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [9]:
# 1層目のモデル
models_gbdt = [
    #Model1_CatBoost_1(),
    #Model1_CatBoost_2(),
    #Model1_CatBoost_3(),
    Model1_XGBoost_1(),
    Model1_XGBoost_2(),
    Model1_XGBoost_3(),
    #Model1_LightGBM_1(),
    #Model1_LightGBM_2(),
    #Model1_LightGBM_3(),
    #Model1_RandomForest_1(),
    Model1_RandomForest_2(),
    #Model1_RandomForest_3()
]

models_nn = [
    Model1_NN_1(input_shape=num_features_nn),
    Model1_NN_2(input_shape=num_features_nn),
    Model1_NN_all_1(input_shape=num_features_nn),
    Model1_NN_all_2(input_shape=num_features_nn),
    #Model1_TabNet_1(input_dim=num_features_nn),
    #Model1_TabNet_2(input_dim=num_features_nn),
    Model1_Logistic_1(),
    Model1_Logistic_2(),
    #Model1_Logistic_3(),
    #Model1_Logistic_4()
]

In [9]:
pred_train_list = []
pred_test_list = []
for model in models_gbdt:
    pred_train, pred_test = predict_cv(model, X_gb, y_gb, df_test_gb)
    pred_train_list.append(pred_train)
    pred_test_list.append(pred_test)
for model in models_nn:
    pred_train, pred_test = predict_cv(model, X_nn, y_nn, df_test_nn)
    pred_train_list.append(pred_train)
    pred_test_list.append(pred_test)

[0]	train-auc:1.00000	eval-auc:1.00000
[100]	train-auc:1.00000	eval-auc:1.00000
[200]	train-auc:1.00000	eval-auc:1.00000
[300]	train-auc:1.00000	eval-auc:1.00000
[400]	train-auc:1.00000	eval-auc:1.00000
[500]	train-auc:1.00000	eval-auc:1.00000
[600]	train-auc:1.00000	eval-auc:1.00000
[700]	train-auc:1.00000	eval-auc:1.00000
[800]	train-auc:1.00000	eval-auc:1.00000
[900]	train-auc:1.00000	eval-auc:1.00000
[1000]	train-auc:1.00000	eval-auc:1.00000
[1100]	train-auc:1.00000	eval-auc:1.00000
[1200]	train-auc:1.00000	eval-auc:1.00000
[1300]	train-auc:1.00000	eval-auc:1.00000
[1400]	train-auc:1.00000	eval-auc:1.00000
[1500]	train-auc:1.00000	eval-auc:1.00000
[1600]	train-auc:1.00000	eval-auc:1.00000
[1700]	train-auc:1.00000	eval-auc:1.00000
[1800]	train-auc:1.00000	eval-auc:1.00000
[1900]	train-auc:1.00000	eval-auc:1.00000
[2000]	train-auc:1.00000	eval-auc:1.00000
[2100]	train-auc:1.00000	eval-auc:1.00000
[2200]	train-auc:1.00000	eval-auc:1.00000
[2300]	train-auc:1.00000	eval-auc:1.00000
[240

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:TypeofContact: object, Occupation: object, Gender: object, ProductPitched: object, Designation: object, Marry: object, Car: object, Child: object, AgeGroup: object, TypeofContact_CityTier: object, TypeofContact_Occupation: object, TypeofContact_Gender: object, TypeofContact_ProductPitched: object, TypeofContact_PreferredPropertyStar: object, TypeofContact_Passport: object, TypeofContact_PitchSatisfactionScore: object, TypeofContact_Designation: object, TypeofContact_Marry: object, TypeofContact_Car: object, TypeofContact_Child: object, CityTier_Occupation: object, CityTier_Gender: object, CityTier_ProductPitched: object, CityTier_PreferredPropertyStar: object, CityTier_Passport: object, CityTier_PitchSatisfactionScore: object, CityTier_Designation: object, CityTier_Marry: object, CityTier_Car: object, CityTier_Child: object, Occupation_Gender: object, Occupation_ProductPitched: object, Occupation_PreferredPropertyStar: object, Occupation_Passport: object, Occupation_PitchSatisfactionScore: object, Occupation_Designation: object, Occupation_Marry: object, Occupation_Car: object, Occupation_Child: object, Gender_ProductPitched: object, Gender_PreferredPropertyStar: object, Gender_Passport: object, Gender_PitchSatisfactionScore: object, Gender_Designation: object, Gender_Marry: object, Gender_Car: object, Gender_Child: object, ProductPitched_PreferredPropertyStar: object, ProductPitched_Passport: object, ProductPitched_PitchSatisfactionScore: object, ProductPitched_Designation: object, ProductPitched_Marry: object, ProductPitched_Car: object, ProductPitched_Child: object, PreferredPropertyStar_Passport: object, PreferredPropertyStar_PitchSatisfactionScore: object, PreferredPropertyStar_Designation: object, PreferredPropertyStar_Marry: object, PreferredPropertyStar_Car: object, PreferredPropertyStar_Child: object, Passport_PitchSatisfactionScore: object, Passport_Designation: object, Passport_Marry: object, Passport_Car: object, Passport_Child: object, PitchSatisfactionScore_Designation: object, PitchSatisfactionScore_Marry: object, PitchSatisfactionScore_Car: object, PitchSatisfactionScore_Child: object, Designation_Marry: object, Designation_Car: object, Designation_Child: object, Marry_Car: object, Marry_Child: object, Car_Child: object

In [None]:
# 1層目のモデルの評価
for i, pred_train in enumerate(pred_train_list):
    auc_score = roc_auc_score(y, pred_train)
    print(f'AUC for model {i+1}: {auc_score}')

AUC for model 1: 0.8087502286445948
AUC for model 2: 0.8259819612864352
AUC for model 3: 0.8290901828080783
AUC for model 4: 0.8206585771312366
AUC for model 5: 0.8435089144492625
AUC for model 6: 0.8460125727627799
AUC for model 7: 0.910081478173856
AUC for model 8: 0.915961006681802
AUC for model 9: 0.8305629229925003
AUC for model 10: 0.8260270177213009


In [None]:
# 特徴量として使用する列名を作成
column_names = [f'pred_{i+1}' for i in range(len(pred_train_list))]

# 予測値を特徴量としてデータフレームを作成
train_x_2 = pd.DataFrame(
    {f'pred_{i+1}': pred_train_list[i] for i in range(len(pred_train_list))},
    columns=column_names
)

test_x_2 = pd.DataFrame(
    {f'pred_{i+1}': pred_test_list[i] for i in range(len(pred_test_list))},
    columns=column_names
)

In [None]:
# 2層目のモデル
# pred_train_2は、2層目のモデルの学習データのクロスバリデーションでの予測値
# pred_test_2は、2層目のモデルのテストデータの予測値
model_2 = Model2_Logistic()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, y, test_x_2)
print(f'AUC: {roc_auc_score(y, pred_train_2)}')

AUC: 0.9141412647005026


In [None]:
index = pd.read_csv('data/test.csv')['id'].values
df_submit = pd.DataFrame({
    "id": index,
    "prediction": pred_test_2
})

In [None]:
path = 'submission/submit_27.csv'

In [None]:
df_submit.to_csv(path, index=False, header=None)