# スタッキング
・GBDT 2~3個：決定木の深さが「浅い」「普通」「深い」モデル

・Random Forest 1~2個：決定木の深さが「浅い」「深い」モデル

・Neural Net 1~2個：層の数が「少ない」「多い」モデル

・Linear 1個

In [22]:
# import os
# os.chdir('../../')

In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scr.models.gbdt import Model1_CatBoost_1, Model1_CatBoost_2, Model1_CatBoost_3, Model1_XGBoost_1, Model1_XGBoost_2, Model1_XGBoost_3, Model1_LightGBM_1, Model1_LightGBM_2, Model1_LightGBM_3
from scr.models.random_forest import Model1_RandomForest_1, Model1_RandomForest_2, Model1_RandomForest_3
from scr.models.nn import Model1_NN_1, Model1_NN_2, Model1_NN_all_1, Model1_NN_all_2, Model1_TabNet_1, Model1_TabNet_2
from scr.models.linear import Model1_Logistic_1, Model1_Logistic_2, Model1_Logistic_3, Model1_Logistic_4, Model2_Logistic

In [24]:
# Load the feature-engineered train/test tables (paths are relative to the
# current working directory).
# NOTE(review): the file names say "scaled_for_not_gbdt", yet the same frames
# are later fed to the GBDT / Random Forest models as well — confirm intended.
df_train = pd.read_csv('data/feature_engineered/null_cat/train_scaled_for_not_gbdt.csv')
df_test = pd.read_csv('data/feature_engineered/null_cat/test_scaled_for_not_gbdt.csv')

In [25]:
# Column names selected from both the train and the test table as model inputs.
# Entries that are commented out ('ProdTaken', the 'ContractRate_*' columns)
# are not part of the list and therefore never reach the models.
# NOTE(review): names such as 'FreaqencySeg_1' must match the CSV headers
# exactly; presumably a misspelling of "Frequency" introduced upstream —
# do not "fix" it here without changing the feature-engineering output too.
feature = [
    'Age', 
    'DurationOfPitch', 
    'NumberOfPersonVisiting',
    'NumberOfFollowups', 
    'NumberOfTrips', 
    'MonthlyIncome', 
    #'ProdTaken',
    'Motivation', 
    'EconomicPower', 
    'TripEasier', 
    'SalesPerformance',
    'LivingCost', 
    'EconomicStability', 
    'NumberOfTrips_log', 
    'TravelCost',
    'EconomicSegment', 
    'PackageMatch', 
    'Monetary', 
    #'ContractRate_FM',
    #'ContractRate_G1',
    #'ContractRate_G2', 
    #'ContractRate_G3',
    #'ContractRate_G4',
    #'ContractRate_G5', 
    #'ContractRate_G6',
    'TypeofContact_No',
    'TypeofContact_Self Enquiry',
    'CityTier_2',
    'CityTier_3',
    'Occupation_Salaried',
    'Occupation_Small Business',
    'Gender_male', 
    'ProductPitched_Deluxe', 
    'ProductPitched_King',
    'ProductPitched_Standard',
    'ProductPitched_Super Deluxe',
    'PreferredPropertyStar_4',
    'PreferredPropertyStar_5', 
    'Passport_1',
    'PitchSatisfactionScore_2', 
    'PitchSatisfactionScore_3',
    'PitchSatisfactionScore_4',
    'PitchSatisfactionScore_5',
    'Designation_Executive',
    'Designation_Manager',
    'Designation_Senior Manager', 
    'Designation_VP',
    'Marry_Married',
    'Marry_Single', 
    'Car_No Car', 
    'Child_1_child', 
    'Child_2_child',
    'Child_3_child',
    'AgeGroup_20s',
    'AgeGroup_30s',
    'AgeGroup_40s',
    'AgeGroup_50s', 
    'AgeGroup_60s', 
    'TypeofContactNULL_1',
    'Child01_1',
    'IsFamily_1',
    'FreaqencySeg_1',
    'FreaqencySeg_2',
    'MonetarySeg_2',
    'MonetarySeg_3', 
    'MonetarySeg_4', 
    'AgeNull', 
    'DurationOfPitchNull',
    'NumberOfTripsNull', 
    'MonthlyIncomeNull'
]

# Target column and feature matrix taken from the training table.
y = df_train['ProdTaken']
X = df_train[feature]

# Restrict the test table to the same columns, in the same order as X.
df_test = df_test[feature]

# Number of feature columns; used as the input size of the NN / TabNet models.
num_features = df_test.shape[1]

In [26]:
# Stacking: cross-validated prediction helper shared by both layers
def predict_cv(model, X, y, df_test, *, n_splits=5, random_state=42):
    """Return out-of-fold train predictions and fold-averaged test predictions.

    The data is split with a shuffled ``StratifiedKFold``. On every fold the
    model is fitted on the training part and then predicts both the held-out
    part and the complete test set.

    Args:
        model: Object exposing ``fit(tr_x, tr_y, va_x, va_y)`` and
            ``predict(x)``. The same instance is refitted on every fold.
            NOTE(review): this assumes ``fit`` discards state from the
            previous fold — confirm in the model wrappers.
        X: Training features; rows are selected positionally via ``.iloc``.
        y: Training target, positionally aligned with ``X``; also drives the
            stratification.
        df_test: Test features, passed unchanged to ``model.predict``.
        n_splits: Number of folds. Defaults to 5, the previously hard-coded
            value.
        random_state: Seed of the shuffled split. Defaults to 42, the
            previously hard-coded value.

    Returns:
        tuple: ``(pred_train, preds_test)``. ``pred_train`` contains the
        validation predictions rearranged into the row order of ``X``;
        ``preds_test`` is the mean of the per-fold test predictions.
    """
    preds = []
    preds_test = []
    va_idxes = []

    skf = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )

    for tr_idx, va_idx in skf.split(X, y):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        # The validation part is handed to fit as well (the wrappers decide
        # what to do with it, e.g. early stopping).
        model.fit(tr_x, tr_y, va_x, va_y)
        preds.append(model.predict(va_x))
        preds_test.append(model.predict(df_test))
        va_idxes.append(va_idx)

    # Concatenate the validation predictions fold by fold, then undo the
    # shuffling so that row k of the result belongs to row k of X.
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # Average the test predictions over all folds.
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [27]:
# First-layer models
# Tree-based models. Despite its name, this list holds the Random Forest
# variants in addition to the CatBoost / XGBoost / LightGBM ones.
models_gbdt = [
    Model1_CatBoost_1(),
    Model1_CatBoost_2(),
    Model1_CatBoost_3(),
    Model1_XGBoost_1(),
    Model1_XGBoost_2(),
    Model1_XGBoost_3(),
    Model1_LightGBM_1(),
    Model1_LightGBM_2(),
    Model1_LightGBM_3(),
    Model1_RandomForest_1(),
    Model1_RandomForest_2(),
    Model1_RandomForest_3()
]

# Remaining models. Despite its name, this list also holds the TabNet and
# logistic-regression variants. The NN / TabNet constructors receive the
# number of feature columns (input_shape / input_dim).
models_nn = [
    Model1_NN_1(input_shape=num_features),
    Model1_NN_2(input_shape=num_features),
    Model1_NN_all_1(input_shape=num_features),
    Model1_NN_all_2(input_shape=num_features),
    Model1_TabNet_1(input_dim=num_features),
    Model1_TabNet_2(input_dim=num_features),
    Model1_Logistic_1(),
    Model1_Logistic_2(),
    Model1_Logistic_3(),
    Model1_Logistic_4()
]



In [28]:
# Run every first-layer model through predict_cv (tree-based list first, then
# the remaining models) and keep, per model, the cross-validated training
# predictions and the test predictions.
pred_train_list = []
pred_test_list = []
for model in [*models_gbdt, *models_nn]:
    pred_train, pred_test = predict_cv(model, X, y, df_test)
    pred_train_list.append(pred_train)
    pred_test_list.append(pred_test)

0:	test: 0.6015970	best: 0.6015970 (0)	total: 2.07ms	remaining: 143ms
69:	test: 0.8174415	best: 0.8220736 (43)	total: 135ms	remaining: 0us

bestTest = 0.8220735786
bestIteration = 43

Shrink model to first 44 iterations.
0:	test: 0.5922993	best: 0.5922993 (0)	total: 2.53ms	remaining: 175ms
69:	test: 0.8067057	best: 0.8295485 (22)	total: 154ms	remaining: 0us

bestTest = 0.829548495
bestIteration = 22

Shrink model to first 23 iterations.
0:	test: 0.5802432	best: 0.5802432 (0)	total: 3.47ms	remaining: 239ms
69:	test: 0.7766311	best: 0.7876258 (22)	total: 192ms	remaining: 0us

bestTest = 0.787625841
bestIteration = 22

Shrink model to first 23 iterations.
0:	test: 0.5700578	best: 0.5700578 (0)	total: 2.64ms	remaining: 182ms
69:	test: 0.7837305	best: 0.7940170 (31)	total: 142ms	remaining: 0us

bestTest = 0.7940169643
bestIteration = 31

Shrink model to first 32 iterations.
0:	test: 0.5882909	best: 0.5882909 (0)	total: 2.78ms	remaining: 192ms
69:	test: 0.7733860	best: 0.7959697 (27)	total: 

KeyboardInterrupt: 

In [None]:
# 1層目のモデル

# model_1a = Model1_CatBoost_1()
# pred_train_1a, pred_test_1a = predict_cv(model_1a, X, y, df_test)

# model_1b = Model1_CatBoost_2()
# pred_train_1b, pred_test_1b = predict_cv(model_1b, X, y, df_test)

# model_1c = Model1_CatBoost_3()
# pred_train_1c, pred_test_1c = predict_cv(model_1c, X, y, df_test)

# model_1d = Model1_RandomForest_1()
# pred_train_1d, pred_test_1d = predict_cv(model_1d, X, y, df_test)

# model_1e = Model1_RandomForest_2()
# pred_train_1e, pred_test_1e = predict_cv(model_1e, X, y, df_test)

# model_1f = Model1_NN_1()
# pred_train_1f, pred_test_1f = predict_cv(model_1f, X, y, df_test)

# model_1g = Model1_NN_2()
# pred_train_1g, pred_test_1g = predict_cv(model_1g, X, y, df_test)

# model_1h = Model1_Logistic_1()
# pred_train_1h, pred_test_1h = predict_cv(model_1h, X, y, df_test)

In [None]:
# Evaluate the first-layer models: AUC of each model's cross-validated
# training predictions against the target, numbered from 1 in list order.
for model_no, pred_train in enumerate(pred_train_list, start=1):
    auc_score = roc_auc_score(y, pred_train)
    print(f'AUC for model {model_no}: {auc_score}')

AUC for model 1: 0.5979466370415003
AUC for model 2: 0.6462047687192675
AUC for model 3: 0.6970506192233615
AUC for model 4: 0.8289382014009188
AUC for model 5: 0.8167386672979051
AUC for model 6: 0.8129727563240405
AUC for model 7: 0.8118291971077802
AUC for model 8: 0.8178822265141652
AUC for model 9: 0.8184790561551124
AUC for model 10: 0.812139548521073
AUC for model 11: 0.8198650458903153
AUC for model 12: 0.8213593055660164
AUC for model 13: 0.8465377828468134
AUC for model 14: 0.8344855227622419
AUC for model 15: 0.9146940466327376
AUC for model 16: 0.927937276062794
AUC for model 17: 0.7842455804344783
AUC for model 18: 0.7682915675873422
AUC for model 19: 0.8300316605515446
AUC for model 20: 0.8255885580864869
AUC for model 21: 0.7994699480304285
AUC for model 22: 0.739332384682426


In [None]:
# Column names of the second-layer features: pred_1 .. pred_N, one per
# first-layer model, in the order the models were run.
column_names = [f'pred_{i}' for i in range(1, len(pred_train_list) + 1)]

# Build the second-layer tables by pairing each name with the matching
# model's predictions (zip instead of index arithmetic, so names and
# predictions cannot drift apart).
train_x_2 = pd.DataFrame(dict(zip(column_names, pred_train_list)))
test_x_2 = pd.DataFrame(dict(zip(column_names, pred_test_list)))

In [None]:
# # 予測値を特徴量としてデータフレームを作成
# train_x_2 = pd.DataFrame({
#     'pred_1a': pred_train_1a,
#     'pred_1b': pred_train_1b,
#     'pred_1c': pred_train_1c,
#     'pred_1d': pred_train_1d,
#     'pred_1e': pred_train_1e,
#     'pred_1f': pred_train_1f,
#     'pred_1g': pred_train_1g,
#     'pred_1h': pred_train_1h
#     })

# test_x_2 = pd.DataFrame({
#     'pred_1a': pred_test_1a,
#     'pred_1b': pred_test_1b,
#     'pred_1c': pred_test_1c,
#     'pred_1d': pred_test_1d,
#     'pred_1e': pred_test_1e,
#     'pred_1f': pred_test_1f,
#     'pred_1g': pred_test_1g,
#     'pred_1h': pred_test_1h,
#     })

In [None]:
# Second-layer model
# pred_train_2: cross-validated predictions of the second-layer model on the
#               training data
# pred_test_2:  predictions of the second-layer model on the test data
model_2 = Model2_Logistic()
pred_train_2, pred_test_2 = predict_cv(
    model_2,
    train_x_2,
    y,
    test_x_2,
)
auc_2 = roc_auc_score(y, pred_train_2)
print(f'AUC: {auc_2}')

AUC: 0.9182669546691916


In [None]:
# Pair the ids of the raw test file with the second-layer test predictions.
index = pd.read_csv('data/test.csv')['id'].values
df_submit = pd.DataFrame({"id": index})
df_submit["prediction"] = pred_test_2

In [None]:
# Output location of the submission file (relative to the working directory).
path = 'submission/submit_25_second_stacking.csv'

In [None]:
# Write the submission without index and without a header row.
# The output directory is created first so that a long training run is not
# lost to a missing folder at the very last step.
from pathlib import Path

Path(path).parent.mkdir(parents=True, exist_ok=True)
# header=False is the documented way to suppress the header row
# (header=None only worked because it is falsy).
df_submit.to_csv(path, index=False, header=False)