# スタッキング
・GBDT 2~3個：決定木の深さが「浅い」「普通」「深い」モデル

・Random Forest 1~2個：決定木の深さが「浅い」「深い」モデル

・Neural Net 1~2個：層の数が「少ない」「多い」モデル

・Linear 1個

In [1]:
import os
# Move the working directory two levels up so the relative paths used later
# in this notebook ('data/...', 'submission/...') resolve.
# NOTE(review): presumably the notebook sits two directories below the project
# root and the `scr` package is imported from there — confirm.
os.chdir('../../')

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scr.models.gbdt import Model1_CatBoost_1, Model1_CatBoost_2, Model1_CatBoost_3, Model1_XGBoost_1, Model1_XGBoost_2, Model1_XGBoost_3, Model1_LightGBM_1, Model1_LightGBM_2, Model1_LightGBM_3
from scr.models.random_forest import Model1_RandomForest_1, Model1_RandomForest_2, Model1_RandomForest_3
from scr.models.nn import Model1_NN_1, Model1_NN_2, Model1_NN_all_1, Model1_NN_all_2, Model1_TabNet_1, Model1_TabNet_2
from scr.models.linear import Model1_Logistic_1, Model1_Logistic_2, Model1_Logistic_3, Model1_Logistic_4, Model2_Logistic

In [3]:
# Load the pre-engineered train and test tables.
# NOTE(review): the file names say "scaled_for_not_gbdt", yet the same frames
# are also passed to the GBDT / random-forest models further down — confirm
# that this is intended.
df_train = pd.read_csv('data/feature_engineered/null_cat/train_scaled_for_not_gbdt.csv')
df_test = pd.read_csv('data/feature_engineered/null_cat/test_scaled_for_not_gbdt.csv')

In [4]:
# Columns used as model inputs. The order of this list is the column order of
# both X and df_test below. Entries that are commented out are excluded from
# the inputs ('ProdTaken' is the target column read into y below).
feature = [
    'Age', 
    'DurationOfPitch', 
    'NumberOfPersonVisiting',
    'NumberOfFollowups', 
    'NumberOfTrips', 
    'MonthlyIncome', 
    #'ProdTaken',
    'Motivation', 
    'EconomicPower', 
    'TripEasier', 
    'SalesPerformance',
    'LivingCost', 
    'EconomicStability', 
    'NumberOfTrips_log', 
    'TravelCost',
    'EconomicSegment', 
    'PackageMatch', 
    'Monetary', 
    #'ContractRate_FM',
    #'ContractRate_G1',
    #'ContractRate_G2', 
    #'ContractRate_G3',
    #'ContractRate_G4',
    #'ContractRate_G5', 
    #'ContractRate_G6',
    'TypeofContact_No',
    'TypeofContact_Self Enquiry',
    'CityTier_2',
    'CityTier_3',
    'Occupation_Salaried',
    'Occupation_Small Business',
    'Gender_male', 
    'ProductPitched_Deluxe', 
    'ProductPitched_King',
    'ProductPitched_Standard',
    'ProductPitched_Super Deluxe',
    'PreferredPropertyStar_4',
    'PreferredPropertyStar_5', 
    'Passport_1',
    'PitchSatisfactionScore_2', 
    'PitchSatisfactionScore_3',
    'PitchSatisfactionScore_4',
    'PitchSatisfactionScore_5',
    'Designation_Executive',
    'Designation_Manager',
    'Designation_Senior Manager', 
    'Designation_VP',
    'Marry_Married',
    'Marry_Single', 
    'Car_No Car', 
    'Child_1_child', 
    'Child_2_child',
    'Child_3_child',
    'AgeGroup_20s',
    'AgeGroup_30s',
    'AgeGroup_40s',
    'AgeGroup_50s', 
    'AgeGroup_60s', 
    'TypeofContactNULL_1',
    'Child01_1',
    'IsFamily_1',
    'FreaqencySeg_1',
    'FreaqencySeg_2',
    'MonetarySeg_2',
    'MonetarySeg_3', 
    'MonetarySeg_4', 
    'AgeNull', 
    'DurationOfPitchNull',
    'NumberOfTripsNull', 
    'MonthlyIncomeNull'
]

# Training inputs and the target column.
X = df_train[feature]
y = df_train['ProdTaken']

# Restrict the test frame to the same columns, in the same order, as X.
df_test = df_test[feature]

# Number of input columns; passed to the NN / TabNet constructors below.
num_features = len(df_test.columns)

In [5]:
# スタッキング
def predict_cv(model, X, y, df_test, *, n_splits=5, random_state=42):
    """Return out-of-fold train predictions and fold-averaged test predictions.

    The data is split with a shuffled ``StratifiedKFold``. For each fold the
    model is fitted on the training part, then asked to predict the held-out
    part and the whole test set.

    Args:
        model: Object exposing ``fit(tr_x, tr_y, va_x, va_y)`` and
            ``predict(x)``. The same instance is fitted once per fold.
            NOTE(review): this assumes ``fit`` starts from a fresh state on
            every call — confirm against the model wrappers.
        X: Training inputs; must support ``.iloc`` row selection.
        y: Training labels aligned with ``X``; must support ``.iloc``.
        df_test: Test inputs passed unchanged to ``model.predict``.
        n_splits: Number of folds (keyword-only, default 5).
        random_state: Seed for the fold shuffle (keyword-only, default 42).
            Calls that keep the defaults share the same fold assignment for
            a given ``y``.

    Returns:
        A ``(pred_train, preds_test)`` tuple. ``pred_train`` holds the
        held-out predictions rearranged into the original row order of ``X``.
        ``preds_test`` is the mean over folds of the test predictions.
    """
    skf = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )

    fold_preds = []
    fold_preds_test = []
    va_idxes = []

    for tr_idx, va_idx in skf.split(X, y):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

        # The validation part is handed to fit as well; how it is used is up
        # to the model wrapper.
        model.fit(tr_x, tr_y, va_x, va_y)

        fold_preds.append(model.predict(va_x))
        fold_preds_test.append(model.predict(df_test))
        va_idxes.append(va_idx)

    # Held-out predictions arrive fold by fold; sorting by their row positions
    # puts them back into the original row order.
    order = np.argsort(np.concatenate(va_idxes))
    pred_train = np.concatenate(fold_preds, axis=0)[order]

    # Average the per-fold test predictions.
    preds_test = np.mean(fold_preds_test, axis=0)

    return pred_train, preds_test

In [6]:
# First-layer models.

# Tree-based learners (CatBoost / XGBoost / LightGBM variants and random
# forests), each built with its default constructor arguments.
models_gbdt = [
    model_cls()
    for model_cls in (
        Model1_CatBoost_1,
        Model1_CatBoost_2,
        Model1_CatBoost_3,
        Model1_XGBoost_1,
        Model1_XGBoost_2,
        Model1_XGBoost_3,
        Model1_LightGBM_1,
        Model1_LightGBM_2,
        Model1_LightGBM_3,
        Model1_RandomForest_1,
        Model1_RandomForest_2,
        Model1_RandomForest_3,
    )
]

# Remaining learners: neural nets (given the input width as `input_shape`),
# TabNet models (given it as `input_dim`) and logistic models (no arguments).
models_nn = [
    *(
        model_cls(input_shape=num_features)
        for model_cls in (
            Model1_NN_1,
            Model1_NN_2,
            Model1_NN_all_1,
            Model1_NN_all_2,
        )
    ),
    *(
        model_cls(input_dim=num_features)
        for model_cls in (Model1_TabNet_1, Model1_TabNet_2)
    ),
    *(
        model_cls()
        for model_cls in (
            Model1_Logistic_1,
            Model1_Logistic_2,
            Model1_Logistic_3,
            Model1_Logistic_4,
        )
    ),
]



In [7]:
# Run every first-layer model through predict_cv (tree-based list first, then
# the NN / linear list) and keep, per model, its out-of-fold train predictions
# and its test predictions.
pred_train_list, pred_test_list = [], []
for model in models_gbdt + models_nn:
    pred_train, pred_test = predict_cv(model, X, y, df_test)
    pred_train_list.append(pred_train)
    pred_test_list.append(pred_test)

0:	test: 0.6015970	best: 0.6015970 (0)	total: 144ms	remaining: 9.93s
69:	test: 0.8174415	best: 0.8220736 (43)	total: 278ms	remaining: 0us

bestTest = 0.8220735786
bestIteration = 43

Shrink model to first 44 iterations.
0:	test: 0.5922993	best: 0.5922993 (0)	total: 3.64ms	remaining: 251ms
69:	test: 0.8067057	best: 0.8295485 (22)	total: 175ms	remaining: 0us

bestTest = 0.829548495
bestIteration = 22

Shrink model to first 23 iterations.
0:	test: 0.5802432	best: 0.5802432 (0)	total: 3.23ms	remaining: 223ms
69:	test: 0.7766311	best: 0.7876258 (22)	total: 144ms	remaining: 0us

bestTest = 0.787625841
bestIteration = 22

Shrink model to first 23 iterations.
0:	test: 0.5700578	best: 0.5700578 (0)	total: 3.14ms	remaining: 217ms
69:	test: 0.7837305	best: 0.7940170 (31)	total: 147ms	remaining: 0us

bestTest = 0.7940169643
bestIteration = 31

Shrink model to first 32 iterations.
0:	test: 0.5882909	best: 0.5882909 (0)	total: 3.52ms	remaining: 243ms
69:	test: 0.7733860	best: 0.7959697 (27)	total: 1

TypeError: train() got an unexpected keyword argument 'num_boost_rounds'

In [6]:
# 1層目のモデル

# model_1a = Model1_CatBoost_1()
# pred_train_1a, pred_test_1a = predict_cv(model_1a, X, y, df_test)

# model_1b = Model1_CatBoost_2()
# pred_train_1b, pred_test_1b = predict_cv(model_1b, X, y, df_test)

# model_1c = Model1_CatBoost_3()
# pred_train_1c, pred_test_1c = predict_cv(model_1c, X, y, df_test)

# model_1d = Model1_RandomForest_1()
# pred_train_1d, pred_test_1d = predict_cv(model_1d, X, y, df_test)

# model_1e = Model1_RandomForest_2()
# pred_train_1e, pred_test_1e = predict_cv(model_1e, X, y, df_test)

# model_1f = Model1_NN_1()
# pred_train_1f, pred_test_1f = predict_cv(model_1f, X, y, df_test)

# model_1g = Model1_NN_2()
# pred_train_1g, pred_test_1g = predict_cv(model_1g, X, y, df_test)

# model_1h = Model1_Logistic_1()
# pred_train_1h, pred_test_1h = predict_cv(model_1h, X, y, df_test)

0:	test: 0.5909615	best: 0.5909615 (0)	total: 157ms	remaining: 13m 4s
200:	test: 0.8326087	best: 0.8329264 (199)	total: 689ms	remaining: 16.5s
400:	test: 0.8370401	best: 0.8373746 (364)	total: 1.23s	remaining: 14.1s
600:	test: 0.8386789	best: 0.8389130 (589)	total: 1.73s	remaining: 12.7s
800:	test: 0.8383946	best: 0.8389130 (589)	total: 2.23s	remaining: 11.7s
1000:	test: 0.8385452	best: 0.8389130 (589)	total: 2.72s	remaining: 10.9s
1200:	test: 0.8383946	best: 0.8389130 (589)	total: 3.34s	remaining: 10.6s
1400:	test: 0.8386789	best: 0.8389130 (589)	total: 3.87s	remaining: 9.94s
1600:	test: 0.8385619	best: 0.8389130 (589)	total: 4.33s	remaining: 9.19s
1800:	test: 0.8384950	best: 0.8389130 (589)	total: 4.89s	remaining: 8.69s
2000:	test: 0.8383946	best: 0.8389130 (589)	total: 5.48s	remaining: 8.22s
2200:	test: 0.8381605	best: 0.8389130 (589)	total: 6.14s	remaining: 7.8s
2400:	test: 0.8380435	best: 0.8389130 (589)	total: 6.72s	remaining: 7.28s
2600:	test: 0.8381271	best: 0.8389130 (589)	tot

In [7]:
# Evaluate the first-layer models: ROC AUC of each model's out-of-fold
# predictions against the training labels. Models are numbered from 1 in the
# order they were appended to pred_train_list.
for i, pred_train in enumerate(pred_train_list):
    auc_score = roc_auc_score(y, pred_train)
    print(f'AUC for model {i+1}: {auc_score}')

AUC: 0.8301459828489655
AUC: 0.8294714812941822
AUC: 0.8274244396862457
AUC: 0.8074119852806679
AUC: 0.819563773012406
AUC: 0.8418512411366595
AUC: 0.8418176169315358
AUC: 0.8300316605515446


In [None]:
# One stacking feature per first-layer model, named pred_11, pred_12, ...
# (numbered from 1 in pred_train_list order).
column_names = [f'pred_1{i+1}' for i in range(len(pred_train_list))]

# Second-layer inputs: each column holds one first-layer model's predictions.
# Both frames are keyed directly by column_names, so the dict keys and the
# column labels cannot drift apart. Passing a separate `columns=` list whose
# names differ from the keys makes pandas select by name: a listed name with
# no matching key becomes an all-NaN column and an unlisted key is dropped.
train_x_2 = pd.DataFrame(dict(zip(column_names, pred_train_list)))
test_x_2 = pd.DataFrame(dict(zip(column_names, pred_test_list)))

In [8]:
# # 予測値を特徴量としてデータフレームを作成
# train_x_2 = pd.DataFrame({
#     'pred_1a': pred_train_1a,
#     'pred_1b': pred_train_1b,
#     'pred_1c': pred_train_1c,
#     'pred_1d': pred_train_1d,
#     'pred_1e': pred_train_1e,
#     'pred_1f': pred_train_1f,
#     'pred_1g': pred_train_1g,
#     'pred_1h': pred_train_1h
#     })

# test_x_2 = pd.DataFrame({
#     'pred_1a': pred_test_1a,
#     'pred_1b': pred_test_1b,
#     'pred_1c': pred_test_1c,
#     'pred_1d': pred_test_1d,
#     'pred_1e': pred_test_1e,
#     'pred_1f': pred_test_1f,
#     'pred_1g': pred_test_1g,
#     'pred_1h': pred_test_1h,
#     })

In [10]:
# Second-layer (meta) model.
# pred_train_2: cross-validated predictions of the second-layer model on the
#               training data (first-layer predictions are its inputs).
# pred_test_2:  the second-layer model's predictions for the test data.
model_2 = Model2_Logistic()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, y, test_x_2)
print(f'AUC: {roc_auc_score(y, pred_train_2)}')

AUC: 0.8453548833105585


In [14]:
# Build the submission table: ids read from the raw test file, paired with the
# second-layer test predictions.
# NOTE(review): assumes the rows of data/test.csv are in the same order as the
# engineered test file loaded earlier — confirm.
index = pd.read_csv('data/test.csv')['id'].values
df_submit = pd.DataFrame({
    "id": index,
    "prediction": pred_test_2
})

In [17]:
# Destination of the submission CSV (relative to the working directory).
path = 'submission/submit_24_first_stacking.csv'

In [18]:
# Write the submission without the index column.
# NOTE(review): header=None is presumably meant to omit the header row; pandas
# documents `header` as a bool or list of str, so header=False is the
# documented spelling — confirm.
df_submit.to_csv(path, index=False, header=None)