# スタッキング
・GBDT 2~3個：決定木の深さが「浅い」「普通」「深い」モデル

・Random Forest 1~2個：決定木の深さが「浅い」「深い」モデル

・Neural Net 1~2個：層の数が「少ない」「多い」モデル

・Linear 1個

In [4]:
import os
# Move the working directory two levels up.
# NOTE(review): presumably this is the project root, so that the relative
# 'data/...' paths and the 'scr' package used by later cells resolve — confirm.
# The path is relative to the current directory, so re-running this cell moves
# up two more levels each time; run it once per kernel session.
os.chdir('../../')

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

from scr.models.gbdt import Model1_CatBoost_1, Model1_CatBoost_2, Model1_CatBoost_3
from scr.models.linear import Model1_Logistic
from scr.models.nn import Model1_NN_1, Model1_NN_2
from scr.models.random_forest import Model1_RandomForest_1, Model1_RandomForest_2

In [21]:
# Load the train / test feature tables (file names indicate the scaled variant
# prepared for the non-GBDT models).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/feature_engineered/null_cat/train_scaled_for_not_gbdt.csv',
        'data/feature_engineered/null_cat/test_scaled_for_not_gbdt.csv',
    )
)

In [18]:
# Column counts of the two frames (second axis of the shape).
print(df_train.shape[1])
print(df_test.shape[1])

71
70


In [22]:
# Columns fed to the layer-1 models. The same list is used to select columns
# from both the training frame and the test frame, so its order defines the
# column order the models see.
# Entries that are commented out are currently excluded from the model inputs.
feature = [
    'Age', 
    'DurationOfPitch', 
    'NumberOfPersonVisiting',
    'NumberOfFollowups', 
    'NumberOfTrips', 
    'MonthlyIncome', 
    # 'ProdTaken' is the prediction target, so it is kept out of the inputs.
    #'ProdTaken',
    'Motivation', 
    'EconomicPower', 
    'TripEasier', 
    'SalesPerformance',
    'LivingCost', 
    'EconomicStability', 
    'NumberOfTrips_log', 
    'TravelCost',
    'EconomicSegment', 
    'PackageMatch', 
    'Monetary', 
    # ContractRate_* columns are disabled here.
    # NOTE(review): the reason is not visible in this notebook — confirm before
    # re-enabling them.
    #'ContractRate_FM',
    #'ContractRate_G1',
    #'ContractRate_G2', 
    #'ContractRate_G3',
    #'ContractRate_G4',
    #'ContractRate_G5', 
    #'ContractRate_G6',
    'TypeofContact_No',
    'TypeofContact_Self Enquiry',
    'CityTier_2',
    'CityTier_3',
    'Occupation_Salaried',
    'Occupation_Small Business',
    'Gender_male', 
    'ProductPitched_Deluxe', 
    'ProductPitched_King',
    'ProductPitched_Standard',
    'ProductPitched_Super Deluxe',
    'PreferredPropertyStar_4',
    'PreferredPropertyStar_5', 
    'Passport_1',
    'PitchSatisfactionScore_2', 
    'PitchSatisfactionScore_3',
    'PitchSatisfactionScore_4',
    'PitchSatisfactionScore_5',
    'Designation_Executive',
    'Designation_Manager',
    'Designation_Senior Manager', 
    'Designation_VP',
    'Marry_Married',
    'Marry_Single', 
    'Car_No Car', 
    'Child_1_child', 
    'Child_2_child',
    'Child_3_child',
    'AgeGroup_20s',
    'AgeGroup_30s',
    'AgeGroup_40s',
    'AgeGroup_50s', 
    'AgeGroup_60s', 
    'TypeofContactNULL_1',
    'Child01_1',
    'IsFamily_1',
    'FreaqencySeg_1',
    'FreaqencySeg_2',
    'MonetarySeg_2',
    'MonetarySeg_3', 
    'MonetarySeg_4', 
    'AgeNull', 
    'DurationOfPitchNull',
    'NumberOfTripsNull', 
    'MonthlyIncomeNull'
]

# Training inputs and target come from the training frame; the test frame is
# narrowed to the same feature columns.
X, y = df_train[feature], df_train['ProdTaken']
df_test = df_test[feature]

In [11]:
# スタッキング
def predict_cv(model, X, y, df_test, *, n_splits=5, random_state=42):
    """Return out-of-fold train predictions and fold-averaged test predictions.

    The data is split with a shuffled ``StratifiedKFold``. In every fold the
    same ``model`` object is refitted on the training part, then used to
    predict the held-out part and the full test frame.

    Args:
        model: Object exposing ``fit(tr_x, tr_y, va_x, va_y)`` and
            ``predict(x)``.
        X: Training features; rows are selected positionally with ``.iloc``.
        y: Training target; rows are selected positionally with ``.iloc``.
        df_test: Test features passed unchanged to ``model.predict``.
        n_splits: Number of folds (keyword-only, defaults to 5).
        random_state: Seed for the fold shuffle (keyword-only, defaults to 42).

    Returns:
        A tuple ``(pred_train, preds_test)``: the held-out predictions put back
        into the original row order of ``X``, and the mean over folds of the
        test predictions.
    """
    skf = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )

    oof_preds = []
    test_preds = []
    va_idxes = []

    for tr_idx, va_idx in skf.split(X, y):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        oof_preds.append(model.predict(va_x))
        test_preds.append(model.predict(df_test))
        va_idxes.append(va_idx)

    # Held-out predictions arrive fold by fold; sorting by the row positions
    # they belong to restores the original row order.
    order = np.argsort(np.concatenate(va_idxes))
    pred_train = np.concatenate(oof_preds, axis=0)[order]

    # Average the per-fold predictions for the test rows.
    preds_test = np.mean(test_preds, axis=0)

    return pred_train, preds_test

In [15]:
# Layer-1 (base) models

# Align the test frame with exactly the columns the models are fitted on.
# scikit-learn estimators raise "The feature names should match those that were
# passed during fit" when predict() receives columns unseen at fit time, which
# happens if df_test has not been narrowed to the training columns beforehand.
X_test = df_test[X.columns]

model_1a = Model1_CatBoost_1()
pred_train_1a, pred_test_1a = predict_cv(model_1a, X, y, X_test)

model_1b = Model1_CatBoost_2()
pred_train_1b, pred_test_1b = predict_cv(model_1b, X, y, X_test)

model_1c = Model1_CatBoost_3()
pred_train_1c, pred_test_1c = predict_cv(model_1c, X, y, X_test)

model_1d = Model1_RandomForest_1()
pred_train_1d, pred_test_1d = predict_cv(model_1d, X, y, X_test)

model_1e = Model1_RandomForest_2()
pred_train_1e, pred_test_1e = predict_cv(model_1e, X, y, X_test)

model_1f = Model1_NN_1()
pred_train_1f, pred_test_1f = predict_cv(model_1f, X, y, X_test)

model_1g = Model1_NN_2()
pred_train_1g, pred_test_1g = predict_cv(model_1g, X, y, X_test)

model_1h = Model1_Logistic()
pred_train_1h, pred_test_1h = predict_cv(model_1h, X, y, X_test)

0:	test: 0.5909615	best: 0.5909615 (0)	total: 141ms	remaining: 16m 24s
200:	test: 0.8326087	best: 0.8329264 (199)	total: 904ms	remaining: 30.6s
400:	test: 0.8370401	best: 0.8373746 (364)	total: 1.72s	remaining: 28.4s
600:	test: 0.8386789	best: 0.8389130 (589)	total: 2.43s	remaining: 25.9s
800:	test: 0.8383946	best: 0.8389130 (589)	total: 3.24s	remaining: 25.1s
1000:	test: 0.8385452	best: 0.8389130 (589)	total: 4.06s	remaining: 24.3s
1200:	test: 0.8383946	best: 0.8389130 (589)	total: 5.08s	remaining: 24.5s
1400:	test: 0.8386789	best: 0.8389130 (589)	total: 6.08s	remaining: 24.3s
1600:	test: 0.8385619	best: 0.8389130 (589)	total: 7.01s	remaining: 23.6s
1800:	test: 0.8384950	best: 0.8389130 (589)	total: 7.81s	remaining: 22.6s
2000:	test: 0.8383946	best: 0.8389130 (589)	total: 8.94s	remaining: 22.3s
2200:	test: 0.8381605	best: 0.8389130 (589)	total: 10s	remaining: 21.8s
2400:	test: 0.8380435	best: 0.8389130 (589)	total: 11s	remaining: 21.1s
2600:	test: 0.8381271	best: 0.8389130 (589)	total

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- ContractRate_FM
- ContractRate_G1
- ContractRate_G2
- ContractRate_G3
- ContractRate_G4
- ...


In [None]:
# Layer-1 evaluation: AUC of each model's out-of-fold predictions against y,
# printed in model order 1a .. 1h.
for oof_pred in (
    pred_train_1a,
    pred_train_1b,
    pred_train_1c,
    pred_train_1d,
    pred_train_1e,
    pred_train_1f,
    pred_train_1g,
    pred_train_1h,
):
    print(f'AUC: {roc_auc_score(y, oof_pred)}')

In [None]:
# Build the layer-2 inputs: one column per layer-1 model, holding that model's
# predictions (out-of-fold for train, fold-averaged for test).
layer1_columns = [
    'pred_1a', 'pred_1b', 'pred_1c', 'pred_1d',
    'pred_1e', 'pred_1f', 'pred_1g', 'pred_1h',
]

train_x_2 = pd.DataFrame(dict(zip(layer1_columns, [
    pred_train_1a, pred_train_1b, pred_train_1c, pred_train_1d,
    pred_train_1e, pred_train_1f, pred_train_1g, pred_train_1h,
])))

test_x_2 = pd.DataFrame(dict(zip(layer1_columns, [
    pred_test_1a, pred_test_1b, pred_test_1c, pred_test_1d,
    pred_test_1e, pred_test_1f, pred_test_1g, pred_test_1h,
])))

In [None]:
# Layer-2 (meta) model
# pred_train_2: out-of-fold predictions of the layer-2 model on the training rows
# pred_test_2:  layer-2 predictions for the test rows
# The target in this notebook is `y`; `train_y` is not defined before this cell.
# NOTE(review): no dedicated layer-2 class is imported before this cell, so the
# already-imported linear model is reused as the meta-learner — swap in a
# purpose-built class if one exists.
model_2 = Model1_Logistic()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, y, test_x_2)
# Scored with AUC, the same metric used for the layer-1 models, so the layers
# can be compared directly.
print(f'AUC: {roc_auc_score(y, pred_train_2)}')

In [None]:
# ---------------------------------
# Data preparation
# ----------------------------------


# train_x: training features, train_y: target, test_x: test features.
# They are held as pandas DataFrame / Series objects.

train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])
test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv')

# Data for the neural net (one-hot encoded files)
train_nn = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv')
train_y_nn = train_nn['target']
train_x_nn = train_nn.drop(columns=['target'])
test_x_nn = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv')

# ---------------------------------
# スタッキング
# ----------------------------------


# models.pyにModel1Xgb, Model1NN, Model2Linearを定義しているものとする
# 各クラスは、fitで学習し、predictで予測値の確率を出力する

from models import Model1Xgb, Model1NN, Model2Linear


# 学習データに対する「目的変数を知らない」予測値と、テストデータに対する予測値を返す関数
def predict_cv(model, train_x, train_y, test_x, *, n_splits=4, random_state=71):
    """Return out-of-fold train predictions and fold-averaged test predictions.

    The training rows are split with a shuffled ``KFold``. In every fold the
    same ``model`` object is refitted on the training part, then used to
    predict the held-out part (so each training row is predicted by a model
    that did not see its target) and the full test frame.

    NOTE: when executed in the same session as an earlier ``predict_cv``
    definition, this definition replaces it.

    Args:
        model: Object exposing ``fit(tr_x, tr_y, va_x, va_y)`` and
            ``predict(x)``.
        train_x: Training features; rows are selected positionally with
            ``.iloc``.
        train_y: Training target; rows are selected positionally with
            ``.iloc``.
        test_x: Test features passed unchanged to ``model.predict``.
        n_splits: Number of folds (keyword-only, defaults to 4).
        random_state: Seed for the fold shuffle (keyword-only, defaults to 71).

    Returns:
        A tuple ``(pred_train, preds_test)``: the held-out predictions put back
        into the original row order of ``train_x``, and the mean over folds of
        the test predictions.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_preds = []
    test_preds = []
    va_idxes = []

    # Train and predict fold by fold, keeping the predictions together with the
    # row positions they belong to.
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        oof_preds.append(model.predict(va_x))
        test_preds.append(model.predict(test_x))
        va_idxes.append(va_idx)

    # Concatenate the held-out predictions and restore the original row order.
    order = np.argsort(np.concatenate(va_idxes))
    pred_train = np.concatenate(oof_preds, axis=0)[order]

    # Average the per-fold predictions for the test rows.
    preds_test = np.mean(test_preds, axis=0)

    return pred_train, preds_test


def _clipped_logloss(y_true, pred, clip=1e-7):
    """Return the log loss of ``pred`` after clipping it to [clip, 1 - clip].

    Clipping is done here instead of through ``log_loss(..., eps=...)``: that
    argument is deprecated in scikit-learn 1.3 and removed in 1.5.
    """
    return log_loss(y_true, np.clip(pred, clip, 1 - clip))


# Layer-1 models
# pred_train_1a, pred_train_1b: out-of-fold predictions on the training rows
# pred_test_1a, pred_test_1b: predictions for the test rows
model_1a = Model1Xgb()
pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x)

model_1b = Model1NN()
pred_train_1b, pred_test_1b = predict_cv(model_1b, train_x_nn, train_y, test_x_nn)

# Layer-1 evaluation
print(f'logloss: {_clipped_logloss(train_y, pred_train_1a):.4f}')
print(f'logloss: {_clipped_logloss(train_y, pred_train_1b):.4f}')

# Use the layer-1 predictions as the features of the layer-2 model
train_x_2 = pd.DataFrame({'pred_1a': pred_train_1a, 'pred_1b': pred_train_1b})
test_x_2 = pd.DataFrame({'pred_1a': pred_test_1a, 'pred_1b': pred_test_1b})

# Layer-2 model
# pred_train_2: out-of-fold predictions of the layer-2 model on the training rows
# pred_test_2: layer-2 predictions for the test rows
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)
print(f'logloss: {_clipped_logloss(train_y, pred_train_2):.4f}')