In [1]:
import os
os.chdir('../../')

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scr.models.gbdt import Model1_CatBoost_1, Model1_CatBoost_2, Model1_CatBoost_3
from scr.models.random_forest import Model1_RandomForest_1, Model1_RandomForest_2
from scr.models.nn import Model1_NN_1, Model1_NN_2
from scr.models.linear import Model1_Logistic, Model2_Logistic

In [3]:
df_train = pd.read_csv('data/feature_engineered/null_cat/train_scaled_for_not_gbdt.csv')
df_test = pd.read_csv('data/feature_engineered/null_cat/test_scaled_for_not_gbdt.csv')

In [4]:
feature = [
    'Age', 
    'DurationOfPitch', 
    'NumberOfPersonVisiting',
    'NumberOfFollowups', 
    'NumberOfTrips', 
    'MonthlyIncome', 
    #'ProdTaken',
    'Motivation', 
    'EconomicPower', 
    'TripEasier', 
    'SalesPerformance',
    'LivingCost', 
    'EconomicStability', 
    'NumberOfTrips_log', 
    'TravelCost',
    'EconomicSegment', 
    'PackageMatch', 
    'Monetary', 
    #'ContractRate_FM',
    #'ContractRate_G1',
    #'ContractRate_G2', 
    #'ContractRate_G3',
    #'ContractRate_G4',
    #'ContractRate_G5', 
    #'ContractRate_G6',
    'TypeofContact_No',
    'TypeofContact_Self Enquiry',
    'CityTier_2',
    'CityTier_3',
    'Occupation_Salaried',
    'Occupation_Small Business',
    'Gender_male', 
    'ProductPitched_Deluxe', 
    'ProductPitched_King',
    'ProductPitched_Standard',
    'ProductPitched_Super Deluxe',
    'PreferredPropertyStar_4',
    'PreferredPropertyStar_5', 
    'Passport_1',
    'PitchSatisfactionScore_2', 
    'PitchSatisfactionScore_3',
    'PitchSatisfactionScore_4',
    'PitchSatisfactionScore_5',
    'Designation_Executive',
    'Designation_Manager',
    'Designation_Senior Manager', 
    'Designation_VP',
    'Marry_Married',
    'Marry_Single', 
    'Car_No Car', 
    'Child_1_child', 
    'Child_2_child',
    'Child_3_child',
    'AgeGroup_20s',
    'AgeGroup_30s',
    'AgeGroup_40s',
    'AgeGroup_50s', 
    'AgeGroup_60s', 
    'TypeofContactNULL_1',
    'Child01_1',
    'IsFamily_1',
    'FreaqencySeg_1',
    'FreaqencySeg_2',
    'MonetarySeg_2',
    'MonetarySeg_3', 
    'MonetarySeg_4', 
    'AgeNull', 
    'DurationOfPitchNull',
    'NumberOfTripsNull', 
    'MonthlyIncomeNull'
]

X = df_train[feature]
y = df_train['ProdTaken']

df_test = df_test[feature]

In [7]:
def predict_blending(model, X, y, df_test):
    preds = []
    preds_test = []
    va_idxes = []
    
    line = round(len(X) * (4/5))
    tr_x, va_x = X.iloc[:line], X.iloc[line:]
    tr_y, va_y = y.iloc[:line], y.iloc[line:]
    
    model.fit(tr_x, tr_y, va_x, va_y)
    pred_train = model.predict(va_x)
    pred_test = model.predict(df_test)
    preds.append(pred_train)
    preds_test.append(pred_test)
    
    return pred_train, pred_test

In [8]:
# 1層目のモデル

model_1a = Model1_CatBoost_1()
pred_train_1a, pred_test_1a = predict_blending(model_1a, X, y, df_test)

model_1b = Model1_CatBoost_2()
pred_train_1b, pred_test_1b = predict_blending(model_1b, X, y, df_test)

model_1c = Model1_CatBoost_3()
pred_train_1c, pred_test_1c = predict_blending(model_1c, X, y, df_test)

model_1d = Model1_RandomForest_1()
pred_train_1d, pred_test_1d = predict_blending(model_1d, X, y, df_test)

model_1e = Model1_RandomForest_2()
pred_train_1e, pred_test_1e = predict_blending(model_1e, X, y, df_test)

model_1f = Model1_NN_1()
pred_train_1f, pred_test_1f = predict_blending(model_1f, X, y, df_test)

model_1g = Model1_NN_2()
pred_train_1g, pred_test_1g = predict_blending(model_1g, X, y, df_test)

model_1h = Model1_Logistic()
pred_train_1h, pred_test_1h = predict_blending(model_1h, X, y, df_test)

0:	test: 0.6075403	best: 0.6075403 (0)	total: 167ms	remaining: 13m 54s
200:	test: 0.7615170	best: 0.7618466 (198)	total: 564ms	remaining: 13.5s
400:	test: 0.7689817	best: 0.7690786 (348)	total: 955ms	remaining: 10.9s
600:	test: 0.7692919	best: 0.7715798 (500)	total: 1.42s	remaining: 10.4s
800:	test: 0.7689429	best: 0.7715798 (500)	total: 1.85s	remaining: 9.7s
1000:	test: 0.7676633	best: 0.7715798 (500)	total: 2.27s	remaining: 9.07s
1200:	test: 0.7666550	best: 0.7715798 (500)	total: 2.7s	remaining: 8.54s
1400:	test: 0.7672755	best: 0.7715798 (500)	total: 3.14s	remaining: 8.07s
1600:	test: 0.7685358	best: 0.7715798 (500)	total: 3.61s	remaining: 7.67s
1800:	test: 0.7693501	best: 0.7715798 (500)	total: 4.04s	remaining: 7.18s
2000:	test: 0.7690980	best: 0.7715798 (500)	total: 4.44s	remaining: 6.66s
2200:	test: 0.7690593	best: 0.7715798 (500)	total: 4.89s	remaining: 6.22s
2400:	test: 0.7692144	best: 0.7715798 (500)	total: 5.32s	remaining: 5.76s
2600:	test: 0.7694083	best: 0.7715798 (500)	tot

In [9]:
# 1層目のモデルの評価
print(f'AUC: {roc_auc_score(y, pred_train_1a)}')
print(f'AUC: {roc_auc_score(y, pred_train_1b)}')
print(f'AUC: {roc_auc_score(y, pred_train_1c)}')
print(f'AUC: {roc_auc_score(y, pred_train_1d)}')
print(f'AUC: {roc_auc_score(y, pred_train_1e)}')
print(f'AUC: {roc_auc_score(y, pred_train_1f)}')
print(f'AUC: {roc_auc_score(y, pred_train_1g)}')
print(f'AUC: {roc_auc_score(y, pred_train_1h)}')

ValueError: Found input variables with inconsistent numbers of samples: [3489, 698]

In [10]:
# 予測値を特徴量としてデータフレームを作成
train_x_2 = pd.DataFrame({
    'pred_1a': pred_train_1a,
    'pred_1b': pred_train_1b,
    'pred_1c': pred_train_1c,
    'pred_1d': pred_train_1d,
    'pred_1e': pred_train_1e,
    'pred_1f': pred_train_1f,
    'pred_1g': pred_train_1g,
    'pred_1h': pred_train_1h
    })

test_x_2 = pd.DataFrame({
    'pred_1a': pred_test_1a,
    'pred_1b': pred_test_1b,
    'pred_1c': pred_test_1c,
    'pred_1d': pred_test_1d,
    'pred_1e': pred_test_1e,
    'pred_1f': pred_test_1f,
    'pred_1g': pred_test_1g,
    'pred_1h': pred_test_1h,
    })

In [11]:
# 2層目のモデル
# pred_train_2は、2層目のモデルの学習データのクロスバリデーションでの予測値
# pred_test_2は、2層目のモデルのテストデータの予測値
model_2 = Model2_Logistic()
pred_train_2, pred_test_2 = predict_blending(model_2, train_x_2, y, test_x_2)
print(f'AUC: {roc_auc_score(y, pred_train_2)}')

ValueError: Found input variables with inconsistent numbers of samples: [3489, 140]