## 学習用ノートブック

In [13]:
# ライブラリの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import catboost as cb

In [14]:
# Load the preprocessed training data.
train_p = pd.read_csv(filepath_or_buffer='../output/train_preprocessed.csv')

In [15]:
# Explanatory variables (feature columns) fed to the model.
# The order here is the column order of the training matrix.
feature_cols = [
    'Age_int',
    'TypeofContact_int',
    'CityTier',
    'Duration_int',
    'Occupation_int',
    'Gender_int',
    'NumberOfPersonVisiting',
    'NumberOfFollowups_normalized',
    'ProductPitched_int',
    'PreferredPropertyStar',
    'NumberOfTrips_int',
    'Passport',
    'PitchSatisfactionScore',
    'Designation_int',
    'MonthlyIncome_int',
    'customer_info_kekkon_int',
    'customer_info_car_int',
    'customer_info_child_int',
]

# Target variable (the column to predict).
target_col = 'ProdTaken'

In [16]:
# Build the fold indices used for cross-validation.
from sklearn.model_selection import StratifiedKFold

N_SPLITS = 5

# Shuffled, stratified on the target column; random_state is fixed so the
# same folds are produced on every run.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)

# Materialise the (train_indices, validation_indices) pairs into a list so the
# folds can be iterated more than once.
cv_list = [*skf.split(X=train_p[feature_cols], y=train_p[target_col])]

In [17]:
# Helper that trains CatBoost models over cross-validation folds.
def train_catboost(X, y, cv, params: dict = None):
    """Fit one ``cb.CatBoostRegressor`` per fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : object supporting ``len()`` and positional ``.iloc`` indexing
        Feature table (e.g. a pandas DataFrame).
    y : object supporting positional ``.iloc`` indexing
        Target values aligned row-by-row with ``X``.
    cv : iterable of (train_indices, validation_indices) pairs
        Positional row indices for each fold. It is iterated exactly once.
    params : dict, optional
        Keyword arguments forwarded to ``cb.CatBoostRegressor``. ``None`` is
        treated as an empty dict.

    Returns
    -------
    oof_pred : numpy.ndarray of shape ``(len(X),)``
        Predictions written at each fold's validation indices. Rows that
        never appear in a validation fold keep their initial value of 0.
    models : list
        The fitted models, one per fold, in fold order.

    Notes
    -----
    A regressor is used, so the stored predictions are whatever
    ``model.predict`` returns for a regressor (not class labels).
    """
    model_kwargs = {} if params is None else params

    fold_models = []
    oof_pred = np.zeros((len(X), ))
    for fit_idx, holdout_idx in cv:
        fit_x, holdout_x = X.iloc[fit_idx], X.iloc[holdout_idx]
        fit_y, holdout_y = y.iloc[fit_idx], y.iloc[holdout_idx]

        regressor = cb.CatBoostRegressor(**model_kwargs)
        # The fold's held-out rows double as the evaluation set passed to fit().
        regressor.fit(
            fit_x,
            fit_y,
            eval_set=(holdout_x, holdout_y),
            use_best_model=True,
            verbose=100,
        )

        # Only the held-out rows are predicted, so every stored prediction
        # comes from a model that was not fitted on that row.
        oof_pred[holdout_idx] = regressor.predict(holdout_x)
        fold_models.append(regressor)

    return oof_pred, fold_models

In [18]:
# Train CatBoost on every cross-validation fold.
#
# NOTE(review): train_catboost() also passes verbose=100 to fit(), and the
# training log is printed every 100 iterations, so the 'verbose' value below
# does not appear to take effect — confirm which one is intended.
params = {
    # Objective and monitored metric
    'loss_function': 'RMSE',
    'eval_metric': 'AUC',
    # Boosting schedule and tree shape
    'iterations': 10000,
    'learning_rate': 0.01,
    'depth': 6,
    # Logging and reproducibility
    'verbose': 200,
    'random_seed': 0,
}

oof, models = train_catboost(
    X=train_p[feature_cols],
    y=train_p[target_col],
    cv=cv_list,
    params=params,
)

0:	test: 0.8007802	best: 0.8007802 (0)	total: 1.41ms	remaining: 14.1s
100:	test: 0.8279050	best: 0.8279253 (36)	total: 129ms	remaining: 12.6s
200:	test: 0.8297946	best: 0.8305260 (166)	total: 247ms	remaining: 12s
300:	test: 0.8312778	best: 0.8314200 (264)	total: 364ms	remaining: 11.7s
400:	test: 0.8298149	best: 0.8318874 (344)	total: 494ms	remaining: 11.8s
500:	test: 0.8309527	best: 0.8318874 (344)	total: 618ms	remaining: 11.7s
600:	test: 0.8298759	best: 0.8318874 (344)	total: 735ms	remaining: 11.5s
700:	test: 0.8283520	best: 0.8318874 (344)	total: 872ms	remaining: 11.6s
800:	test: 0.8271532	best: 0.8318874 (344)	total: 986ms	remaining: 11.3s
900:	test: 0.8253449	best: 0.8318874 (344)	total: 1.1s	remaining: 11.1s
1000:	test: 0.8251417	best: 0.8318874 (344)	total: 1.22s	remaining: 11s
1100:	test: 0.8251214	best: 0.8318874 (344)	total: 1.33s	remaining: 10.7s
1200:	test: 0.8239226	best: 0.8318874 (344)	total: 1.44s	remaining: 10.5s
1300:	test: 0.8229474	best: 0.8318874 (344)	total: 1.55s	

In [19]:
# Save each fold's model with pickle.
import pickle
from pathlib import Path

# Create the output directory up front: open(..., 'wb') does not create
# missing parent directories, and failing here would discard the models
# after the whole training run has finished.
model_dir = Path('../model')
model_dir.mkdir(parents=True, exist_ok=True)

# One file per fold, named by the fold's position in `models`.
for i, model in enumerate(models):
    with open(model_dir / f'catboost_model_{i}.pickle', 'wb') as f:
        pickle.dump(model, f)