In [1]:
import os
os.chdir("../")

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from scr.util import *

In [3]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.pipeline import Pipeline

# Data Set

In [75]:
pass_train = 'data/feature_engineered/train_feature_ok.csv'
pass_test = 'data/feature_engineered/test_feature_ok.csv'

df = pd.read_csv(pass_train)
df_submit = pd.read_csv(pass_test)


# 特徴量選択
feature = [
    'Age',
    'TypeofContact',
    'CityTier',
    'DurationOfPitch',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'Passport',
    'PitchSatisfactionScore',
    'Designation',
    'MonthlyIncome',
    'Marry',
    'Car',
    'Child',
    # 以下、作成特徴量
    ###'AgeGroup',
    'TypeofContactNULL',
    ##'Motivation',
    ##'EconomicPower',
    'Child01',
    'TripEasier',
    ##'SalesPerformance',
    ##'LivingCost',
    ##'EconomicStability'
]

# 型選択
float_columns = ['DurationOfPitch', 'MonthlyIncome'] + ['EconomicPower']
int_columns = ['Age', 'NumberOfTrips', 'TypeofContact', 'CityTier', 'Occupation', 
                'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched', 
                'PreferredPropertyStar', 'Passport', 'PitchSatisfactionScore', 'Designation',
                'Marry', 'Car', 'Child'] + ['AgeGroup', 'TypeofContactNULL', 'Motivation', 'TripEasier', 'SalesPerformance', 'LivingCost', 'EconomicStability']
# catboostで措定
category_columns = ['TypeofContact', 'CityTier', 'Occupation', 'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched', 
                'PreferredPropertyStar', 'Passport', 'PitchSatisfactionScore', 'Designation', 'Marry', 'Car'] # Child


df = mapping_columns_if_exist(df)
df_submit = mapping_columns_if_exist(df_submit)
df, df_submit = convert_type(df, df_submit, float_columns=float_columns, int_columns=int_columns)


X = df[feature]
y = df['ProdTaken']
df_submit = df_submit[feature]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Gradient Boosting Decision Tree

## xgboost

In [24]:

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.001,
    'gamma': 1.2,    # 大きくすると、過学習を防ぐ
    'alpha': 1.2,    # 増やすと、過学習を防ぐ
    'lambda': 1.0,    # 増やすと、過学習を防ぐ
    'min_child_weight': 1.0,
    'max_depth': 3, # 3 ~ 9
    'subsample': 0.75, # 0.6 = 0.95 低くすると、過学習防ぐ
    'colsample_bytree': 0.75, # 0.6 ~ 0.96　低くすると、過学習防ぐ
    'eval_metric': 'auc',
    'seed': 0
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

num_iter = 10000
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_iter,
    early_stopping_rounds=num_iter,
    evals=[(dtest, 'test')]
)

print("Best iteration:", xgb_model.best_iteration)
print("Best AUC on test set:", xgb_model.best_score)

[0]	test-auc:0.78899
[1]	test-auc:0.80806
[2]	test-auc:0.81263


[3]	test-auc:0.81424
[4]	test-auc:0.81208
[5]	test-auc:0.81053
[6]	test-auc:0.81109
[7]	test-auc:0.81209
[8]	test-auc:0.81216
[9]	test-auc:0.81628
[10]	test-auc:0.81388
[11]	test-auc:0.81424
[12]	test-auc:0.82131
[13]	test-auc:0.82031
[14]	test-auc:0.82004
[15]	test-auc:0.81902
[16]	test-auc:0.82043
[17]	test-auc:0.82067
[18]	test-auc:0.82112
[19]	test-auc:0.82450
[20]	test-auc:0.82364
[21]	test-auc:0.82272
[22]	test-auc:0.82365
[23]	test-auc:0.82334
[24]	test-auc:0.82272
[25]	test-auc:0.82307
[26]	test-auc:0.82302
[27]	test-auc:0.82365
[28]	test-auc:0.82639
[29]	test-auc:0.82586
[30]	test-auc:0.82553
[31]	test-auc:0.82574
[32]	test-auc:0.82618
[33]	test-auc:0.82503
[34]	test-auc:0.82447
[35]	test-auc:0.82445
[36]	test-auc:0.82437
[37]	test-auc:0.82405
[38]	test-auc:0.82429
[39]	test-auc:0.82420
[40]	test-auc:0.82393
[41]	test-auc:0.82375
[42]	test-auc:0.82359
[43]	test-auc:0.82370
[44]	test-auc:0.82383
[45]	test-auc:0.82383
[46]	test-auc:0.82434
[47]	test-auc:0.82344
[48]	test-auc:0.8

In [30]:
df_submit = xgb.DMatrix(df_submit)

best_iteration = xgb_model.best_iteration
y_submit = xgb_model.predict(df_submit)

## xgboost (Grid Search)

In [125]:
cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=1000,
        seed=42,
        nfold=5,
        metrics='auc',
        early_stopping_rounds=10
    )

In [130]:
grid_params = [
    (eta, max_depth)
    for eta in [0.1, 0.75, 0.50, 0.25, 0.01]
    for max_depth in [5, 6, 7, 8, 9, 10]
]

max_auc = float(0)
best_params = []

for eta, max_depth in grid_params:
    print('eta={}, max_depth={}'.format(eta, max_depth))
    
    params['eta'] = eta
    params['max_depth'] = max_depth
    
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=1000,
        seed=42,
        nfold=5,
        metrics='auc',
        early_stopping_rounds=10
    )
    
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print('AUC {} for {} rounds'.format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (eta, max_depth)

print('Best params {}, AUC {}'.format(best_params, max_auc))

eta=0.1, max_depth=5
AUC 0.8198583840482406 for 18 rounds
eta=0.1, max_depth=6
AUC 0.8175357852082994 for 13 rounds
eta=0.1, max_depth=7
AUC 0.814336881349077 for 17 rounds
eta=0.1, max_depth=8
AUC 0.8143719119710597 for 13 rounds
eta=0.1, max_depth=9
AUC 0.814579905837995 for 13 rounds
eta=0.1, max_depth=10
AUC 0.816916563639458 for 18 rounds
eta=0.75, max_depth=5
AUC 0.7852027177889714 for 3 rounds
eta=0.75, max_depth=6
AUC 0.7770512948982504 for 3 rounds
eta=0.75, max_depth=7
AUC 0.7762240599948753 for 4 rounds
eta=0.75, max_depth=8
AUC 0.7648290912305058 for 13 rounds
eta=0.75, max_depth=9
AUC 0.7572965278721075 for 6 rounds
eta=0.75, max_depth=10
AUC 0.7555143082921678 for 14 rounds
eta=0.5, max_depth=5
AUC 0.8019297439973402 for 13 rounds
eta=0.5, max_depth=6
AUC 0.7906895884513346 for 8 rounds
eta=0.5, max_depth=7
AUC 0.788338402748352 for 8 rounds
eta=0.5, max_depth=8
AUC 0.7895147808995244 for 5 rounds
eta=0.5, max_depth=9
AUC 0.7812349995289433 for 4 rounds
eta=0.5, max_depth

## xgboost (Hyper parameter tuning)

In [34]:
# def objective(params):
#     params = {
#         'min_child_weight': int(params['min_child_weight']),
#         'max_depth': int(params['max_depth']),  # 整数にキャスト
#         'subsample': float(params['subsample']),
#         'colsample_bytree': float(params['colsample_bytree']),
#         'gamma': float(params['gamma']),
#         'alpha': float(params['alpha']),
#         'lambda': float(params['lambda']),
#         'objective': 'binary:logistic',
#         'eval_metric': 'auc',
#         'booster': 'gbtree'
#     }
    
#     cv_result = xgb.cv(
#         params=params,
#         dtrain=dtrain,
#         num_boost_round=1000,
#         nfold=5,
#         metrics='auc',
#         early_stopping_rounds=10,
#         seed=42
#     )
    
#     # 最大のAUCスコアを最小化する目的で返す
#     max_auc = max(cv_result['test-auc-mean'])
#     return {'loss': -max_auc, 'status': STATUS_OK}

# param_space = {
#     'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)),
#     'max_depth': hp.quniform('max_depth', 3, 9, 1),
#     'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
#     'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
#     'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
#     'alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
#     'lambda': hp.loguniform('lambda', np.log(1e-8), np.log(10.0)),
# }

# trials = Trials()
# best = fmin(
#     fn=objective,
#     space=param_space,
#     algo=tpe.suggest,
#     max_evals=100,
#     trials=trials,
# )

# print("Best parameters:", best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [00:17<00:00,  5.58trial/s, best loss: -0.8319621473813135]
Best parameters: {'alpha': 1.4842369686829721e-08, 'colsample_bytree': 0.9, 'gamma': 1.2015666902947808e-08, 'lambda': 2.758064123709704e-07, 'max_depth': 3.0, 'min_child_weight': 9.569766694332625, 'subsample': 0.7000000000000001}


In [35]:
# best_params = {
#     'min_child_weight': int(best['min_child_weight']),
#     'max_depth': int(best['max_depth']),
#     'subsample': best['subsample'],
#     'colsample_bytree': best['colsample_bytree'],
#     'gamma': best['gamma'],
#     'alpha': best['alpha'],
#     'lambda': best['lambda'],
#     'objective': 'binary:logistic',
#     'eval_metric': 'auc',
#     'booster': 'gbtree',
#     'seed': 42
# }

# final_model = xgb.train(
#     params=best_params,
#     dtrain=dtrain,
#     num_boost_round=1000,
#     evals=[(dtest, 'test')],
#     early_stopping_rounds=10
# )

[0]	test-auc:0.78585
[1]	test-auc:0.82703
[2]	test-auc:0.82025
[3]	test-auc:0.82469
[4]	test-auc:0.83042
[5]	test-auc:0.83218
[6]	test-auc:0.83171
[7]	test-auc:0.83216
[8]	test-auc:0.83365
[9]	test-auc:0.83373
[10]	test-auc:0.83639
[11]	test-auc:0.83478
[12]	test-auc:0.83232
[13]	test-auc:0.83183
[14]	test-auc:0.83302
[15]	test-auc:0.83418
[16]	test-auc:0.83622
[17]	test-auc:0.83517
[18]	test-auc:0.83249
[19]	test-auc:0.83195
[20]	test-auc:0.83230


## catboost

カテゴリ変数を明確に与える必要がある。

In [76]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
cv_list = list(skf.split(X, y))

def train_catboost(X, y, cv, params: dict = None):
    if params is None:
        params = {}

    models = []
    n_records = len(X)
    oof_pred = np.zeros((n_records, ))
    for i, (tr_idx, va_idx) in enumerate(cv):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        model = cb.CatBoostRegressor(**params)
        model.fit(tr_x, tr_y, eval_set=(va_x, va_y), use_best_model=True, verbose=100)
        oof_pred[va_idx] = model.predict(va_x)
        models.append(model)
    return oof_pred, models

params = {
    'loss_function': 'RMSE',
    'eval_metric': 'AUC',
    'iterations': 8000,
    'learning_rate': 0.005,
    'depth': 5,
    # 'cat_features': category_columns,
    'l2_leaf_reg': 7,    # 3 ~ 10
    'verbose': 200,
    'random_seed': 42,
}

oof, models = train_catboost(X, y, cv_list, params)

0:	test: 0.7771488	best: 0.7771488 (0)	total: 1.45ms	remaining: 11.6s
100:	test: 0.8267224	best: 0.8301505 (68)	total: 124ms	remaining: 9.7s
200:	test: 0.8329599	best: 0.8332609 (196)	total: 245ms	remaining: 9.51s
300:	test: 0.8339130	best: 0.8345652 (253)	total: 358ms	remaining: 9.15s
400:	test: 0.8354181	best: 0.8355853 (397)	total: 485ms	remaining: 9.19s
500:	test: 0.8374247	best: 0.8375585 (495)	total: 616ms	remaining: 9.22s
600:	test: 0.8385284	best: 0.8385619 (579)	total: 747ms	remaining: 9.2s
700:	test: 0.8396823	best: 0.8397659 (695)	total: 885ms	remaining: 9.21s
800:	test: 0.8407525	best: 0.8408361 (794)	total: 1.01s	remaining: 9.1s
900:	test: 0.8415886	best: 0.8416890 (887)	total: 1.13s	remaining: 8.91s
1000:	test: 0.8424749	best: 0.8424749 (999)	total: 1.25s	remaining: 8.76s
1100:	test: 0.8432441	best: 0.8432441 (1100)	total: 1.37s	remaining: 8.6s
1200:	test: 0.8431940	best: 0.8434950 (1154)	total: 1.49s	remaining: 8.42s
1300:	test: 0.8431773	best: 0.8435619 (1225)	total: 1.

In [71]:
for col in category_columns:
    df_submit[col] = df_submit[col].astype('str')

y_submit = np.zeros(len(df_submit))
for model in models:
    y_submit += model.predict(df_submit) / len(models)

# Nural Network

# 提出

In [72]:
index = pd.read_csv('data/test.csv')['id'].values

df_submit = pd.DataFrame({
    "id": index,
    "prediction": y_submit
})

df_submit.head()

Unnamed: 0,id,prediction
0,3489,0.057189
1,3490,0.235104
2,3491,0.299855
3,3492,0.4481
4,3493,0.389674


In [73]:
path = 'submission/submit_16_cat_featureselection.csv'

In [74]:
df_submit.to_csv(path, index=False, header=None)