In [1]:
import os
os.chdir("../")

In [10]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from scr.util import *

In [4]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.pipeline import Pipeline

# Data Set

In [5]:
feature = [
    'Age',
    'TypeofContact',
    'CityTier',
    'DurationOfPitch',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'Passport',
    'PitchSatisfactionScore',
    'Designation',
    'MonthlyIncome',
    'Marry',
    'Car',
    'Child',
    'AgeGroup'    # 作成
]

In [9]:
pass_train = 'data/null_survey/train_age_ok.csv'
pass_test = 'data/null_survey/test_age_ok.csv'

df = pd.read_csv(pass_train)
df_submit = pd.read_csv(pass_test)

In [13]:
def mapping_first_category_plus_agegroup(df_train, df_test):

    mapping = {
        'No': 0,
        'Self Enquiry': 1,
        'Company Invited': 2
    }
    df_train.loc[:, 'TypeofContact'] = df_train.loc[:, 'TypeofContact'].map(mapping)
    df_test.loc[:, 'TypeofContact'] = df_test.loc[:, 'TypeofContact'].map(mapping)

    mapping = {
        'Salaried': 0,
        'Small Business': 1,
        'Large Business': 2
    }
    df_train.loc[:, 'Occupation'] = df_train.loc[:, 'Occupation'].map(mapping)
    df_test.loc[:, 'Occupation'] = df_test.loc[:, 'Occupation'].map(mapping)

    mapping = {
        'male': 0,
        'female': 1
    }
    df_train.loc[:, 'Gender'] = df_train.loc[:, 'Gender'].map(mapping)
    df_test.loc[:, 'Gender'] = df_test.loc[:, 'Gender'].map(mapping)

    mapping = {
        'Super Deluxe': 0,
        'Standard': 1,
        'King': 2,
        'Deluxe': 3,
        'Basic': 4
    }
    df_train.loc[:, 'ProductPitched'] = df_train.loc[:, 'ProductPitched'].map(mapping)
    df_test.loc[:, 'ProductPitched'] = df_test.loc[:, 'ProductPitched'].map(mapping)

    mapping = {
        'Manager': 0,
        'VP': 1,
        'AVP': 2,
        'Senior Manager': 3,
        'Executive': 4
    }

    df_train.loc[:, 'Designation'] = df_train.loc[:, 'Designation'].map(mapping)
    df_test.loc[:, 'Designation'] = df_test.loc[:, 'Designation'].map(mapping)

    mapping = {
        'Married': 0,
        'Single': 1,
        'Divorced': 2,
    }
    df_train.loc[:, 'Marry'] = df_train.loc[:, 'Marry'].map(mapping)
    df_test.loc[:, 'Marry'] = df_test.loc[:, 'Marry'].map(mapping)

    mapping = {
        'No Car': 0,
        'Has Car': 1,
    }
    df_train.loc[:, 'Car'] = df_train.loc[:, 'Car'].map(mapping)
    df_test.loc[:, 'Car'] = df_test.loc[:, 'Car'].map(mapping)

    mapping = {
        '0_child': 0,
        '1_child': 1,
        '2_child': 2,
        '3_child': 3
    }
    df_train.loc[:, 'Child'] = df_train.loc[:, 'Child'].map(mapping)
    df_test.loc[:, 'Child'] = df_test.loc[:, 'Child'].map(mapping)
    
    mapping = {
        '10s': 1,
        '20s': 2,
        '30s': 3,
        '40s': 4,
        '50s': 5,
        '60s': 6
    }
    df_train.loc[:, 'AgeGroup'] = df_train.loc[:, 'AgeGroup'].map(mapping)
    df_test.loc[:, 'AgeGroup'] = df_test.loc[:, 'AgeGroup'].map(mapping)
    return df_train, df_test

In [14]:
df, df_submit = mapping_first_category_plus_agegroup(df, df_submit)


In [18]:
X = df[feature]
y = df['ProdTaken']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Gradient Boosting Decision Tree

## xgboost

In [25]:

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.005,
    'gamma': 0.01,
    'alpha': 0.01,
    'lambda': 0.9,
    'min_child_weight': 1,
    'max_depth': 5, # 5 ~ 9
    'subsample': 0.79, # 0.6 = 0.95
    'colsample_bytree': 0.65, # 0.6 ~ 0.96
    'eval_metric': 'auc',
    'seed': 42
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=5,
    evals=[(dtest, 'test')]
)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Occupation: object, Gender: object, ProductPitched: object, Designation: object, Marry: object, Car: object, Child: object, AgeGroup: object

In [69]:
df_submit = xgb.DMatrix(df_submit)

y_submit = xgb_model.predict(df_submit)

## xgboost (Grid Search)

In [125]:
cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=1000,
        seed=42,
        nfold=5,
        metrics='auc',
        early_stopping_rounds=10
    )

In [130]:
grid_params = [
    (eta, max_depth)
    for eta in [0.1, 0.75, 0.50, 0.25, 0.01]
    for max_depth in [5, 6, 7, 8, 9, 10]
]

max_auc = float(0)
best_params = []

for eta, max_depth in grid_params:
    print('eta={}, max_depth={}'.format(eta, max_depth))
    
    params['eta'] = eta
    params['max_depth'] = max_depth
    
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=1000,
        seed=42,
        nfold=5,
        metrics='auc',
        early_stopping_rounds=10
    )
    
    mean_auc = cv_results['test-auc-mean'].max()
    boost_rounds = cv_results['test-auc-mean'].argmax()
    print('AUC {} for {} rounds'.format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (eta, max_depth)

print('Best params {}, AUC {}'.format(best_params, max_auc))

eta=0.1, max_depth=5
AUC 0.8198583840482406 for 18 rounds
eta=0.1, max_depth=6
AUC 0.8175357852082994 for 13 rounds
eta=0.1, max_depth=7
AUC 0.814336881349077 for 17 rounds
eta=0.1, max_depth=8
AUC 0.8143719119710597 for 13 rounds
eta=0.1, max_depth=9
AUC 0.814579905837995 for 13 rounds
eta=0.1, max_depth=10
AUC 0.816916563639458 for 18 rounds
eta=0.75, max_depth=5
AUC 0.7852027177889714 for 3 rounds
eta=0.75, max_depth=6
AUC 0.7770512948982504 for 3 rounds
eta=0.75, max_depth=7
AUC 0.7762240599948753 for 4 rounds
eta=0.75, max_depth=8
AUC 0.7648290912305058 for 13 rounds
eta=0.75, max_depth=9
AUC 0.7572965278721075 for 6 rounds
eta=0.75, max_depth=10
AUC 0.7555143082921678 for 14 rounds
eta=0.5, max_depth=5
AUC 0.8019297439973402 for 13 rounds
eta=0.5, max_depth=6
AUC 0.7906895884513346 for 8 rounds
eta=0.5, max_depth=7
AUC 0.788338402748352 for 8 rounds
eta=0.5, max_depth=8
AUC 0.7895147808995244 for 5 rounds
eta=0.5, max_depth=9
AUC 0.7812349995289433 for 4 rounds
eta=0.5, max_depth

## xgboost (Hyper parameter tuning)

In [34]:
# def objective(params):
#     params = {
#         'min_child_weight': int(params['min_child_weight']),
#         'max_depth': int(params['max_depth']),  # 整数にキャスト
#         'subsample': float(params['subsample']),
#         'colsample_bytree': float(params['colsample_bytree']),
#         'gamma': float(params['gamma']),
#         'alpha': float(params['alpha']),
#         'lambda': float(params['lambda']),
#         'objective': 'binary:logistic',
#         'eval_metric': 'auc',
#         'booster': 'gbtree'
#     }
    
#     cv_result = xgb.cv(
#         params=params,
#         dtrain=dtrain,
#         num_boost_round=1000,
#         nfold=5,
#         metrics='auc',
#         early_stopping_rounds=10,
#         seed=42
#     )
    
#     # 最大のAUCスコアを最小化する目的で返す
#     max_auc = max(cv_result['test-auc-mean'])
#     return {'loss': -max_auc, 'status': STATUS_OK}

# param_space = {
#     'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)),
#     'max_depth': hp.quniform('max_depth', 3, 9, 1),
#     'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
#     'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05),
#     'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
#     'alpha': hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
#     'lambda': hp.loguniform('lambda', np.log(1e-8), np.log(10.0)),
# }

# trials = Trials()
# best = fmin(
#     fn=objective,
#     space=param_space,
#     algo=tpe.suggest,
#     max_evals=100,
#     trials=trials,
# )

# print("Best parameters:", best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [00:17<00:00,  5.58trial/s, best loss: -0.8319621473813135]
Best parameters: {'alpha': 1.4842369686829721e-08, 'colsample_bytree': 0.9, 'gamma': 1.2015666902947808e-08, 'lambda': 2.758064123709704e-07, 'max_depth': 3.0, 'min_child_weight': 9.569766694332625, 'subsample': 0.7000000000000001}


In [35]:
best_params = {
    'min_child_weight': int(best['min_child_weight']),
    'max_depth': int(best['max_depth']),
    'subsample': best['subsample'],
    'colsample_bytree': best['colsample_bytree'],
    'gamma': best['gamma'],
    'alpha': best['alpha'],
    'lambda': best['lambda'],
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'booster': 'gbtree',
    'seed': 42
}

final_model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)

[0]	test-auc:0.78585
[1]	test-auc:0.82703
[2]	test-auc:0.82025
[3]	test-auc:0.82469
[4]	test-auc:0.83042
[5]	test-auc:0.83218
[6]	test-auc:0.83171
[7]	test-auc:0.83216
[8]	test-auc:0.83365
[9]	test-auc:0.83373
[10]	test-auc:0.83639
[11]	test-auc:0.83478
[12]	test-auc:0.83232
[13]	test-auc:0.83183
[14]	test-auc:0.83302
[15]	test-auc:0.83418
[16]	test-auc:0.83622
[17]	test-auc:0.83517
[18]	test-auc:0.83249
[19]	test-auc:0.83195
[20]	test-auc:0.83230


## catboost

In [20]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
cv_list = list(skf.split(X, y))

def train_catboost(X, y, cv, params: dict = None):
    if params is None:
        params = {}

    models = []
    n_records = len(X)
    oof_pred = np.zeros((n_records, ))
    for i, (tr_idx, va_idx) in enumerate(cv):
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        model = cb.CatBoostRegressor(**params)
        model.fit(tr_x, tr_y, eval_set=(va_x, va_y), use_best_model=True, verbose=100)
        oof_pred[va_idx] = model.predict(va_x)
        models.append(model)
    return oof_pred, models

params = {
    'loss_function': 'RMSE',
    'eval_metric': 'AUC',
    'iterations': 5000,
    'learning_rate': 0.005,
    'depth': 5,
    'l2_leaf_reg': 3,    # 3 ~ 10
    'verbose': 200,
    'random_seed': 42,
}

oof, models = train_catboost(X, y, cv_list, params)

0:	test: 0.7532107	best: 0.7532107 (0)	total: 133ms	remaining: 11m 4s
100:	test: 0.8242977	best: 0.8250669 (98)	total: 271ms	remaining: 13.1s
200:	test: 0.8269398	best: 0.8287124 (152)	total: 411ms	remaining: 9.8s
300:	test: 0.8322074	best: 0.8323411 (299)	total: 536ms	remaining: 8.37s
400:	test: 0.8346154	best: 0.8346154 (400)	total: 664ms	remaining: 7.62s
500:	test: 0.8367559	best: 0.8367559 (500)	total: 797ms	remaining: 7.16s
600:	test: 0.8375084	best: 0.8375084 (600)	total: 928ms	remaining: 6.79s
700:	test: 0.8385953	best: 0.8386789 (699)	total: 1.05s	remaining: 6.42s
800:	test: 0.8391639	best: 0.8395987 (771)	total: 1.17s	remaining: 6.12s
900:	test: 0.8400334	best: 0.8400334 (900)	total: 1.31s	remaining: 5.95s
1000:	test: 0.8403846	best: 0.8404515 (998)	total: 1.44s	remaining: 5.74s
1100:	test: 0.8403512	best: 0.8405686 (1078)	total: 1.54s	remaining: 5.47s
1200:	test: 0.8403344	best: 0.8405686 (1078)	total: 1.66s	remaining: 5.24s
1300:	test: 0.8411706	best: 0.8412040 (1296)	total:

In [21]:
y_submit = np.zeros(len(df_submit))

# アンサンブル
for model in models:
    y_submit += model.predict(df_submit) / len(models)

# Nural Network

# 提出

In [22]:
index = pd.read_csv('data/test.csv')['id'].values

df_submit = pd.DataFrame({
    "id": index,
    "prediction": y_submit
})

df_submit.head()

Unnamed: 0,id,prediction
0,3489,0.074173
1,3490,0.223132
2,3491,0.299318
3,3492,0.456694
4,3493,0.31731


In [23]:
path = 'submission/submit_10_年齢を回帰モデルで補完した.csv'

In [24]:
df_submit.to_csv(path, index=False, header=None)