In [1]:
import os

# Move the working directory one level up so that the relative 'data/...' paths
# used by the following cells resolve (presumably the notebook lives in a
# sub-folder of the project root -- confirm against the repository layout).
# The guard makes the cell idempotent: an unconditional chdir would climb one
# more directory every time the cell is re-run.
if not os.path.isdir("data"):
    os.chdir("../")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import catboost as cb
from scr.util import *

In [3]:
# Prepare the datasets

pass_train = 'data/feature_engineered/train_feature_ok.csv'
pass_test = 'data/feature_engineered/test_feature_ok.csv'


# Feature selection
feature = [
    # original columns
    'Age', 'TypeofContact', 'CityTier', 'DurationOfPitch', 'Occupation', 'Gender',
    'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched',
    'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore',
    'Designation', 'MonthlyIncome', 'Marry', 'Car', 'Child',
    # engineered features
    'AgeGroup', 'TypeofContactNULL', 'Motivation', 'EconomicPower', 'Child01',
    'TripEasier', 'SalesPerformance', 'LivingCost', 'EconomicStability',
    'TravelCost', 'NumberOfTrips_log', 'EconomicSegment', 'PackageMatch',
]

# Dtype selection (original columns first, engineered features after)
# NOTE(review): 'Child01' and 'EconomicSegment' appear in `feature` but in
# neither dtype list below -- confirm this is intentional.
float_columns = [
    'DurationOfPitch', 'MonthlyIncome',
    'EconomicPower', 'TravelCost', 'NumberOfTrips_log',
]
int_columns = [
    'Age', 'NumberOfTrips', 'TypeofContact', 'CityTier', 'Occupation',
    'Gender', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched',
    'PreferredPropertyStar', 'Passport', 'PitchSatisfactionScore', 'Designation',
    'Marry', 'Car', 'Child',
    'AgeGroup', 'TypeofContactNULL', 'Motivation', 'TripEasier',
    'SalesPerformance', 'LivingCost', 'EconomicStability', 'PackageMatch',
]
# Columns designated as categorical for CatBoost
# (not referenced anywhere in the cells visible here)
category_columns = [
    'TypeofContact', 'CityTier', 'Occupation', 'Gender', 'NumberOfPersonVisiting',
    'NumberOfFollowups', 'ProductPitched', 'PreferredPropertyStar', 'Passport',
    'PitchSatisfactionScore', 'Designation', 'Marry', 'Car', 'Child',
    'EconomicSegment',
]


# Load the train / submission tables, then apply the project helpers for
# value mapping and dtype conversion.
df = pd.read_csv(pass_train)
df_submit = pd.read_csv(pass_test)

df = mapping_columns_if_exist(df)
df_submit = mapping_columns_if_exist(df_submit)
df, df_submit = convert_type(
    df,
    df_submit,
    float_columns=float_columns,
    int_columns=int_columns,
)


# Model inputs: selected features, the 'ProdTaken' target, and a stratified
# 80/20 hold-out split.
X = df[feature]
y = df['ProdTaken']
df_submit = df_submit[feature]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Stratified 5-fold CV. The (train_idx, valid_idx) pairs are materialised once
# so that the same folds can be reused by every training call.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)
cv_list = [(train_idx, valid_idx) for train_idx, valid_idx in skf.split(X, y)]

def train_xgboost(X, y, cv, params: dict = None, num_boost_round: int = 8000,
                  early_stopping_rounds: int = 100, verbose_eval=100):
    """Train one XGBoost booster per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target, positionally aligned with ``X``.
    cv : iterable of (train_idx, valid_idx)
        Positional index pairs, e.g. ``list(StratifiedKFold(...).split(X, y))``.
    params : dict, optional
        Booster parameters forwarded to ``xgb.train``. When omitted, a default
        binary-classification / AUC configuration is used.
    num_boost_round : int, default 8000
        Upper bound on boosting rounds per fold.
    early_stopping_rounds : int, default 100
        Training stops when the validation metric has not improved for this
        many rounds.
    verbose_eval : bool or int, default 100
        Forwarded to ``xgb.train`` to control evaluation logging.

    Returns
    -------
    oof_pred : numpy.ndarray, shape (len(X),)
        Out-of-fold predictions. Rows that never appear in a validation fold
        keep the initial value 0.
    models : list
        The trained booster of each fold, in fold order.
    """
    if params is None:
        # 'n_estimators' is deliberately absent: it belongs to the scikit-learn
        # wrapper and is ignored (with a warning) by xgb.train, where the number
        # of rounds is given by num_boost_round.
        params = {
            'objective': 'binary:logistic',  # Adjust for your specific objective
            'eval_metric': 'auc',
            'learning_rate': 0.05,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'seed': 42
        }

    models = []
    oof_pred = np.zeros((len(X), ))
    for tr_idx, va_idx in cv:
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)

        # Early stopping watches the last entry of `evals`, so the validation
        # fold must stay last in this list.
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        model = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                          evals=watchlist,
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=verbose_eval)

        # xgb.train returns the booster from the LAST round, which is
        # `early_stopping_rounds` past the best one. Restrict prediction to the
        # best iteration explicitly so that OOF scores do not depend on the
        # installed XGBoost version's default.
        best_iteration = getattr(model, 'best_iteration', None)
        if best_iteration is None:
            fold_pred = model.predict(dvalid)
        else:
            fold_pred = model.predict(dvalid, iteration_range=(0, best_iteration + 1))

        oof_pred[va_idx] = fold_pred
        models.append(model)
    return oof_pred, models

# Booster parameters for this run (edit as needed).
# subsample / colsample_bytree are intentionally not set here, so the library
# defaults apply; the number of boosting rounds is controlled inside
# train_xgboost rather than through an 'n_estimators' entry.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.005,
    max_depth=5,
    min_child_weight=1,
    seed=42,
)

# One booster per fold plus the out-of-fold predictions for the full training set.
oof, models = train_xgboost(X, y, cv=cv_list, params=params)


[0]	train-auc:0.84338	valid-auc:0.78424


[100]	train-auc:0.87611	valid-auc:0.80691
[200]	train-auc:0.88677	valid-auc:0.80981
[300]	train-auc:0.89729	valid-auc:0.81604
[400]	train-auc:0.91225	valid-auc:0.81801
[500]	train-auc:0.92341	valid-auc:0.82436
[600]	train-auc:0.93150	valid-auc:0.82788
[700]	train-auc:0.93754	valid-auc:0.82973
[796]	train-auc:0.94318	valid-auc:0.82729
[0]	train-auc:0.83244	valid-auc:0.78591
[100]	train-auc:0.86676	valid-auc:0.81707
[200]	train-auc:0.88030	valid-auc:0.82451
[300]	train-auc:0.89261	valid-auc:0.82790
[400]	train-auc:0.90781	valid-auc:0.83058
[500]	train-auc:0.91782	valid-auc:0.83109
[600]	train-auc:0.92777	valid-auc:0.83525
[700]	train-auc:0.93546	valid-auc:0.83704
[763]	train-auc:0.93858	valid-auc:0.83664
[0]	train-auc:0.83648	valid-auc:0.76233
[100]	train-auc:0.87484	valid-auc:0.78650
[200]	train-auc:0.88718	valid-auc:0.79254
[300]	train-auc:0.90255	valid-auc:0.79447
[400]	train-auc:0.91670	valid-auc:0.79700
[500]	train-auc:0.92701	valid-auc:0.80098
[600]	train-auc:0.93532	valid-auc:0.80