In [1]:
import os

# Work from the project root so that the relative paths used in this notebook
# ('model/...', 'notebook/...') resolve. Set the KAGGLE_WORKDIR environment
# variable to run elsewhere; the default is the original location.
os.chdir(os.environ.get('KAGGLE_WORKDIR', '/home/jovyan/kaggle/'))

In [2]:
import gc
import os
import time
import json

import pandas as pd
import numpy as np
import warnings
import pickle
import polars as pl

from collections import defaultdict
from itertools import combinations
import pyarrow as pa

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from lightgbm import log_evaluation
from lightgbm import Booster

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import roc_auc_score, f1_score

from function.config import *
from function.functions_20230610 import *

import warnings
warnings.filterwarnings('ignore')

In [3]:
experiment_name = 'blend_gbdt'  # NOTE: short experiment label; it is also used as the output folder name
path_oof = ''  # not referenced anywhere else in the visible part of this notebook


# Path templates. The first '{}' takes the experiment name; in the per-model
# templates the next two take the fold number and the question number
# (see the .format(...) call in train_xgb).
PATH_BASE_DIR = 'model/blend_for_psp/{}/'
PATH_MODEL_XGB = PATH_BASE_DIR + 'xgb_models/fold{}/XGB_question{}.xgb'
# The CatBoost / LightGBM templates are defined but not used in the visible cells.
PATH_MODEL_CAT = PATH_BASE_DIR + 'cat_models/fold{}/CAT_question{}.cbm'
PATH_MODEL_LGB = PATH_BASE_DIR + 'lgb_models/fold{}/LGB_question{}.lgb'
# JSON file receiving the per-level-group feature lists.
PATH_FEATURE_SET = PATH_BASE_DIR + 'pre_group_features.json'

In [4]:
%%time
# Read the raw training log with polars and build the label frame.
# NOTE(review): PATH_TRAIN, PATH_TRAIN_LABEL and MyFunctions presumably come from
# the star imports (function.config / function.functions_20230610) — confirm there.
df = pl.read_csv(PATH_TRAIN)
targets = MyFunctions.create_target_df(PATH_TRAIN_LABEL)

CPU times: user 37.1 s, sys: 18.4 s, total: 55.5 s
Wall time: 9.12 s


In [5]:
%%time
# Preprocess the full log, then split it into three frames.
# NOTE(review): the three parts presumably correspond to level groups 0-4, 5-12
# and 13-22 (the grp values used further down) — confirm in the helper.
df = MyFunctions.preprocessing_pl(df)
df1, df2, df3 = MyFunctions.split_dataframe_by_level_group(df)

CPU times: user 16.2 s, sys: 18.4 s, total: 34.7 s
Wall time: 13.1 s


In [6]:
# Sanity check: (rows, columns) of each of the three split frames.
print(f'df1 shape: {df1.shape}')
print(f'df2 shape: {df2.shape}')
print(f'df3 shape: {df3.shape}')

df1 shape: (3981005, 20)
df2 shape: (8844238, 20)
df3 shape: (13471703, 20)


In [7]:
# The combined frame has already been split into df1/df2/df3; drop the name
# and run a collection pass so its memory can be reclaimed.
del df
gc.collect()

0

In [8]:
%%time
# Replace each split frame with the feature table built by the project helper,
# printing the resulting shape after each step as a progress check.
# NOTE(review): the output is presumably one row per session — confirm in
# MyFunctions.feature_engineer_pre_group.
df1 = MyFunctions.feature_engineer_pre_group(df1, grp='0-4', use_extra=True, feature_suffix='')
print('df1 done',df1.shape)
df2 = MyFunctions.feature_engineer_pre_group(df2, grp='5-12', use_extra=True, feature_suffix='')
print('df2 done',df2.shape)
df3 = MyFunctions.feature_engineer_pre_group(df3, grp='13-22', use_extra=True, feature_suffix='')
print('df3 done',df3.shape)

df1 done (23562, 2101)
df2 done (23562, 2107)
df3 done (23562, 2105)
CPU times: user 6min 3s, sys: 12.8 s, total: 6min 16s
Wall time: 1min 57s


## ここが変更点
以前使用した特徴量を全て使用する

In [9]:
# Tag every column except 'session_id' with the group it came from, so the three
# feature tables can later be placed side by side without name clashes.
# The rename is done in place on df1 / df2 / df3.
for _frame, _tag in ((df1, 'grp1'), (df2, 'grp2'), (df3, 'grp3')):
    _new_names = {}
    for _col in _frame.columns:
        if _col == 'session_id':
            continue
        _new_names[_col] = f'{_col}_{_tag}'
    _frame.rename(columns=_new_names, inplace=True)

In [10]:
# Peek at the first rows to check the '_grp1' suffix on the column names.
df1.head()

Unnamed: 0,session_id,session_number__grp1,null_count_grp1,page_change_count_grp1,question_count_grp1,block_count_grp1,recap_count_grp1,lost_count_grp1,event_name_unique__grp1,name_unique__grp1,...,17_ET_max__grp1,18_ET_max__grp1,19_ET_max__grp1,20_ET_max__grp1,21_ET_max__grp1,22_ET_max__grp1,slip_click_duration_grp1,slip_click_indexCount_grp1,shirt_era_search_duration_grp1,shirt_era_search_indexCount_grp1
0,20090312431273200,165,103,15,13,0,0,0,10,3,...,,,,,,,4512,5,7916,3
1,20090312433251036,139,78,10,11,2,0,0,11,4,...,,,,,,,8267,4,9691,13
2,20090312455206810,149,73,16,12,0,0,0,9,3,...,,,,,,,5934,4,9783,3
3,20090313091715820,176,110,17,13,0,0,3,11,4,...,,,,,,,4513,5,18276,9
4,20090313571836404,112,59,9,12,0,0,0,10,4,...,,,,,,,4884,4,6955,4


In [11]:
# Choose the feature columns to model for each group's table.
# NOTE(review): the selection criteria live in MyFunctions.feature_selection and
# are not visible here. The 'feautres' spelling is kept because later cells
# refer to these exact names.
list_feautres_df1 = MyFunctions.feature_selection(df1)
list_feautres_df2 = MyFunctions.feature_selection(df2)
list_feautres_df3 = MyFunctions.feature_selection(df3)

# Report how many features were kept per group.
print(f'df1 num_features: {len(list_feautres_df1)}')
print(f'df2 num_features: {len(list_feautres_df2)}')
print(f'df3 num_features: {len(list_feautres_df3)}')

df1 num_features: 668
df2 num_features: 1003
df3 num_features: 1170


In [12]:
# Unique index labels of df1; passed to create_compare_data when scoring.
# NOTE(review): assumes df1 is indexed by session_id at this point — confirm.
ALL_USERS = df1.index.unique()
print('We will train with', len(ALL_USERS) ,'users info')

We will train with 23562 users info


In [13]:
# Write the feature list of each level group to the experiment's JSON file.
# The lists are cumulative: each later group also contains every feature of the
# earlier groups, mirroring the 'feature' entries of dict_modeling_info below.
list_level_group = ['0-4', '5-12', '13-22']
MyFunctions.dump_feature_list_to_json(list_level_group,
                                      [list_feautres_df1, list_feautres_df1 + list_feautres_df2, list_feautres_df1 + list_feautres_df2 + list_feautres_df3],
                                      PATH_FEATURE_SET.format(experiment_name)
                                     )

In [14]:
# Settings for the cross-validated XGBoost fits. n_estimators acts as a ceiling
# here: train_xgb fits with early stopping against the validation fold.
xgb_params = dict(
    booster='gbtree',
    tree_method='gpu_hist',
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.02,
    alpha=8,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.5,
    n_estimators=1500,
    random_state=42,
)

In [15]:
# Modeling inputs per level group. Later groups are cumulative: their frame is
# the column-wise concatenation of every table up to that group and their
# feature list is the concatenation of the matching feature lists.
# NOTE(review): pd.concat(axis=1) aligns rows on the index — this assumes
# df1/df2/df3 share the same index; confirm.
dict_modeling_info = {
    '0-4': {
        'df': df1,
        'feature': list_feautres_df1,
    },
    '5-12': {
        'df': pd.concat([df1, df2], axis=1),
        'feature': list_feautres_df1 + list_feautres_df2,
    },
    '13-22': {
        'df': pd.concat([df1, df2, df3], axis=1),
        'feature': list_feautres_df1 + list_feautres_df2 + list_feautres_df3,
    }
}

In [17]:
def train_xgb(X_train, y_train, X_valid, y_valid, params, experiment_name, num_fold, num_question, is_save=True,
              early_stopping_rounds=30):
    """Fit one XGBoost classifier with early stopping on a validation set.

    Args:
        X_train, y_train: training features and labels.
        X_valid, y_valid: validation features and labels; used as the only
            eval_set entry, so early stopping is driven by this data.
        params: keyword arguments forwarded to ``XGBClassifier``.
        experiment_name: fills the first placeholder of ``PATH_MODEL_XGB``.
        num_fold: fold number, fills the second placeholder.
        num_question: question number, fills the third placeholder.
        is_save: when true, the fitted model is written to
            ``PATH_MODEL_XGB.format(experiment_name, num_fold, num_question)``.
        early_stopping_rounds: patience passed to ``fit`` (default 30, the
            value that used to be hard-coded).

    Returns:
        The fitted ``XGBClassifier``.
    """
    clf = XGBClassifier(**params)
    clf.fit(X_train, y_train, early_stopping_rounds=early_stopping_rounds, eval_set=[(X_valid, y_valid)], verbose=0)
    if is_save:
        model_path = PATH_MODEL_XGB.format(experiment_name, num_fold, num_question)
        # Make sure the per-fold folder exists before the model file is written.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        clf.save_model(model_path)
    return clf

In [18]:
# Out-of-fold (OOF) predicted probability per row of df1 and per question (q_1..q_18).
oof_xgb = pd.DataFrame(np.zeros((df1.shape[0], 18)), columns=[f'q_{i}' for i in range(1, 19)], index=df1.index)
# Early-stopped tree count per fold (rows 1..5) and question (columns 1..18).
df_estimator = pd.DataFrame(np.zeros((5, 18)), index=[i for i in range(1, 6)], columns=[i for i in range(1, 19)])
# Fixed seed: the same split is produced for every question as long as the
# merged frame has the same number of rows each time.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for t in range(1, 19):
    print(f'question: {t}')
    pre_exe = time.time()

    # USE THIS TRAIN DATA WITH THESE QUESTIONS
    if t <= 3:
        grp = '0-4'
    elif t <= 13:
        grp = '5-12'
    else:
        grp = '13-22'

    FEATURES = dict_modeling_info[grp]['feature']
    # OOF predictions of all earlier questions are stacked in as extra features.
    list_feature_xgb = [f'q_{j}' for j in range(1, t)]

    # TRAIN DATA: attach this question's label. pd.merge returns a new frame, so
    # the shared modeling frame is left untouched and needs no prior copy.
    df = pd.merge(dict_modeling_info[grp]['df'],
                  targets.query(f'q == {t}')[['correct']],
                  left_index=True, right_index=True, how='inner')

    for i, (train_idx, valid_idx) in enumerate(kf.split(df), start=1):
        df_train, df_valid = df.iloc[train_idx], df.iloc[valid_idx]

        # Build each stacked design matrix once and reuse it for fitting,
        # early stopping and prediction.
        X_train = pd.merge(df_train[FEATURES].astype('float32'), oof_xgb[list_feature_xgb],
                           left_index=True, right_index=True, how='inner')
        y_train = df_train['correct']
        X_valid = pd.merge(df_valid[FEATURES].astype('float32'), oof_xgb[list_feature_xgb],
                           left_index=True, right_index=True, how='inner')
        y_valid = df_valid['correct']

        # xgb part
        clf_xgb = train_xgb(X_train,
                            y_train,
                            X_valid,
                            y_valid,
                            xgb_params,
                            experiment_name,
                            i,
                            t,
                            is_save=False
                           )
        # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer
        # XGBoost releases (iteration_range / best_iteration) — confirm against
        # the installed version before upgrading.
        pred_xgb = clf_xgb.predict_proba(X_valid, ntree_limit=clf_xgb.best_ntree_limit)[:, 1]
        df_estimator.loc[i, t] = clf_xgb.best_ntree_limit
        # Store by index label rather than by position, so every prediction lands
        # on the row it was computed for even if the inner joins above dropped
        # or reordered rows relative to oof_xgb.
        oof_xgb.loc[X_valid.index, f'q_{t}'] = pred_xgb

        del df_train, df_valid, X_train, X_valid, pred_xgb, clf_xgb
        gc.collect()

    po_exe = time.time()
    print(f'  elapsed time：{(po_exe - pre_exe):.2f} sec')

    del df, FEATURES
    gc.collect()

question: 1
  elapsed time：38.16 sec
question: 2
  elapsed time：29.22 sec
question: 3
  elapsed time：28.36 sec
question: 4
  elapsed time：122.47 sec
question: 5
  elapsed time：127.65 sec
question: 6
  elapsed time：108.61 sec
question: 7
  elapsed time：103.87 sec
question: 8
  elapsed time：76.13 sec
question: 9
  elapsed time：85.83 sec
question: 10
  elapsed time：89.65 sec
question: 11
  elapsed time：80.82 sec
question: 12
  elapsed time：81.66 sec
question: 13
  elapsed time：119.94 sec
question: 14
  elapsed time：217.32 sec
question: 15
  elapsed time：200.49 sec
question: 16
  elapsed time：183.29 sec
question: 17
  elapsed time：157.92 sec
question: 18
  elapsed time：205.13 sec


In [19]:
# First rows of the out-of-fold prediction table (one column per question).
oof_xgb.head()

Unnamed: 0_level_0,q_1,q_2,q_3,q_4,q_5,q_6,q_7,q_8,q_9,q_10,q_11,q_12,q_13,q_14,q_15,q_16,q_17,q_18
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,0.9528,0.992633,0.981504,0.916668,0.797012,0.922611,0.915704,0.72697,0.886054,0.698606,0.815825,0.920244,0.321388,0.841913,0.696444,0.768404,0.780349,0.990426
20090312433251036,0.761593,0.993308,0.967912,0.422489,0.134573,0.374689,0.497005,0.392751,0.455473,0.210007,0.39592,0.713757,0.11563,0.2249,0.075385,0.632875,0.534458,0.739835
20090312455206810,0.844242,0.983783,0.974385,0.730576,0.559929,0.837613,0.737565,0.636956,0.763941,0.624996,0.774119,0.886595,0.618073,0.685474,0.37498,0.773036,0.821506,0.946428
20090313091715820,0.430476,0.96814,0.88851,0.810967,0.492738,0.743654,0.754713,0.564345,0.682542,0.467999,0.64189,0.887394,0.124244,0.663806,0.459529,0.716853,0.669798,0.976345
20090313571836404,0.963203,0.995051,0.990455,0.94422,0.831881,0.948121,0.901383,0.786778,0.898642,0.752889,0.816877,0.933532,0.517191,0.877162,0.660436,0.78464,0.797998,0.994224


In [20]:
# Build the reference frame that the OOF predictions are scored against.
# NOTE(review): its layout is defined by MyFunctions.create_compare_data — not visible here.
df_true = MyFunctions.create_compare_data(oof_xgb, targets.reset_index(), ALL_USERS)

In [21]:
# Scan decision thresholds and print the score reached at each one.
# NOTE(review): the metric is computed inside MyFunctions.search_best_threshold —
# presumably F1; confirm.
MyFunctions.search_best_threshold(oof_xgb, df_true)

threshold: 0.500, score: 0.6727
threshold: 0.505, score: 0.6746
threshold: 0.510, score: 0.6763
threshold: 0.515, score: 0.6784
threshold: 0.520, score: 0.6801
threshold: 0.525, score: 0.6817
threshold: 0.530, score: 0.6833
threshold: 0.535, score: 0.6848
threshold: 0.540, score: 0.6858
threshold: 0.545, score: 0.6872
threshold: 0.550, score: 0.6882
threshold: 0.555, score: 0.6893
threshold: 0.560, score: 0.6905
threshold: 0.565, score: 0.6917
threshold: 0.570, score: 0.6929
threshold: 0.575, score: 0.6940
threshold: 0.580, score: 0.6949
threshold: 0.585, score: 0.6957
threshold: 0.590, score: 0.6966
threshold: 0.595, score: 0.6971
threshold: 0.600, score: 0.6978
threshold: 0.605, score: 0.6985
threshold: 0.610, score: 0.6989
threshold: 0.615, score: 0.6991
threshold: 0.620, score: 0.6991
threshold: 0.625, score: 0.6992
threshold: 0.630, score: 0.6992
threshold: 0.635, score: 0.6990
threshold: 0.640, score: 0.6984
threshold: 0.645, score: 0.6979
threshold: 0.650, score: 0.6972
threshol

In [22]:
# Persist the out-of-fold predictions. The output folder is created first so the
# write also succeeds on a fresh checkout.
path_oof_csv = 'notebook/blend_gbdt/20230610_blend_baseline/out_of_fold/oof_pre_group.csv'
os.makedirs(os.path.dirname(path_oof_csv), exist_ok=True)
oof_xgb.to_csv(path_oof_csv)

## estimator

In [23]:
# Early-stopped tree count for each fold (rows) and question (columns).
df_estimator

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
1,513.0,308.0,327.0,441.0,383.0,525.0,305.0,246.0,214.0,238.0,319.0,191.0,501.0,317.0,294.0,379.0,220.0,288.0
2,538.0,340.0,371.0,559.0,555.0,305.0,324.0,350.0,187.0,258.0,175.0,261.0,478.0,283.0,346.0,223.0,236.0,435.0
3,513.0,355.0,309.0,299.0,424.0,362.0,473.0,161.0,239.0,207.0,239.0,317.0,350.0,406.0,202.0,289.0,294.0,343.0
4,594.0,371.0,311.0,399.0,523.0,398.0,300.0,163.0,458.0,392.0,169.0,253.0,573.0,400.0,447.0,332.0,232.0,332.0
5,672.0,418.0,339.0,574.0,519.0,337.0,374.0,191.0,257.0,360.0,328.0,253.0,326.0,371.0,316.0,259.0,160.0,338.0


In [24]:
# Per-question median of the fold-wise tree counts (median is taken down each
# column); position t-1 in the list belongs to question t.
list_estimator_xgb = df_estimator.median().tolist()

## Retrain

In [25]:
# Settings for the final fits on all rows: the same values as the
# cross-validation run, but without n_estimators, which the retraining loop
# supplies separately for each question.
xgb_params = dict(
    booster='gbtree',
    tree_method='gpu_hist',
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.02,
    alpha=8,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.5,
    random_state=42,
)

In [26]:
# Template for the models retrained on all rows; '{}' takes the model file name.
# Derived from PATH_BASE_DIR / experiment_name so it follows the experiment
# folder instead of repeating its name (same value as before for 'blend_gbdt').
PATH_MODEL_ALL_DATA = PATH_BASE_DIR.format(experiment_name) + 'xgb_models/pre_group/{}'

In [27]:
# Retrain one model per question on all rows, with the tree count fixed to the
# cross-validation median, and save each model under PATH_MODEL_ALL_DATA.
# The output folder is created up front so saving works on a fresh checkout.
os.makedirs(os.path.dirname(PATH_MODEL_ALL_DATA), exist_ok=True)

for t in range(1, 19):
    pre_exe = time.time()

    # USE THIS TRAIN DATA WITH THESE QUESTIONS
    if t <= 3:
        grp = '0-4'
    elif t <= 13:
        grp = '5-12'
    else:
        grp = '13-22'

    FEATURES = dict_modeling_info[grp]['feature']
    # OOF predictions of all earlier questions are stacked in as extra features.
    list_feature_xgb = [f'q_{j}' for j in range(1, t)]
    # Per-question copy of the settings, so the shared xgb_params dict is not
    # modified from one iteration to the next.
    params_question = {**xgb_params, 'n_estimators': int(list_estimator_xgb[t-1])}

    # TRAIN DATA: attach this question's label. pd.merge returns a new frame, so
    # the shared modeling frame needs no prior copy.
    df = pd.merge(dict_modeling_info[grp]['df'],
                  targets.query(f'q == {t}')[['correct']],
                  left_index=True, right_index=True, how='inner')

    X_train = pd.merge(df[FEATURES].astype('float32'), oof_xgb[list_feature_xgb],
                       left_index=True, right_index=True, how='inner')
    y_train = df['correct']

    clf = XGBClassifier(**params_question)
    clf.fit(X_train, y_train, verbose=0)
    clf.save_model(PATH_MODEL_ALL_DATA.format(f'XGB_question{t}.xgb'))

    po_exe = time.time()

    print(f'  question: {t}, elapsed time：{(po_exe - pre_exe):.2f} sec')

    del df, X_train, y_train, clf, FEATURES
    gc.collect()

  question: 1, elapsed time：5.99 sec"
  question: 2, elapsed time：4.68 sec"
  question: 3, elapsed time：4.75 sec"
  question: 4, elapsed time：23.74 sec"
  question: 5, elapsed time：26.82 sec"
  question: 6, elapsed time：20.65 sec"
  question: 7, elapsed time：19.40 sec"
  question: 8, elapsed time：13.40 sec"
  question: 9, elapsed time：15.67 sec"
  question: 10, elapsed time：16.35 sec"
  question: 11, elapsed time：15.53 sec"
  question: 12, elapsed time：15.70 sec"
  question: 13, elapsed time：25.20 sec"
  question: 14, elapsed time：45.48 sec"
  question: 15, elapsed time：39.81 sec"
  question: 16, elapsed time：35.71 sec"
  question: 17, elapsed time：31.72 sec"
  question: 18, elapsed time：40.19 sec"


In [29]:
import shutil

# Zip the experiment folder this notebook writes to (PATH_BASE_DIR filled with
# the experiment name), so the archive contains the feature list and the models
# saved above. The resulting archive path is the cell's output.
shutil.make_archive('xgb_models', format='zip', root_dir=PATH_BASE_DIR.format(experiment_name))

'/home/jovyan/kaggle/xgb_models.zip'