In [1]:
import os
# Switch to the project root so the relative paths used below ('model/...') resolve.
# NOTE(review): absolute, machine-specific path — adjust it when running on another host.
os.chdir('/home/jovyan/kaggle/')

In [3]:
import gc
import time
import json

import pandas as pd
import numpy as np
import warnings
import pickle
import polars as pl

from xgboost import XGBClassifier

from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score

from function.config import *
from function.functions import *

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [4]:
experiment_name = 'blend_gbdt'  # NOTE: short experiment label; it becomes the output folder name

# Path templates. The first '{}' of each template is filled with experiment_name.
PATH_BASE_DIR = 'model/blend_for_psp/{}/'
# Remaining placeholders: fold number, then question number.
PATH_MODEL_XGB = PATH_BASE_DIR + 'xgb_models/fold{}/XGB_question{}.xgb'
# Destination passed to MyFunctions.dump_feature_list_to_json for the selected feature lists.
PATH_FEATURE_SET = PATH_BASE_DIR + 'pre_raw_features.json'

In [20]:
# Hyperparameters unpacked into every XGBClassifier built in this notebook.
# NOTE(review): the long fractional values look like the result of an automated
# search — provenance is not recorded here; confirm before changing them.
xgb_params = {
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',  # GPU histogram tree builder; needs a CUDA device
        'objective': 'binary:logistic',  # each question is a binary target
        'eval_metric': 'logloss',  # metric monitored on the eval_set during fitting
        'learning_rate': 0.00960959386767139,
        'alpha': 9.669859706518814,  # L1 regularisation term
        'max_depth': 6,
        'subsample': 0.7,
        'colsample_bytree': 0.6241523164895888,
        'min_child_weight': 8,
        'lambda': 0.005372872139468915,  # L2 regularisation term
        'max_delta_step': 4,
        'n_estimators': 1500,  # upper bound; train_xgb stops earlier via early_stopping_rounds
        'random_state': 42
        }

In [5]:
# Create the output folders: one sub-folder per fold for each model family.
list_model_dir = ['xgb_models']
experiment_dir = PATH_BASE_DIR.format(experiment_name)
for model_name in list_model_dir:
    for fold in range(1, 6):
        os.makedirs(f'{experiment_dir}{model_name}/fold{fold}', exist_ok=True)

In [6]:
%%time
# Read the raw training events with polars and build the label frame.
# PATH_TRAIN / PATH_TRAIN_LABEL come from the star import of function.config.
df = pl.read_csv(PATH_TRAIN)
# `targets` is later filtered with query('q == ...') and read through its 'correct' column.
targets = MyFunctions.create_target_df(PATH_TRAIN_LABEL)

CPU times: user 26.2 s, sys: 18.8 s, total: 45 s
Wall time: 1min 26s


In [7]:
%%time
# Project-specific preprocessing of the raw events (details live in MyFunctions).
df = MyFunctions.preprocessing_pl(df)
# Split the events into three frames, one per level group
# (presumably '0-4', '5-12', '13-22', matching the grp labels used further down — verify).
df1, df2, df3 = MyFunctions.split_dataframe_by_level_group(df)

CPU times: user 16.2 s, sys: 17.7 s, total: 33.9 s
Wall time: 14.8 s


In [8]:
# Make the frames cumulative: df2 gains the df1 rows in front of it, and df3 gains
# the df1 and df2 rows, so later level groups also see the earlier events.
# NOTE: this cell is not idempotent — running it twice stacks the earlier rows again.
df3 = pl.concat([df1, df2, df3])
df2 = pl.concat([df1, df2])

In [9]:
# Report (rows, columns) for the three cumulative frames.
# Only the shapes are looped over, so no extra reference to a large frame is kept.
for label, shape in (('df1', df1.shape), ('df2', df2.shape), ('df3', df3.shape)):
    print(f'{label} shape: {shape}')

df1 shape: (3981005, 20)
df2 shape: (12825243, 20)
df3 shape: (26296946, 20)


In [10]:
# Drop the name bound to the preprocessed full event frame (df1/df2/df3 are used from
# here on) and ask the garbage collector to reclaim what it can.
del df
gc.collect()

0

In [11]:
%%time
# Build the feature table for each level group. Per the printed shapes, every result
# has one row per session and roughly two thousand columns.
# NOTE: each name is rebound from the event-level frame to its feature table, so this
# cell cannot be re-run without re-running the cells above it.
df1 = MyFunctions.feature_engineer_pl(df1, grp='0-4', use_extra=True, feature_suffix='')
print('df1 done',df1.shape)
df2 = MyFunctions.feature_engineer_pl(df2, grp='5-12', use_extra=True, feature_suffix='')
print('df2 done',df2.shape)
df3 = MyFunctions.feature_engineer_pl(df3, grp='13-22', use_extra=True, feature_suffix='')
print('df3 done',df3.shape)

df1 done (23562, 2101)
df2 done (23562, 2154)
df3 done (23562, 2205)
CPU times: user 12min 32s, sys: 21.8 s, total: 12min 54s
Wall time: 3min 31s


In [12]:
# Add time-derived columns to each feature table — presumably the year/month/day/
# hour/minute/second columns visible at the end of the preview below; verify in MyFunctions.
df1 = MyFunctions.time_feature(df1)
df2 = MyFunctions.time_feature(df2)
df3 = MyFunctions.time_feature(df3)

In [13]:
# Preview the first three rows of the level 0-4 feature table.
df1.head(3)

Unnamed: 0,session_id,session_number_,null_count,page_change_count,question_count,block_count,recap_count,lost_count,event_name_unique_,name_unique_,...,slip_click_duration,slip_click_indexCount,shirt_era_search_duration,shirt_era_search_indexCount,year,month,day,hour,minute,second
0,20090312431273200,165,103,15,13,0,0,0,10,3,...,4512,5,7916,3,20,10,3,12,43,12
1,20090312433251036,139,78,10,11,2,0,0,11,4,...,8267,4,9691,13,20,10,3,12,43,32
2,20090312455206810,149,73,16,12,0,0,0,9,3,...,5934,4,9783,3,20,10,3,12,45,52


In [14]:
# Choose the model input columns of each feature table (selection rules live in
# MyFunctions.feature_selection), then report how many columns survived.
list_feautres_df1, list_feautres_df2, list_feautres_df3 = (
    MyFunctions.feature_selection(table) for table in (df1, df2, df3)
)

feature_counts = (
    ('df1', len(list_feautres_df1)),
    ('df2', len(list_feautres_df2)),
    ('df3', len(list_feautres_df3)),
)
for label, n_selected in feature_counts:
    print(f'df{label[2:]} num_features: {n_selected}')

df1 num_features: 674
df2 num_features: 1311
df3 num_features: 2006


In [15]:
# Unique index values of the level 0-4 feature table (session ids, per the previews).
ALL_USERS = df1.index.unique()
print(f'We will train with {len(ALL_USERS)} users info')

We will train with 23562 users info


In [17]:
# Persist the selected feature names per level group to the experiment's feature-set path.
# The two lists are positionally paired: group label i <-> feature list i.
list_level_group = ['0-4', '5-12', '13-22']
MyFunctions.dump_feature_list_to_json(list_level_group,
                                      [list_feautres_df1, list_feautres_df2, list_feautres_df3],
                                      PATH_FEATURE_SET.format(experiment_name)
                                     )

In [18]:
# Lookup table used by the training loops: level group -> its feature table ('df')
# and the list of selected feature names ('feature').
dict_modeling_info = {
    level_group: {'df': table, 'feature': selected}
    for level_group, table, selected in (
        ('0-4', df1, list_feautres_df1),
        ('5-12', df2, list_feautres_df2),
        ('13-22', df3, list_feautres_df3),
    )
}

In [19]:
def train_xgb(X_train, y_train, X_valid, y_valid, params, experiment_name, num_fold, num_question, is_save=True):
    """Fit one XGBClassifier with early stopping and optionally save it.

    Args:
        X_train, y_train: training features and binary labels.
        X_valid, y_valid: validation split, used as the only eval_set entry.
        params: keyword arguments unpacked into XGBClassifier.
        experiment_name, num_fold, num_question: fill the PATH_MODEL_XGB template
            when the model is saved.
        is_save: when true, write the fitted model to the formatted PATH_MODEL_XGB path.

    Returns:
        The fitted XGBClassifier.
    """
    model = XGBClassifier(**params)
    # Stop once the eval metric has not improved for 30 consecutive rounds.
    model.fit(
        X_train,
        y_train,
        eval_set=[[X_valid, y_valid]],
        early_stopping_rounds=30,
        verbose=0,
    )
    if not is_save:
        return model
    model.save_model(PATH_MODEL_XGB.format(experiment_name, num_fold, num_question))
    return model

In [21]:
# Out-of-fold (OOF) predictions: one row per session of df1, one column per question.
oof_xgb = pd.DataFrame(np.zeros((df1.shape[0], 18)), columns=[f'q_{i}' for i in range(1, 19)], index=df1.index)
# Tree count kept by early stopping, per fold (rows 1-5) and question (columns 1-18).
df_estimator = pd.DataFrame(np.zeros((5, 18)), index=[i for i in range(1, 6)], columns=[i for i in range(1, 19)])
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# OOF values are written back by index label below, which requires unique labels.
assert oof_xgb.index.is_unique, 'oof_xgb index must be unique for label-based OOF writes'

for t in range(1, 19):
    print(f'question: {t}')
    pre_exe = time.time()

    # USE THIS TRAIN DATA WITH THESE QUESTIONS
    if t <= 3:
        grp = '0-4'
    elif t <= 13:
        grp = '5-12'
    elif t <= 22:
        grp = '13-22'

    df = dict_modeling_info[grp]['df'].copy()
    FEATURES = dict_modeling_info[grp]['feature'].copy()
    # Stacking: the OOF predictions of all earlier questions become extra features.
    list_feature_xgb = [f'q_{j}' for j in range(1, t)]

    # TRAIN DATA: attach this question's label; the inner join drops sessions without one.
    df = pd.merge(df, targets.query(f'q == {t}')[['correct']], left_index=True, right_index=True, how='inner')

    for i, (train_idx, valid_idx) in enumerate(kf.split(df), start=1):
        df_train, df_valid = df.iloc[train_idx], df.iloc[valid_idx]

        y_train = df_train['correct']
        y_valid = df_valid['correct']

        # Build each stacked design matrix once (the validation one is reused for
        # early stopping and for the OOF prediction).
        X_train = pd.merge(df_train[FEATURES].astype('float32'), oof_xgb[list_feature_xgb],
                           left_index=True, right_index=True, how='inner')
        X_valid = pd.merge(df_valid[FEATURES].astype('float32'), oof_xgb[list_feature_xgb],
                           left_index=True, right_index=True, how='inner')

        # xgb part
        clf_xgb = train_xgb(X_train,
                            y_train,
                            X_valid,
                            y_valid,
                            xgb_params,
                            experiment_name,
                            i,
                            t,
                            is_save=False
                           )
        # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer XGBoost
        # releases (iteration_range / best_iteration replace them) — revisit on upgrade.
        pred_xgb = clf_xgb.predict_proba(X_valid, ntree_limit=clf_xgb.best_ntree_limit)[:, 1]
        df_estimator.loc[i, t] = clf_xgb.best_ntree_limit
        # Write by session label, not by position: `valid_idx` indexes the merged `df`,
        # whose row order/count is not guaranteed to match oof_xgb (built from df1).
        oof_xgb.loc[X_valid.index, f'q_{t}'] = pred_xgb

        del df_train, df_valid, X_train, X_valid, y_train, y_valid, pred_xgb, clf_xgb
        gc.collect()

    po_exe = time.time()
    print(f'  elapsed time：{(po_exe - pre_exe):.2f} sec')

    del df, FEATURES
    gc.collect()

question: 1
  elapsed time：83.33 sec
question: 2
  elapsed time：53.33 sec
question: 3
  elapsed time：56.18 sec
question: 4
  elapsed time：190.52 sec
question: 5
  elapsed time：172.67 sec
question: 6
  elapsed time：140.08 sec
question: 7
  elapsed time：147.74 sec
question: 8
  elapsed time：108.65 sec
question: 9
  elapsed time：132.06 sec
question: 10
  elapsed time：114.06 sec
question: 11
  elapsed time：121.15 sec
question: 12
  elapsed time：117.86 sec
question: 13
  elapsed time：151.16 sec
question: 14
  elapsed time：279.95 sec
question: 15
  elapsed time：255.93 sec
question: 16
  elapsed time：262.19 sec
question: 17
  elapsed time：207.35 sec
question: 18
  elapsed time：285.94 sec


In [22]:
# Preview the out-of-fold probabilities (rows: sessions, columns: q_1 .. q_18).
oof_xgb.head()

Unnamed: 0_level_0,q_1,q_2,q_3,q_4,q_5,q_6,q_7,q_8,q_9,q_10,q_11,q_12,q_13,q_14,q_15,q_16,q_17,q_18
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20090312431273200,0.944113,0.993159,0.985704,0.935773,0.796977,0.926246,0.929807,0.718161,0.890661,0.717115,0.836508,0.929435,0.28482,0.808157,0.634133,0.755693,0.749767,0.988918
20090312433251036,0.741728,0.990332,0.965375,0.380353,0.128906,0.369236,0.449623,0.439304,0.345582,0.195768,0.342304,0.616778,0.083193,0.227715,0.083953,0.593339,0.555617,0.739533
20090312455206810,0.841481,0.989023,0.97478,0.625307,0.533895,0.82669,0.718651,0.635132,0.781317,0.581375,0.73565,0.870692,0.613771,0.724804,0.410301,0.77094,0.817944,0.95441
20090313091715820,0.450119,0.972838,0.897018,0.855149,0.499551,0.74789,0.805842,0.547301,0.749403,0.509053,0.642932,0.914969,0.13779,0.746657,0.515091,0.697605,0.728738,0.982062
20090313571836404,0.96161,0.994103,0.990465,0.955303,0.84471,0.951527,0.908702,0.804003,0.913376,0.733289,0.795736,0.92945,0.530554,0.881548,0.665148,0.784182,0.764032,0.990626


In [23]:
# Build the ground-truth frame compared against oof_xgb in the threshold search below
# — presumably laid out like oof_xgb (sessions x questions); verify in MyFunctions.
df_true = MyFunctions.create_compare_data(oof_xgb, targets.reset_index(), ALL_USERS)

In [26]:
# Sweep the decision threshold applied to the OOF probabilities and print the score
# obtained at each step (metric is defined in MyFunctions — presumably F1; confirm).
MyFunctions.search_best_threshold(oof_xgb, df_true)

threshold: 0.500, score: 0.6727
threshold: 0.505, score: 0.6747
threshold: 0.510, score: 0.6765
threshold: 0.515, score: 0.6784
threshold: 0.520, score: 0.6800
threshold: 0.525, score: 0.6820
threshold: 0.530, score: 0.6835
threshold: 0.535, score: 0.6848
threshold: 0.540, score: 0.6863
threshold: 0.545, score: 0.6876
threshold: 0.550, score: 0.6889
threshold: 0.555, score: 0.6901
threshold: 0.560, score: 0.6913
threshold: 0.565, score: 0.6926
threshold: 0.570, score: 0.6937
threshold: 0.575, score: 0.6947
threshold: 0.580, score: 0.6955
threshold: 0.585, score: 0.6964
threshold: 0.590, score: 0.6972
threshold: 0.595, score: 0.6981
threshold: 0.600, score: 0.6986
threshold: 0.605, score: 0.6990
threshold: 0.610, score: 0.6993
threshold: 0.615, score: 0.6994
threshold: 0.620, score: 0.6997
threshold: 0.625, score: 0.6998
threshold: 0.630, score: 0.6998
threshold: 0.635, score: 0.6997
threshold: 0.640, score: 0.6990
threshold: 0.645, score: 0.6986
threshold: 0.650, score: 0.6978
threshol

## Number of trees kept by early stopping (rows: fold, columns: question)

In [27]:
# Tree counts recorded during cross-validation: rows are folds 1-5, columns are questions 1-18.
df_estimator

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
1,919.0,672.0,604.0,636.0,594.0,569.0,573.0,509.0,422.0,392.0,420.0,434.0,721.0,551.0,574.0,513.0,463.0,667.0
2,951.0,672.0,784.0,1158.0,730.0,564.0,432.0,360.0,450.0,425.0,442.0,435.0,560.0,507.0,515.0,533.0,354.0,799.0
3,653.0,732.0,655.0,848.0,588.0,528.0,762.0,330.0,538.0,366.0,454.0,504.0,477.0,608.0,373.0,674.0,384.0,739.0
4,942.0,749.0,689.0,601.0,879.0,566.0,549.0,339.0,543.0,403.0,448.0,549.0,678.0,471.0,568.0,565.0,386.0,603.0
5,935.0,919.0,554.0,718.0,624.0,480.0,505.0,355.0,548.0,409.0,422.0,463.0,551.0,648.0,489.0,448.0,369.0,671.0


In [28]:
# Median tree count across the five folds, one value per question (position t-1 holds
# question t). Values are floats; they are cast to int where they are consumed.
list_estimator_xgb = df_estimator.median().tolist()

## Retrain on all sessions, using the median tree count per question

In [42]:
# Destination template for the models retrained on all sessions; '{}' is the file name.
# Derived from PATH_BASE_DIR / experiment_name so it follows the experiment folder
# instead of repeating a hard-coded experiment name.
PATH_MODEL_ALL_DATA = PATH_BASE_DIR.format(experiment_name) + 'xgb_models/pre_raw/{}'

In [44]:
# The folder-creation cell only makes the per-fold folders, so create the target
# folder of the retrained models here; save_model does not create missing directories.
os.makedirs(os.path.dirname(PATH_MODEL_ALL_DATA.format('XGB_question1.xgb')), exist_ok=True)

for t in range(1, 19):
    pre_exe = time.time()

    # USE THIS TRAIN DATA WITH THESE QUESTIONS
    if t <= 3:
        grp = '0-4'
    elif t <= 13:
        grp = '5-12'
    elif t <= 22:
        grp = '13-22'

    df = dict_modeling_info[grp]['df'].copy()
    FEATURES = dict_modeling_info[grp]['feature'].copy()
    # Same stacking features as in cross-validation: OOF predictions of earlier questions.
    list_feature_xgb = [f'q_{j}' for j in range(1, t)]
    # Per-question copy of the hyperparameters: the shared xgb_params dict is left
    # untouched so re-running the cross-validation cell still starts from 1500 trees.
    params_full = {**xgb_params, 'n_estimators': int(list_estimator_xgb[t-1])}

    # TRAIN DATA: attach this question's label; the inner join drops sessions without one.
    df = pd.merge(df, targets.query(f'q == {t}')[['correct']], left_index=True, right_index=True, how='inner')

    X_train = df[FEATURES].astype('float32')
    y_train = df['correct']

    # No eval_set / early stopping here: the tree count is fixed to the CV median.
    clf = XGBClassifier(**params_full)
    clf.fit(pd.merge(X_train, oof_xgb[list_feature_xgb], left_index=True, right_index=True, how='inner'), y_train, verbose=0)
    clf.save_model(PATH_MODEL_ALL_DATA.format(f'XGB_question{t}.xgb'))

    po_exe = time.time()

    # The original message ended with a stray double quote; removed.
    print(f'  question: {t}, elapsed time：{(po_exe - pre_exe):.2f} sec')

    del df, X_train, y_train, clf, FEATURES, params_full
    gc.collect()

  question: 1, elapsed time：15.04 sec"
  question: 2, elapsed time：8.87 sec"
  question: 3, elapsed time：9.78 sec"
  question: 4, elapsed time：33.56 sec"
  question: 5, elapsed time：30.84 sec"
  question: 6, elapsed time：27.35 sec"
  question: 7, elapsed time：28.00 sec"
  question: 8, elapsed time：19.60 sec"
  question: 9, elapsed time：26.73 sec"
  question: 10, elapsed time：22.02 sec"
  question: 11, elapsed time：23.57 sec"
  question: 12, elapsed time：22.54 sec"
  question: 13, elapsed time：27.69 sec"
  question: 14, elapsed time：55.73 sec"
  question: 15, elapsed time：52.08 sec"
  question: 16, elapsed time：50.77 sec"
  question: 17, elapsed time：39.94 sec"
  question: 18, elapsed time：55.70 sec"


In [78]:
import shutil

# Zip the experiment folder into ./gbdt_ensemble.zip; the cell displays the archive path.
# NOTE(review): this archives 'model/ensemble_for_psp/...', while the cells above write
# under 'model/blend_for_psp/...' — confirm the intended source folder.
archive_root = f'model/ensemble_for_psp/{experiment_name}'
shutil.make_archive('gbdt_ensemble', format='zip', root_dir=archive_root)

'/home/jovyan/kaggle/gbdt_ensemble.zip'