In [1]:
import json
import warnings

import numpy as np
import optuna
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')



In [2]:
def search_best_threshold(df_preds, df_true, start=0.6, end=0.7, step=0.005):
    """Return the best macro F1 score over a grid of probability thresholds.

    Both frames are flattened to 1-D, so every cell is scored as one sample.

    Args:
        df_preds: DataFrame of predicted probabilities.
        df_true: DataFrame of true labels, laid out like ``df_preds``.
        start: first threshold tried (inclusive).
        end: upper bound of the grid (exclusive, as in ``np.arange``).
        step: spacing between consecutive thresholds.

    Returns:
        The highest macro F1 found, or 0 if no threshold scores above 0.
    """
    flat_probs = df_preds.values.reshape((-1))
    flat_truth = df_true.values.reshape((-1))

    top_score = 0
    for cutoff in np.arange(start, end, step):
        # A prediction is positive only when strictly above the cutoff.
        labels = (flat_probs > cutoff).astype('int')
        current = f1_score(flat_truth, labels, average='macro')
        if current > top_score:
            top_score = current
    return top_score


def create_compare_data(df, df_target, all_users):
    """Build the ground-truth frame used when searching for the threshold.

    Args:
        df: DataFrame whose shape/index is reused; it is copied, not mutated.
        df_target: DataFrame with ``session``, ``q`` and ``correct`` columns.
        all_users: session ids, in the row order wanted for the result.

    Returns:
        A copy of ``df`` whose ``q_1`` .. ``q_18`` columns hold the ``correct``
        values of the matching question, ordered by ``all_users``.
    """
    df_true = df.copy()
    for question in range(1, 19):
        rows_for_question = df_target.loc[df_target.q == question]
        by_session = rows_for_question.set_index('session')
        # Re-order by all_users so values line up positionally with df_true.
        answers = by_session.loc[all_users]
        df_true[f'q_{question}'] = answers.correct.values
    return df_true

In [3]:
# Input locations on Kaggle; PATH_BASE is a template filled with a file name.
PATH_BASE = '/kaggle/input/psp-dataset-for-optuna/{}'
PATH_USR_FEATURE = PATH_BASE.format('use_features.json')
PATH_DATABASE = '/kaggle/input/optuna-database/optuna_study.db'


# Three feature frames, each indexed by session_id.
df1, df2, df3 = (
    pd.read_csv(PATH_BASE.format(csv_name), index_col='session_id')
    for csv_name in ('df1.csv', 'df2.csv', 'df3.csv')
)
# Target table, indexed by session.
targets = pd.read_csv(PATH_BASE.format('targets.csv'), index_col='session')

# JSON mapping of group key ('0-4', '5-12', '13-22') to the features to use.
with open(PATH_USR_FEATURE, mode='r') as fp:
    dict_use_feature = json.load(fp)

list_feautres_df1, list_feautres_df2, list_feautres_df3 = (
    dict_use_feature[group_key] for group_key in ('0-4', '5-12', '13-22')
)

In [4]:
# Per-group modelling inputs: the feature frame and the feature names to use.
dict_modeling_info = {
    group_key: {'df': frame, 'feature': feature_names}
    for group_key, frame, feature_names in (
        ('0-4', df1, list_feautres_df1),
        ('5-12', df2, list_feautres_df2),
        ('13-22', df3, list_feautres_df3),
    )
}

In [6]:
# Distinct session ids of df1; used later to order the ground-truth rows.
ALL_USERS = df1.index.unique()
print('We will train with', len(ALL_USERS), 'users info')

We will train with 23562 users info


In [7]:
def objective(trial):
    """Optuna objective: out-of-fold macro F1 of per-question LightGBM models.

    For each of the 18 questions, a classifier is trained on every fold of a
    5-fold split. Its inputs are the feature frame of the question's group
    plus the out-of-fold predictions already produced for earlier questions.
    The collected out-of-fold probabilities are then scored against the true
    answers with ``search_best_threshold``.

    Relies on notebook globals: ``df1``, ``targets``, ``dict_modeling_info``,
    ``ALL_USERS``, ``create_compare_data``, ``search_best_threshold`` and
    ``LGBMClassifier`` (which has to be imported from ``lightgbm``).

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        The best macro F1 score found over the searched thresholds.
    """
    n_questions = 18
    question_cols = [f'q_{i}' for i in range(1, n_questions + 1)]

    # Out-of-fold probabilities: one row per df1 session, one column per question.
    oof_pred = pd.DataFrame(
        np.zeros((df1.shape[0], n_questions)),
        columns=question_cols,
        index=df1.index,
    )
    kf = KFold(n_splits=5, shuffle=True, random_state=0)

    # suggest_float(..., log=True) is the non-deprecated form of suggest_loguniform.
    # The Optuna names 'leraning_rate' and 'alpha' are intentionally left
    # unchanged: they are the keys persisted in the study storage.
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': 1500,
        'random_state': 42,
        'learning_rate': trial.suggest_float('leraning_rate', 0.005, 0.1, log=True),
        'reg_alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'max_depth': trial.suggest_int("max_depth", 2, 8),
        'subsample': trial.suggest_categorical('subsample', [0.5, 0.6, 0.7, 0.8, 0.9]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample_freq': trial.suggest_int("subsample_freq", 0, 10),
        'num_leaves': trial.suggest_int("num_leaves", 10, 40),
    }

    for t in range(1, n_questions + 1):
        # Pick the modelling group for this question: 1-3, 4-13, then the rest.
        if t <= 3:
            grp = '0-4'
        elif t <= 13:
            grp = '5-12'
        else:
            grp = '13-22'

        # Group features plus the predictions of all earlier questions.
        feature_cols = list(dict_modeling_info[grp]['feature']) + question_cols[:t - 1]

        # Attach this question's target and the current out-of-fold predictions.
        # merge() returns a new frame, so the group frame itself is not modified.
        df = pd.merge(
            dict_modeling_info[grp]['df'],
            targets.query(f'q == {t}')[['correct']],
            left_index=True, right_index=True, how='inner',
        )
        df = pd.merge(df, oof_pred, left_index=True, right_index=True, how='inner')

        for train_idx, valid_idx in kf.split(df):
            df_train, df_valid = df.iloc[train_idx], df.iloc[valid_idx]

            X_train = df_train[feature_cols].astype('float32')
            y_train = df_train['correct']
            X_valid = df_valid[feature_cols].astype('float32')
            y_valid = df_valid['correct']

            # NOTE(review): the early_stopping_rounds / verbose fit keywords are
            # believed to be unsupported from LightGBM 4.0 on (callbacks replace
            # them) - confirm against the installed version before upgrading.
            clf = LGBMClassifier(**lgb_params)
            clf.fit(
                X_train, y_train,
                early_stopping_rounds=30,
                eval_metric='binary_logloss',
                eval_set=[(X_valid, y_valid)],
                verbose=0,
            )

            pred = clf.predict_proba(X_valid, num_iteration=clf.best_iteration_)[:, 1]

            # Write back by session label rather than by position: valid_idx
            # indexes the merged frame, whose row order follows the group frame
            # and is not guaranteed to match the row order of oof_pred (df1).
            oof_pred.loc[df_valid.index, f'q_{t}'] = pred

            del X_train, X_valid, y_train, y_valid, clf, pred

        # Release the merged frame before the next question builds a new one.
        del df

    df_true = create_compare_data(oof_pred, targets.reset_index(), ALL_USERS)
    return search_best_threshold(oof_pred, df_true)

In [8]:
# Maximise the objective; trials are stored in a SQLite file in the working
# directory, and an existing study with the same name is reused.
study = optuna.create_study(
    study_name="lgb-params",
    storage='sqlite:///optuna_study.db',
    direction='maximize',
    load_if_exists=True,
)

[I 2023-05-25 12:36:01,943] A new study created in RDB with name: lgb-params


In [None]:
# Search for at most 10 hours (timeout is given in seconds) with n_jobs=2.
study.optimize(objective, n_jobs=2, timeout=10 * 60 * 60)