In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

import xgboost as xgb 
from sklearn.model_selection import GroupKFold
import time

In [2]:
# Location of the merged candidate/feature parquet file.
# The default is the original machine-specific path; set the MERGED_CANDIDATES_PATH
# environment variable to run the notebook elsewhere without editing this cell.
merged_candidates_path = os.environ.get(
    'MERGED_CANDIDATES_PATH',
    '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_feature.parquet',
)

In [3]:
@lru_cache(maxsize=1)
def read_merged_candidates():
    """Load the merged candidate/feature table from ``merged_candidates_path``.

    The parquet file is read with the pyarrow engine. Because of the
    single-slot ``lru_cache``, repeated calls hand back the very same
    DataFrame object instead of re-reading the file, so any in-place change
    a caller makes to the frame is visible to every later caller.

    Returns:
        pandas.DataFrame: the contents of the parquet file.
    """
    merged_candidates = pd.read_parquet(merged_candidates_path, engine='pyarrow')
    return merged_candidates

In [4]:
# Load the merged candidate table (cached by read_merged_candidates, so re-running
# this cell returns the same DataFrame object rather than re-reading the file).
candidates_with_features = read_merged_candidates()

In [6]:
# Display the loaded table; pandas shows only the first and last rows of the frame.
candidates_with_features

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_normalized_scores,roberta_normalized_scores,co_graph_normalized_counts,co_graph_normalized_counts_0,co_graph_normalized_counts_1,co_graph_normalized_counts_2,product_freq,gru4rec_scores,sess_avg_price,product_price,sasrec_scores
0,0,UK,B000OPPVCS,0.0,0.000000,0.000000,0.002748,0.001718,0.004051,0.003190,104,6.484859,7.388571,7.280000,8.668667
1,0,UK,B000V599Y2,0.0,0.000298,0.000000,0.000000,0.000000,0.000000,0.000000,37,4.342063,7.388571,5.200000,11.681057
2,0,UK,B0018HH444,0.0,0.000000,0.000000,0.002061,0.001718,0.004051,0.001595,7,3.220763,7.388571,15.800000,4.629130
3,0,UK,B0079JI4DU,0.0,0.000000,0.000000,0.002404,0.001718,0.002026,0.003190,67,0.000000,7.388571,22.097065,0.000000
4,0,UK,B0079JI4EY,0.0,0.000000,0.000000,0.002748,0.001718,0.004051,0.003190,77,0.000000,7.388571,22.097065,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85447871,361580,DE,B0BB7XV97M,0.0,0.000016,0.000000,0.000000,0.000000,0.000000,0.000000,56,9.268379,32.424000,47.990002,9.548986
85447872,361580,DE,B0BB7YSRBX,0.0,0.000019,0.000000,0.000000,0.000000,0.000000,0.000000,58,7.047796,32.424000,43.990002,9.711373
85447873,361580,DE,B0BB7ZMGY8,0.0,0.000055,0.000000,0.000000,0.000000,0.000000,0.000000,452,9.359167,32.424000,41.990002,10.757690
85447874,361580,DE,B0BD4CP7N3,0.0,0.000000,0.006677,0.000000,0.000000,0.000000,0.000000,1,-0.593306,32.424000,24.990000,-5.042848


## Calculate baseline metrics (NDCG@100 and MRR@100 of the SASRec ranking)

In [None]:
# Independent copy holding only the columns needed to score the ranking induced
# by `sasrec_normalized_scores`; the full table is left untouched.
candidates_with_features_sasrec = candidates_with_features.loc[
    :,
    ['sess_id', 'sess_locale', 'product', 'target', 'sasrec_normalized_scores'],
].copy()
candidates_with_features_sasrec

In [None]:
# Order candidates session by session, highest SASRec score first.
candidates_with_features_sasrec = candidates_with_features_sasrec.sort_values(
    by=['sess_id', 'sasrec_normalized_scores'],
    ascending=[True, False],
)

In [None]:
# Replace the shuffled index left by the sort with a fresh 0..N-1 index.
candidates_with_features_sasrec = candidates_with_features_sasrec.reset_index(drop=True)

In [None]:
# 'n' is the 0-based position of each row inside its session, following the
# current row order (which the earlier sort made score-descending).
candidates_with_features_sasrec['n'] = (
    candidates_with_features_sasrec
    .groupby('sess_id')
    .cumcount()
)

In [None]:
# Keep at most the first 100 rows of every session (positions 0..99), then renumber.
candidates_with_features_sasrec = (
    candidates_with_features_sasrec
    .loc[lambda frame: frame['n'] < 100]
    .reset_index(drop=True)
)

In [None]:
# 1-based position of every row flagged as the target (target == 1.0) among the kept rows.
rank = candidates_with_features_sasrec.loc[
    candidates_with_features_sasrec['target'] == 1.0,
    'n',
] + 1

In [None]:
# Convert the pandas Series of ranks to a plain NumPy array for the metric formulas.
rank = rank.to_numpy()

In [None]:
# Each kept target contributes 1 / log2(rank + 1); targets outside the kept rows add nothing.
# The sum is averaged over `max(sess_id) + 1`.
# NOTE(review): that denominator equals the number of sessions only if sess_id
# runs 0..max without gaps - confirm against how sess_id is assigned.
ndcg_100 = np.sum(1.0 / np.log2(rank + 1)) / (candidates_with_features['sess_id'].max() + 1)

In [None]:
# Each kept target contributes 1 / rank; the sum is averaged over `max(sess_id) + 1`.
# NOTE(review): that denominator equals the number of sessions only if sess_id
# runs 0..max without gaps - confirm against how sess_id is assigned.
mrr_100 = np.sum(1.0 / rank) / (candidates_with_features['sess_id'].max() + 1)

In [None]:
# Show the two SASRec-only baseline metrics as a (ndcg_100, mrr_100) tuple.
ndcg_100, mrr_100

In [None]:
# Number of target rows (target == 1.0) that survived the `n < 100` cut.
len(rank)

In [None]:
# Inspect the sorted, top-100-truncated SASRec frame (including the helper column 'n').
candidates_with_features_sasrec

## Train XGBoost

In [6]:
# Cast columns on the loaded frame itself (no copy of the whole table is made):
# the label becomes int32 and the locale becomes a pandas categorical, which the
# training cell passes to XGBoost with enable_categorical=True.
for column_name, new_dtype in (('target', np.int32), ('sess_locale', 'category')):
    candidates_with_features[column_name] = candidates_with_features[column_name].astype(new_dtype)

In [7]:
# Model inputs: every column except the session key, the candidate id and the label.
FEATURES = set(candidates_with_features.columns)
for non_feature_column in ('sess_id', 'product', 'target'):
    # set.remove raises KeyError if an expected column is absent from the table.
    FEATURES.remove(non_feature_column)
FEATURES = sorted(FEATURES)  # deterministic column order from run to run

FOLDS = 5   # number of GroupKFold splits
SEED = 42   # forwarded to XGBoost as random_state
LR = 0.1    # forwarded to XGBoost as learning_rate

# XGB MODEL PARAMETERS
# NOTE(review): per the XGBoost parameter docs the trailing '-' in 'ndcg@100-' scores
# groups without any positive as 0 instead of 1 - confirm for the installed version.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' are the GPU settings of older XGBoost
# releases; check them against the installed version before upgrading.
xgb_parms = dict(
    max_depth=4,
    learning_rate=LR,
    subsample=0.7,
    colsample_bytree=0.5,
    eval_metric='ndcg@100-',
    objective='binary:logistic',
    scale_pos_weight=200,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=SEED,
)

In [8]:
# Session-grouped K-fold training: every session's candidates land entirely in
# either the train or the validation part of a fold. One booster is trained,
# logged and checkpointed per fold.
skf = GroupKFold(n_splits=FOLDS)

# Timestamp shared by the log file and all checkpoints of this run.
cur_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())

# Create the output folders up front so a missing directory cannot abort the run
# (in particular not at save_model time, after a fold has already been trained).
os.makedirs('./logs', exist_ok=True)
os.makedirs('./ckpt', exist_ok=True)
log_path = f'./logs/XGB_{cur_time}.log'

with open(log_path, 'a') as f:
    f.write('Using Features: \n')
    f.write(f'{str(FEATURES)}\n')
    f.write('XGBoost parameters : \n')
    for k, v in xgb_parms.items():
        f.write(f'{k} : {v} \n')


def _session_group_sizes(sess_ids):
    """Return the number of rows per session, in order of first appearance.

    The sizes are passed to DMatrix as `group`, so they have to follow the row
    order of the matrix. Grouping with sort=False keeps that order, whereas a
    sorted groupby would only match if the rows happened to be ordered by
    ascending sess_id.
    NOTE(review): this still assumes each session's rows are contiguous.
    """
    return sess_ids.groupby(sess_ids.to_numpy(), sort=False).size().to_numpy()


# GroupKFold.split yields positional row numbers, so rows are selected with
# .iloc; the feature columns are translated to positions once for that purpose.
feature_positions = candidates_with_features.columns.get_indexer(FEATURES)

for fold, (train_idx, valid_idx) in enumerate(
    skf.split(
        candidates_with_features,
        candidates_with_features['target'],
        groups=candidates_with_features['sess_id'],
    )
):

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print('#'*25)

    st_time = time.time()

    X_train = candidates_with_features.iloc[train_idx, feature_positions]
    y_train = candidates_with_features['target'].iloc[train_idx]
    sess_id_train = candidates_with_features['sess_id'].iloc[train_idx]
    group_size_train = _session_group_sizes(sess_id_train)

    X_valid = candidates_with_features.iloc[valid_idx, feature_positions]
    y_valid = candidates_with_features['target'].iloc[valid_idx]
    sess_id_valid = candidates_with_features['sess_id'].iloc[valid_idx]
    group_size_valid = _session_group_sizes(sess_id_valid)

    # `group` lets the ndcg evaluation metric score each session as one list.
    dtrain = xgb.DMatrix(X_train, y_train, group=group_size_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid, y_valid, group=group_size_valid, enable_categorical=True)

    # Receives the per-round metric history of both evaluation sets.
    res = {'train' : {'ndcg@100-' : []}, 'valid' : {'ndcg@100-' : []}}
    model = xgb.train(xgb_parms,
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=10000,
        early_stopping_rounds=200,
        evals_result=res,
        verbose_eval=100)

    ed_time = time.time()

    print(f'Running time : {(ed_time-st_time):.2f}s')

    with open(log_path, 'a') as f:
        f.write(f'Fold {fold+1}\n')
        f.write(f'Train size {len(train_idx)} Valid size {len(valid_idx)}\n')
        f.write(f'Running time {(ed_time-st_time):.2f}s\n')
        f.write(f'Best score : {model.best_score} Best iteration : {model.best_iteration}\n')

    # Checkpoint names use the 0-based fold index, while the log lines above are 1-based.
    model.save_model(f'./ckpt/XGB_{cur_time}_fold{fold}.xgb')

#########################
### Fold 1
### Train size 68358328 Valid size 17089548
#########################
[0]	train-ndcg@100-:0.32648	valid-ndcg@100-:0.32798
[100]	train-ndcg@100-:0.40100	valid-ndcg@100-:0.40369
[200]	train-ndcg@100-:0.40203	valid-ndcg@100-:0.40441
[300]	train-ndcg@100-:0.40254	valid-ndcg@100-:0.40475
[400]	train-ndcg@100-:0.40281	valid-ndcg@100-:0.40487
[500]	train-ndcg@100-:0.40311	valid-ndcg@100-:0.40482


KeyboardInterrupt: 

In [None]:
# Early-stopping summary of `model`, i.e. the booster assigned by the most recent
# fold of the training loop (earlier folds' boosters are overwritten in memory).
model.best_score, model.best_iteration