In [1]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

import xgboost as xgb 
from sklearn.model_selection import GroupKFold
import time

In [2]:
# Location of the merged candidate/feature table (parquet).
# The MERGED_CANDIDATES_PATH environment variable, when set, overrides the
# default below so the notebook can run on machines with a different layout.
merged_candidates_path = os.environ.get(
    'MERGED_CANDIDATES_PATH',
    '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_feature.parquet',
)

In [3]:
@lru_cache(maxsize=1)
def read_merged_candidates():
    """Load the merged candidates table from ``merged_candidates_path``.

    The parquet file is read with the pyarrow engine.  Because of the
    single-slot ``lru_cache``, repeated calls return the very same DataFrame
    object instead of re-reading the file, so in-place edits made by a caller
    are visible to every later call.
    """
    frame = pd.read_parquet(merged_candidates_path, engine='pyarrow')
    return frame

In [40]:
# Load the full candidate table. The loader is wrapped in lru_cache, so
# re-running this cell returns the already-loaded frame instead of re-reading the file.
candidates_with_features = read_merged_candidates()

## Calculate metrics: SASRec baseline (NDCG@100, MRR@100)

In [6]:
# Work on an independent copy that holds only the columns needed to score the
# SASRec ranking; the copy keeps later edits away from the full table.
sasrec_columns = ['sess_id', 'sess_locale', 'product', 'target', 'sasrec_normalized_scores']
candidates_with_features_sasrec = candidates_with_features.loc[:, sasrec_columns].copy()
candidates_with_features_sasrec

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_normalized_scores
0,0,UK,B06XG1LZ6Z,1.0,0.354775
1,0,UK,B07C97X1VS,0.0,0.000480
2,0,UK,B01MYUDYP7,0.0,0.029109
3,0,UK,B09XQSP9LD,0.0,0.001922
4,0,UK,B09JRYQGY8,0.0,0.000000
...,...,...,...,...,...
85657530,361580,DE,B0855LJWKZ,0.0,0.000179
85657531,361580,DE,B097BSYMZF,0.0,0.000000
85657532,361580,DE,B0797QKH2W,0.0,0.000000
85657533,361580,DE,B017BJD7QG,0.0,0.000000


In [7]:
# Order candidates session by session, highest SASRec score first.
candidates_with_features_sasrec = candidates_with_features_sasrec.sort_values(
    by=['sess_id', 'sasrec_normalized_scores'],
    ascending=[True, False],
)

In [10]:
# Replace the current row labels with a fresh 0..N-1 index (old labels are dropped).
candidates_with_features_sasrec = candidates_with_features_sasrec.reset_index(drop=True)

In [12]:
# 0-based position of each row inside its session, following the current row order.
sessions = candidates_with_features_sasrec.groupby('sess_id')
candidates_with_features_sasrec['n'] = sessions.cumcount()

In [31]:
# Keep only the first 100 rows of every session (n is 0-based) and renumber
# the surviving rows from 0.
in_top_100 = candidates_with_features_sasrec['n'] < 100
candidates_with_features_sasrec = candidates_with_features_sasrec.loc[in_top_100].reset_index(drop=True)

In [33]:
# 1-based rank of every retained row whose target flag equals 1.0.
is_target = candidates_with_features_sasrec['target'] == 1.0
rank = candidates_with_features_sasrec.loc[is_target, 'n'] + 1

In [34]:
# Convert to a plain NumPy array for the vectorised metric formulas.
# np.asarray accepts both a Series and an ndarray, so this cell can be re-run
# safely (rank.to_numpy() raised AttributeError once rank was already an array).
rank = np.asarray(rank)

In [37]:
# NDCG@100: each hit at rank r contributes 1 / log2(r + 1); the sum is divided
# by max(sess_id) + 1, i.e. session ids are treated as covering 0..max.
# No IDCG normalisation is applied (presumably one target per session — confirm).
session_count = candidates_with_features['sess_id'].max() + 1
ndcg_100 = np.sum(1.0 / np.log2(rank + 1)) / session_count

In [38]:
# MRR@100: reciprocal rank of each hit, summed and divided by max(sess_id) + 1,
# so sessions with no hit among the retained rows add nothing to the numerator.
session_count = candidates_with_features['sess_id'].max() + 1
mrr_100 = np.sum(1.0 / rank) / session_count

In [39]:
# Display both metrics of the SASRec-score ranking side by side.
ndcg_100, mrr_100

(0.3762735117535992, 0.30919878699608405)

In [36]:
# Number of target rows that remain after the n < 100 cut.
len(rank)

229240

In [32]:
# Inspect the ranked candidate table, including the per-session position column `n`.
candidates_with_features_sasrec

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_normalized_scores,n
0,0,UK,B06XGDZVZR,0.0,0.388253,0
1,0,UK,B06XG1LZ6Z,1.0,0.354775,1
2,0,UK,B06XGD9VLV,0.0,0.092891,2
3,0,UK,B01MYUDYP7,0.0,0.029109,3
4,0,UK,B076PN1SKG,0.0,0.022821,4
...,...,...,...,...,...,...
36158095,361580,DE,B0976C4GSV,0.0,0.000009,95
36158096,361580,DE,B07VC79R5G,0.0,0.000009,96
36158097,361580,DE,B01LW4PT7T,0.0,0.000008,97
36158098,361580,DE,B07GNVHQKQ,0.0,0.000008,98


## Train XGBoost

In [41]:
# Store the label as a 32-bit integer. The column is replaced on the frame
# returned by the cached loader, so later calls to the loader see the cast column too.
candidates_with_features['target'] = candidates_with_features['target'].astype('int32')

In [42]:
# Columns that identify a row or hold the label; every other column is a model input.
NON_FEATURE_COLUMNS = ('sess_id', 'product', 'sess_locale', 'target')
missing_columns = [c for c in NON_FEATURE_COLUMNS if c not in candidates_with_features.columns]
if missing_columns:
    # Same failure mode as set.remove() on an absent column.
    raise KeyError(f'expected columns missing from candidates_with_features: {missing_columns}')

# Keep the DataFrame's own column order. Building the list from a set made the
# feature order depend on string hashing, which changes between kernel
# sessions, so the column order given to XGBoost was not reproducible even
# with a fixed SEED.
FEATURES = [c for c in candidates_with_features.columns if c not in NON_FEATURE_COLUMNS]
FOLDS = 5          # number of GroupKFold splits
SEED = 42          # passed to XGBoost as random_state
LR = 0.1           # XGBoost learning rate

# XGB MODEL PARAMETERS
xgb_parms = { 
    'max_depth': 4, 
    'learning_rate': LR, 
    'subsample': 0.7,
    'colsample_bytree': 0.5, 
    'eval_metric': 'ndcg@100-',
    'objective': 'rank:ndcg',
    # 'scale_pos_weight': 200,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'random_state': SEED
}

In [43]:
# Cross-validated training of the XGBoost ranker: one model per GroupKFold
# split, grouped by sess_id so a session never straddles train and validation.
# xgb, GroupKFold, np, pd, os and time come from the import cell at the top.


def _session_group_sizes(session_ids):
    """Return the row count of each consecutive run of equal session ids.

    XGBoost's ``group`` argument assigns rows to queries purely by position,
    so the sizes must follow the row order of the matrix they describe.

    Raises:
        ValueError: if a session id occurs in more than one run, because the
            ranking groups would then not correspond to whole sessions.
    """
    ids = np.asarray(session_ids)
    if ids.size == 0:
        return np.zeros(0, dtype=np.int64)
    # A run starts at row 0 and wherever the id differs from the previous row.
    run_starts = np.flatnonzero(np.r_[True, ids[1:] != ids[:-1]])
    sizes = np.diff(np.r_[run_starts, ids.size])
    if len(sizes) != pd.unique(ids).size:
        raise ValueError('rows of the same sess_id must be contiguous to build ranking groups')
    return sizes


skf = GroupKFold(n_splits=FOLDS)
# Single strftime call: the formatted timestamp must not be re-used as a format string.
cur_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())

# Make sure the output directories exist before the first (long) fold finishes.
os.makedirs('./logs', exist_ok=True)
os.makedirs('./ckpt', exist_ok=True)

# GroupKFold yields positional row indices, so rows are selected with .iloc;
# column positions are resolved once up front.
feature_positions = candidates_with_features.columns.get_indexer(FEATURES)
target_position = candidates_with_features.columns.get_loc('target')
session_position = candidates_with_features.columns.get_loc('sess_id')

fold_splits = skf.split(
    candidates_with_features,
    candidates_with_features['target'],
    groups=candidates_with_features['sess_id'],
)
for fold, (train_idx, valid_idx) in enumerate(fold_splits):

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print('#'*25)

    st_time = time.time()

    X_train = candidates_with_features.iloc[train_idx, feature_positions]
    y_train = candidates_with_features.iloc[train_idx, target_position]
    group_size_train = _session_group_sizes(
        candidates_with_features.iloc[train_idx, session_position])

    X_valid = candidates_with_features.iloc[valid_idx, feature_positions]
    y_valid = candidates_with_features.iloc[valid_idx, target_position]
    group_size_valid = _session_group_sizes(
        candidates_with_features.iloc[valid_idx, session_position])

    dtrain = xgb.DMatrix(X_train, y_train, group=group_size_train)
    dvalid = xgb.DMatrix(X_valid, y_valid, group=group_size_valid)

    # Filled by xgb.train with the metric history of both evaluation sets;
    # rebound every fold, so after the loop it holds the last fold only.
    res = {}
    model = xgb.train(xgb_parms,
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=10000,
        early_stopping_rounds=200,
        evals_result=res,
        verbose_eval=100)

    ed_time = time.time()

    print(f'Running time : {(ed_time-st_time):.2f}s')

    # Append mode: all folds of one run share a single log file.
    with open(f'./logs/XGB_{cur_time}.log', 'a') as f:
        f.write(f'Fold {fold+1}\n')
        f.write(f'Train size {len(train_idx)} Valid size {len(valid_idx)}\n')
        f.write(f'Running time {(ed_time-st_time):.2f}s\n')
        # Trailing newline keeps the next fold's record on its own line.
        f.write(f'Best score : {model.best_score} Best iteration : {model.best_iteration}\n')

    model.save_model(f'./ckpt/XGB_fold{fold}.xgb')

#########################
### Fold 1
### Train size 68526055 Valid size 17131480
#########################
[0]	train-ndcg@100-:0.32564	valid-ndcg@100-:0.32508
[100]	train-ndcg@100-:0.39547	valid-ndcg@100-:0.39425
[200]	train-ndcg@100-:0.39744	valid-ndcg@100-:0.39599
[300]	train-ndcg@100-:0.39802	valid-ndcg@100-:0.39640
[400]	train-ndcg@100-:0.39818	valid-ndcg@100-:0.39650
[500]	train-ndcg@100-:0.39833	valid-ndcg@100-:0.39669
[600]	train-ndcg@100-:0.39849	valid-ndcg@100-:0.39675
[700]	train-ndcg@100-:0.39863	valid-ndcg@100-:0.39684
[800]	train-ndcg@100-:0.39866	valid-ndcg@100-:0.39686
[900]	train-ndcg@100-:0.39877	valid-ndcg@100-:0.39691
[1000]	train-ndcg@100-:0.39888	valid-ndcg@100-:0.39688
[1100]	train-ndcg@100-:0.39897	valid-ndcg@100-:0.39689
[1200]	train-ndcg@100-:0.39907	valid-ndcg@100-:0.39698
[1300]	train-ndcg@100-:0.39915	valid-ndcg@100-:0.39696
[1400]	train-ndcg@100-:0.39926	valid-ndcg@100-:0.39697
[1500]	train-ndcg@100-:0.39931	valid-ndcg@100-:0.39700
[1600]	train-ndcg@100-:0.

In [103]:
# Early-stopping summary of the most recently trained model (`model` is rebound on every fold).
model.best_score, model.best_iteration

(0.3965147149407589, 268)

'2023_04_27_15_48_11'

In [99]:
# 'ndcg@100-' history of the 'train' evaluation set, as collected in `res` by the latest xgb.train call.
res['train']['ndcg@100-']

[0.38274415682679036,
 0.38525110401614965,
 0.3864662340732853,
 0.3856892701242305,
 0.3847776443043934,
 0.38533471812624565,
 0.38434389597064866,
 0.3838187058267055,
 0.3836032478157957,
 0.383860969838811,
 0.38428811920485106,
 0.38551229849663504,
 0.38553776543410123,
 0.38656335141508025,
 0.38709104003596845,
 0.38714139945797704,
 0.3875627586967175,
 0.3884293929313646,
 0.38817414351478147,
 0.38805425244998143,
 0.3877823724349577,
 0.38767259635477747,
 0.38758170136414066,
 0.3873067159366182,
 0.3880986438929723,
 0.38884141423857066,
 0.38935651750357986,
 0.389324921524964,
 0.38981563961057214,
 0.3896386485636532,
 0.38980044822079474,
 0.39007727124743324,
 0.3904067722967975,
 0.3907997942181276,
 0.3910714538401246,
 0.3911392480480035,
 0.3916225823765739,
 0.3915699056550614,
 0.3915298246535626,
 0.3913974298491467,
 0.39133137725701683,
 0.39138908247713744,
 0.3913098276311523,
 0.39123924995070547,
 0.3916864400482063,
 0.3921442214594338,
 0.39241222315