In [1]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
import time
import matplotlib.pyplot as plt
import cudf 

In [2]:
def flatten_sessions(valid_sessions_df : pd.DataFrame):
    sess_id_list = []
    product_list = []
    in_hist = []
    for idx, row in tqdm(valid_sessions_df.iterrows(), total=valid_sessions_df.shape[0]):
        sess_id = idx
        prev_items = eval(row['prev_items'].replace(' ', ','))
        for prev_item in prev_items:
            sess_id_list.append(sess_id)
            product_list.append(prev_item)
            in_hist.append(1)
    return pd.DataFrame({'sess_id' : sess_id_list, 'product' : product_list, 'in_hist' : in_hist})

# Valid Data

In [3]:
valid_candidates = pd.read_parquet('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_all_items.parquet')
valid_candidates_feature = pd.read_parquet('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_no_hist_feature.parquet')
valid_sessions = pd.read_csv('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv')

In [4]:
valid_sessions_flattened = flatten_sessions(valid_sessions)

100%|██████████| 361581/361581 [00:18<00:00, 19573.20it/s]


In [5]:
valid_sessions_flattened_g = cudf.from_pandas(valid_sessions_flattened)
valid_candidates_g = cudf.from_pandas(valid_candidates)

In [6]:
valid_candidates_merged_g = valid_candidates_g.merge(valid_sessions_flattened_g, how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
valid_candidates_merged = valid_candidates_merged_g
valid_candidates_merged['in_hist'] = valid_candidates_merged['in_hist'].fillna(0)

In [7]:
valid_candidates_merged['in_hist'].sum() / 361581

0.08067348671528647

In [8]:
del valid_sessions_flattened_g
del valid_candidates_g
del valid_candidates_merged_g

In [9]:
valid_sessions_flattened

Unnamed: 0,sess_id,product,in_hist
0,0,B09VSN9GLS,1
1,0,B09VSG9DCG,1
2,0,B0BJ5L1ZPH,1
3,0,B09VSN9GLS,1
4,0,B0BJ6V797Y,1
...,...,...,...
1520826,361580,B07N6J2M3K,1
1520827,361580,B081YZVG5L,1
1520828,361580,B011XK46F0,1
1520829,361580,B08427PFR5,1


In [11]:
valid_candidates

Unnamed: 0,sess_id,sess_locale,product,target
0,0,UK,B096FLR9LK,0.0
1,0,UK,B093SZNPZN,0.0
2,0,UK,B0856JQ3WJ,0.0
3,0,UK,B09XMX99F3,0.0
4,0,UK,B07L3L4PQH,0.0
...,...,...,...,...
77570148,361580,DE,B0013USA1M,0.0
77570149,361580,DE,B00O4TD1XG,0.0
77570150,361580,DE,B00816X8TU,0.0
77570151,361580,DE,B0857Z91PY,0.0


## calculate metrics

### recall

In [12]:
(valid_candidates['target'] == 1.0).sum() / (valid_candidates['sess_id'].max() + 1)

0.7176898122412405

### MRR and NDCG

In [5]:
candidates_features_sasrec = valid_candidates_feature[['sess_id', 'sess_locale', 'product', 'target', 'sasrec_normalized_scores_3']].copy()
candidates_features_sasrec

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_normalized_scores_3
0,0,UK,B000OPPVCS,0.0,2.517129e-04
1,0,UK,B000V599Y2,0.0,2.031618e-04
2,0,UK,B0018HH444,0.0,2.036883e-06
3,0,UK,B0079JI4DU,0.0,4.685961e-09
4,0,UK,B0079JI4EY,0.0,4.685961e-09
...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,0.0,3.403967e-05
84407335,361580,DE,B0BB7YSRBX,0.0,2.115080e-05
84407336,361580,DE,B0BB7ZMGY8,0.0,5.522656e-05
84407337,361580,DE,B0BD4CP7N3,0.0,4.433373e-10


In [6]:
candidates_features_sasrec.sort_values(by=['sess_id', 'sasrec_normalized_scores_3'], ascending=[True, False], inplace=True)
candidates_features_sasrec.reset_index(drop=True, inplace=True)
candidates_features_sasrec['n'] = candidates_features_sasrec.groupby('sess_id')['product'].cumcount()

In [9]:
candidates_features_sasrec_100 = candidates_features_sasrec[candidates_features_sasrec['n'] < 100]
candidates_features_sasrec_100.reset_index(drop=True, inplace=True)

In [10]:
rank = candidates_features_sasrec_100[candidates_features_sasrec_100['target'] == 1.0]['n'] + 1
rank = rank.to_numpy()

In [12]:
ndcg_100 = (1.0 / np.log2(rank + 1)).sum() / (candidates_features_sasrec['sess_id'].max() + 1) * 1.0

In [13]:
mrr_100 = (1.0 / rank).sum() / (candidates_features_sasrec['sess_id'].max() + 1) * 1.0

In [18]:
recall = (candidates_features_sasrec['target'] == 1.0).sum() / (candidates_features_sasrec['sess_id'].max() + 1)

In [19]:
ndcg_100, mrr_100, recall

(0.3841662541742146, 0.31367326245076066, 0.7080018031920925)

# Test Data

In [40]:
test_candidates = pd.read_parquet('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test.parquet')
test_sessions = pd.read_csv('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv')

In [42]:
test_sessions_flattened = flatten_sessions(test_sessions)

100%|██████████| 316971/316971 [00:16<00:00, 18988.56it/s]


In [46]:
test_sessions_flattened_g = cudf.from_pandas(test_sessions_flattened)
test_candidates_g = cudf.from_pandas(test_candidates)

In [48]:
test_candidates_merged_g = test_candidates_g.merge(test_sessions_flattened_g, how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
test_candidates_merged = test_candidates_merged_g
test_candidates_merged['in_hist'] = test_candidates_merged['in_hist'].fillna(0)

In [53]:
test_candidates_merged = test_candidates_merged.sort_values(by=['sess_id', 'product'])

In [50]:
test_candidates_merged['in_hist'].sum() / 316971

3.465111950304602

In [45]:
test_sessions_flattened.shape[0] / 316971

4.26263601402021

In [61]:
test_candidates_merged.query("sess_id == 0")[100:150]

Unnamed: 0,sess_id,sess_locale,product,in_hist
10378,0,DE,B08RJ6QGFV,0
5083,0,DE,B08TJTBSGP,0
50080,0,DE,B08V12CT4C,1
50081,0,DE,B08V12CT4C,1
58823,0,DE,B08V18K64V,0
51269,0,DE,B08V1KXBQD,1
51270,0,DE,B08V1KXBQD,1
24761,0,DE,B08WLXY7WK,0
24760,0,DE,B08WYRNVT6,0
24252,0,DE,B08XMTKDBX,0


In [59]:
test_sessions.iloc[0]['prev_items']

"['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC5PKN5' 'B09V7KG931'\n 'B09PY75FWM' 'B09PXYT6BT' 'B08V12CT4C' 'B08V1KXBQD' 'B08496TCCQ'\n 'B01BVG1XJS' 'B099NQFMG7']"

In [47]:
test_sessions_flattened

Unnamed: 0,sess_id,product,in_hist
0,0,B08V12CT4C,1
1,0,B08V1KXBQD,1
2,0,B01BVG1XJS,1
3,0,B09VC5PKN5,1
4,0,B09V7KG931,1
...,...,...,...
1351127,316969,B01MCQMORK,1
1351128,316969,B09JYZ325W,1
1351129,316970,B0B8JX92YJ,1
1351130,316970,B09TN4MP6V,1


In [43]:
test_candidates

Unnamed: 0,sess_id,sess_locale,product
0,0,DE,B088T84WGZ
1,0,DE,B095C1CHMQ
2,0,DE,B008SFS0CE
3,0,DE,B08LSDKWRW
4,0,DE,B0BFR6C5SW
...,...,...,...
70320668,316970,UK,B0BF4ZF65H
70320669,316970,UK,B09MLYQDMS
70320670,316970,UK,B0BJ9638ZX
70320671,316970,UK,B07ZWBPYWL
