In [42]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [43]:
# Input locations for the validation-candidate pass. All of them live under
# the same workspace root, so the root is spelled out only once.
_workspace_root = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23'
_task1_data_dir = _workspace_root + '/data_for_recstudio/task1_data'

merged_candidates_feature_path = _workspace_root + '/XGBoost/candidates/merged_candidates_no_hist_feature.parquet'
valid_sessions_path = _task1_data_dir + '/task13_4_task1_valid_sessions.csv'
train_sessions_path = _task1_data_dir + '/task13_4_task1_train_sessions.csv'
test_sessions_path = _workspace_root + '/raw_data/sessions_test_task1.csv'

In [44]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    """Load the merged candidate-feature parquet file as a pandas DataFrame.

    The file location is taken from the module-level
    ``merged_candidates_feature_path``. Because of ``lru_cache(maxsize=1)``
    the first result is kept and returned again on later calls; the cache is
    not refreshed if the path variable or the file on disk changes.
    """
    candidates = pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')
    return candidates

@lru_cache(maxsize=1)
def read_valid_sessions():
    """Load the CSV at ``valid_sessions_path`` as a pandas DataFrame.

    The result is memoised by ``lru_cache(maxsize=1)``, so repeated calls
    return the frame read by the first call.
    """
    sessions = pd.read_csv(valid_sessions_path)
    return sessions

@lru_cache(maxsize=1)
def read_train_sessions():
    """Load the CSV at ``train_sessions_path`` as a pandas DataFrame.

    The result is memoised by ``lru_cache(maxsize=1)``, so repeated calls
    return the frame read by the first call.
    """
    sessions = pd.read_csv(train_sessions_path)
    return sessions

In [45]:
def cast_dtype(df : pd.DataFrame):
    """Downcast the columns of ``df`` to 32-bit types, in place.

    The kind of each column is judged from its first value only:

    * float scalar   -> the column is cast to ``float32``
    * integer scalar -> the column is cast to ``int32``
    * ``list``       -> every cell becomes a NumPy array of ``float32`` or
      ``int32``, chosen from the first element of the first row's list

    Columns of any other kind are left untouched. An empty frame, or a list
    column whose first row is an empty list, is skipped instead of raising
    ``IndexError``.

    Args:
        df: frame to convert; its columns are reassigned in place.

    Returns:
        None.
    """
    # With no rows there is no first value to inspect.
    if len(df) == 0:
        return

    for k in df.columns:
        first = df[k].iloc[0]
        kind = str(type(first))
        if 'float' in kind:
            df[k] = df[k].astype('float32')
        elif 'int' in kind:
            # NOTE: int32 cannot represent values beyond +/- 2**31.
            df[k] = df[k].astype('int32')
        elif isinstance(first, list):
            if not first:
                # An empty first list gives no element type to go by.
                continue
            elem_kind = str(type(first[0]))
            if 'float' in elem_kind:
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in elem_kind:
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [46]:
def cal_item_freq(item_counter:Counter, session_df:pd.DataFrame, test=False):
    """Add the item occurrences found in ``session_df`` to ``item_counter``.

    Every item listed in a session's ``prev_items`` is counted once per
    occurrence; unless ``test`` is true, the session's ``next_item`` is
    counted as well.

    Args:
        item_counter: counter that is updated in place.
        session_df: sessions with a ``prev_items`` column holding the
            space-separated text form of a list (e.g. ``"['A' 'B']"``) and,
            when ``test`` is false, a ``next_item`` column.
        test: when true, ``next_item`` is not read.

    Returns:
        None.

    Raises:
        ValueError, SyntaxError: if a ``prev_items`` cell is not a plain
            Python literal once spaces are replaced by commas.
    """
    import ast  # local import: only this function needs it

    # Extract the columns once instead of building a row Series per session
    # through ``DataFrame.iloc``.
    prev_col = session_df['prev_items'].tolist()
    next_col = None if test else session_df['next_item'].tolist()

    for i in tqdm(range(len(prev_col))):
        # literal_eval accepts literals only, so a crafted cell cannot execute
        # code the way a bare ``eval`` would.
        prev_items = ast.literal_eval(prev_col[i].replace(' ', ','))
        for item in prev_items:
            item_counter[item] += 1
        if next_col is not None:
            item_counter[next_col[i]] += 1

# Merge valid item frequency

In [47]:
# Load the validation and training session tables through the cached readers.
valid_sessions_df = read_valid_sessions()
train_sessions_df = read_train_sessions()

In [48]:
# Load the candidate rows (read from merged_candidates_feature_path, cached).
merged_candidates_feature = read_merged_candidates_feature()

In [8]:
# Item frequencies for the validation candidates:
#   - train sessions contribute prev_items and next_item;
#   - valid sessions contribute prev_items only (test=True skips next_item).
item_counter = Counter()
cal_item_freq(item_counter, train_sessions_df, test=False)
cal_item_freq(item_counter, valid_sessions_df, test=True)

100%|██████████| 3557898/3557898 [03:57<00:00, 15003.09it/s]
100%|██████████| 361581/361581 [00:17<00:00, 20929.59it/s]


In [9]:
# Ten most frequent items with their counts.
item_counter.most_common(10)

[('B07QPV9Z7X', 2993),
 ('B0BD5MFPMF', 2851),
 ('B01MXLEVR7', 2383),
 ('B09NQGVSPD', 2294),
 ('B08CN3G4N9', 2271),
 ('B08GWS298V', 2234),
 ('B00NTCH52W', 2202),
 ('B0BDML9477', 2156),
 ('B0BD88WWQ8', 2106),
 ('B07RHT52HX', 2101)]

In [10]:
# Turn the counter into a two-column frame: one row per item with its count.
# Keys and values are taken separately so that an empty counter produces an
# empty frame; unpacking zip(*items) raises ValueError when there are no items.
products = tuple(item_counter.keys())
counts = tuple(item_counter.values())
item_freq_df = pd.DataFrame({'product' : products, 'product_freq' : counts})

In [11]:
# Convert both pandas frames to cuDF DataFrames for the merge below.
item_freq_df_g = cudf.from_pandas(item_freq_df)
merged_candidates_feature_g = cudf.from_pandas(merged_candidates_feature)

In [12]:
# Attach the per-item frequency to every candidate row.
# The result of this pipeline is written back to the same parquet file it was
# read from, so the input may already carry a 'product_freq' column from an
# earlier run. Drop it first; otherwise the merge yields 'product_freq_x' /
# 'product_freq_y' and the fillna line below fails with KeyError.
merged_candidates_freq_g = (
    merged_candidates_feature_g
    .drop(columns=['product_freq'], errors='ignore')
    .merge(item_freq_df_g, how='left', on='product')
    # Fix the row order explicitly after the merge.
    .sort_values(by=['sess_id', 'product'])
    .reset_index(drop=True)
)
# Candidates that never occur in the counted sessions get frequency 0.
merged_candidates_freq_g['product_freq'] = merged_candidates_freq_g['product_freq'].fillna(0)
cast_dtype(merged_candidates_freq_g)

In [22]:
# Bring the merged frame back to pandas and write it to parquet.
# NOTE(review): the target is merged_candidates_feature_path, i.e. the file the
# candidates were read from, so the input is overwritten in place. The
# lru_cache on read_merged_candidates_feature() keeps returning the frame that
# was loaded before this write.
merged_candidates_freq = merged_candidates_freq_g.to_pandas()
merged_candidates_freq.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [35]:
# Drop the names of the three cuDF frames; they are not needed past this point.
del item_freq_df_g, merged_candidates_feature_g, merged_candidates_freq_g

In [49]:
# Inspect the candidate frame returned by read_merged_candidates_feature().
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_scores_2,sasrec_normalized_scores_2,product_freq,gru4rec_scores,gru4rec_normalized_scores
0,0,UK,B000OPPVCS,0.0,11.972421,2.286162e-04,104,6.484859,3.816029e-05
1,0,UK,B000V599Y2,0.0,13.152878,7.443427e-04,37,4.342063,4.477209e-06
2,0,UK,B0018HH444,0.0,5.606023,3.928400e-07,7,3.220763,1.458925e-06
3,0,UK,B0079JI4DU,0.0,0.000000,1.443945e-09,67,0.000000,5.824698e-08
4,0,UK,B0079JI4EY,0.0,0.000000,1.443945e-09,77,0.000000,5.824698e-08
...,...,...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,0.0,9.117821,6.077226e-05,56,9.268379,1.396883e-05
84407335,361580,DE,B0BB7YSRBX,0.0,9.163816,6.363281e-05,58,7.047796,1.516259e-06
84407336,361580,DE,B0BB7ZMGY8,0.0,11.256460,5.158278e-04,452,9.359167,1.529639e-05
84407337,361580,DE,B0BD4CP7N3,0.0,-3.778687,1.523433e-10,1,-0.593306,7.282568e-10


In [23]:
# Inspect the candidate frame with the merged 'product_freq' column.
merged_candidates_freq

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_scores_2,sasrec_normalized_scores_2,product_freq
0,0,UK,B000OPPVCS,0.0,11.972421,2.286162e-04,104
1,0,UK,B000V599Y2,0.0,13.152878,7.443427e-04,37
2,0,UK,B0018HH444,0.0,5.606023,3.928400e-07,7
3,0,UK,B0079JI4DU,0.0,0.000000,1.443945e-09,67
4,0,UK,B0079JI4EY,0.0,0.000000,1.443945e-09,77
...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,0.0,9.117821,6.077226e-05,56
84407335,361580,DE,B0BB7YSRBX,0.0,9.163816,6.363281e-05,58
84407336,361580,DE,B0BB7ZMGY8,0.0,11.256460,5.158278e-04,452
84407337,361580,DE,B0BD4CP7N3,0.0,-3.778687,1.523433e-10,1


# Merge test item frequency

In [24]:
# Input locations for the test-candidate pass. All of them live under the same
# workspace root, so the root is spelled out only once.
_workspace_root = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23'
_task1_data_dir = _workspace_root + '/data_for_recstudio/task1_data'

merged_candidates_feature_test_path = _workspace_root + '/XGBoost/candidates/merged_candidates_test_no_hist_feature.parquet'
valid_sessions_path = _task1_data_dir + '/task13_4_task1_valid_sessions.csv'
train_sessions_path = _task1_data_dir + '/task13_4_task1_train_sessions.csv'
test_sessions_path = _workspace_root + '/raw_data/sessions_test_task1.csv'

In [25]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    """Load the test candidate-feature parquet file as a pandas DataFrame.

    The file location is taken from the module-level
    ``merged_candidates_feature_test_path``. ``lru_cache(maxsize=1)`` keeps
    the first result, so later calls return that same frame.
    """
    candidates = pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')
    return candidates

@lru_cache(maxsize=1)
def read_valid_sessions():
    """Load the CSV at ``valid_sessions_path`` as a pandas DataFrame.

    A function of the same name is defined earlier in this notebook; running
    this cell replaces it with a new function object whose cache starts empty.
    The result is memoised by ``lru_cache(maxsize=1)``.
    """
    sessions = pd.read_csv(valid_sessions_path)
    return sessions

@lru_cache(maxsize=1)
def read_train_sessions():
    """Load the CSV at ``train_sessions_path`` as a pandas DataFrame.

    A function of the same name is defined earlier in this notebook; running
    this cell replaces it with a new function object whose cache starts empty.
    The result is memoised by ``lru_cache(maxsize=1)``.
    """
    sessions = pd.read_csv(train_sessions_path)
    return sessions

@lru_cache(maxsize=1)
def read_test_sessions():
    """Load the CSV at ``test_sessions_path`` as a pandas DataFrame.

    The result is memoised by ``lru_cache(maxsize=1)``, so repeated calls
    return the frame read by the first call.
    """
    sessions = pd.read_csv(test_sessions_path)
    return sessions

In [26]:
# Load the test candidates and the three session tables through the cached readers.
# NOTE(review): test_sessions_df is not referenced again in the visible cells.
merged_candidates_feature_test = read_merged_candidates_feature_test()
valid_sessions_df = read_valid_sessions()
train_sessions_df = read_train_sessions()
test_sessions_df = read_test_sessions()

In [27]:
# test sessions are included in train and valid sessions
item_counter = Counter()
cal_item_freq(item_counter, train_sessions_df, test=False)
# cal_item_freq(item_counter, valid_sessions_df, test=True)
cal_item_freq(item_counter, valid_sessions_df, test=False)


100%|██████████| 3557898/3557898 [03:13<00:00, 18392.68it/s]
100%|██████████| 361581/361581 [00:19<00:00, 18164.78it/s]


In [28]:
# Turn the counter into a two-column frame: one row per item with its count.
# Keys and values are taken separately so that an empty counter produces an
# empty frame; unpacking zip(*items) raises ValueError when there are no items.
products = tuple(item_counter.keys())
counts = tuple(item_counter.values())
item_freq_df = pd.DataFrame({'product' : products, 'product_freq' : counts})

In [30]:
# Convert the frequency table and the test candidates to cuDF DataFrames.
item_freq_df_g = cudf.from_pandas(item_freq_df)
merged_candidates_feature_g = cudf.from_pandas(merged_candidates_feature_test)

In [32]:
# Attach the per-item frequency to every test candidate row.
# The result of this pipeline is written back to the same parquet file it was
# read from, so the input may already carry a 'product_freq' column from an
# earlier run. Drop it first; otherwise the merge yields 'product_freq_x' /
# 'product_freq_y' and the fillna line below fails with KeyError.
merged_candidates_freq_g = (
    merged_candidates_feature_g
    .drop(columns=['product_freq'], errors='ignore')
    .merge(item_freq_df_g, how='left', on='product')
    # Fix the row order explicitly after the merge.
    .sort_values(by=['sess_id', 'product'])
    .reset_index(drop=True)
)
# Candidates that never occur in the counted sessions get frequency 0.
merged_candidates_freq_g['product_freq'] = merged_candidates_freq_g['product_freq'].fillna(0)
cast_dtype(merged_candidates_freq_g)

In [34]:
# Bring the merged frame back to pandas and write it to parquet.
# NOTE(review): the target is merged_candidates_feature_test_path, i.e. the file
# the test candidates were read from, so the input is overwritten in place. The
# lru_cache on read_merged_candidates_feature_test() keeps returning the frame
# that was loaded before this write.
merged_candidates_freq = merged_candidates_freq_g.to_pandas()
merged_candidates_freq.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [None]:
# Drop the names of the three cuDF frames; they are not needed past this point.
del item_freq_df_g, merged_candidates_feature_g, merged_candidates_freq_g

In [41]:
# Inspect the test candidate frame with the merged 'product_freq' column.
merged_candidates_freq

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq
0,0,DE,4088833651,0.000000,2.975813e-09,0.000000,1.580065e-09,828
1,0,DE,B000H6W2GW,0.000000,2.975813e-09,0.000000,1.580065e-09,875
2,0,DE,B000JG2RAG,7.665308,6.347557e-06,8.104032,5.226502e-06,24
3,0,DE,B000RYSOUW,-2.951060,1.555882e-10,-2.857798,9.068785e-11,5
4,0,DE,B000UGZVQM,3.977920,1.589257e-07,4.688567,1.717488e-07,4
...,...,...,...,...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,11.327528,1.041200e-04,10.629994,3.818184e-04,74
69428427,316970,UK,B0BJTQQWLG,5.604142,3.403292e-07,6.052083,3.923694e-06,6
69428428,316970,UK,B0BJV3RL4H,9.146974,1.176336e-05,7.667603,1.973815e-05,7
69428429,316970,UK,B0BK7SPC84,-10.383047,3.879279e-14,-6.356799,1.601719e-11,0


# Candidate statistics

In [65]:
# Rebinds merged_candidates_feature_test to a different parquet file
# (merged_candidates_feature_test_2.parquet), read directly rather than through
# read_merged_candidates_feature_test() / merged_candidates_feature_test_path.
merged_candidates_feature_test = pd.read_parquet('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_feature_test_2.parquet', engine='pyarrow')

In [66]:
# Inspect the test candidate frame loaded in the previous cell.
merged_candidates_feature_test

Unnamed: 0,sess_id,sess_locale,product,sasrec_normalized_scores,roberta_normalized_scores,co_graph_normalized_counts,co_graph_normalized_counts_0,co_graph_normalized_counts_1,co_graph_normalized_counts_2,product_freq
0,0,DE,B09T2ZXL4V,0.000357,0.008733,0.005862,0.012821,0.000000,0.000000,118
1,0,DE,B08QYYBTMC,0.002867,0.000000,0.012702,0.012821,0.007380,0.014286,574
2,0,DE,B0B7S7LBMB,0.000042,0.000000,0.019052,0.032051,0.011070,0.007143,551
3,0,DE,B0BD7MGXMM,0.000378,0.000000,0.002931,0.006410,0.000000,0.000000,101
4,0,DE,B000H6W2GW,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,873
...,...,...,...,...,...,...,...,...,...,...
70403346,316970,UK,B0BGMDVM7V,0.000011,0.014775,0.000000,0.000000,0.000000,0.000000,26
70403347,316970,UK,B09FSZNRD9,0.000000,0.007841,0.000000,0.000000,0.000000,0.000000,20
70403348,316970,UK,B0B7838HH6,0.000000,0.000000,0.038889,0.058824,0.033333,0.030303,169
70403349,316970,UK,B08TJRVWV1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,655


In [67]:
# Share of test candidate rows whose 'roberta_normalized_scores' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature_test['roberta_normalized_scores'] == 0.0).mean()

0.5497605930717702

In [68]:
# Share of test candidate rows whose 'sasrec_normalized_scores' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature_test['sasrec_normalized_scores'] == 0.0).mean()

0.549195776206732

In [69]:
# Share of test candidate rows whose 'co_graph_normalized_counts_0' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature_test['co_graph_normalized_counts_0'] == 0.0).mean()

0.6944105402028378

In [70]:
# Share of test candidate rows whose 'co_graph_normalized_counts_1' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature_test['co_graph_normalized_counts_1'] == 0.0).mean()

0.7542271389894495

In [71]:
# Fetch the validation candidates through the cached reader.
# NOTE(review): the cells below query columns such as 'roberta_normalized_scores'
# that do not appear in the frame displayed earlier from this reader, so the
# path or cache state presumably differed when they were run — confirm.
merged_candidates_feature = read_merged_candidates_feature()

In [72]:
# Share of candidate rows whose 'roberta_normalized_scores' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature['roberta_normalized_scores'] == 0.0).mean()

0.57642176930686

In [73]:
# Share of candidate rows whose 'sasrec_normalized_scores' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature['sasrec_normalized_scores'] == 0.0).mean()

0.5760398531609195

In [74]:
# Share of candidate rows whose 'co_graph_normalized_counts_0' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature['co_graph_normalized_counts_0'] == 0.0).mean()

0.7768674242952722

In [75]:
# Share of candidate rows whose 'co_graph_normalized_counts_1' is exactly 0.0.
# .mean() divides by the frame's actual row count rather than a hard-coded one.
(merged_candidates_feature['co_graph_normalized_counts_1'] == 0.0).mean()

0.8249585341418147

In [15]:
# Spot check: every candidate row for the single product 'B09P2YPF3H'.
merged_candidates_feature[merged_candidates_feature['product'] == 'B09P2YPF3H']

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_normalized_scores,roberta_normalized_scores,co_graph_normalized_counts,co_graph_normalized_counts_0,co_graph_normalized_counts_1,co_graph_normalized_counts_2,product_freq
155143,649,UK,B09P2YPF3H,0.0,0.0,0.008176,0.000000,0.000000,0.0,0.0,7
306835,1287,UK,B09P2YPF3H,0.0,0.0,0.007995,0.000000,0.000000,0.0,0.0,7
335697,1407,UK,B09P2YPF3H,0.0,0.0,0.000000,0.014634,0.028571,0.0,0.0,7
353023,1482,UK,B09P2YPF3H,0.0,0.0,0.009932,0.027650,0.064516,0.0,0.0,7
373576,1568,UK,B09P2YPF3H,0.0,0.0,0.009154,0.000000,0.000000,0.0,0.0,7
...,...,...,...,...,...,...,...,...,...,...,...
85008145,358834,UK,B09P2YPF3H,0.0,0.0,0.005578,0.000000,0.000000,0.0,0.0,7
85240941,359819,UK,B09P2YPF3H,0.0,0.0,0.007549,0.000000,0.000000,0.0,0.0,7
85330404,360195,UK,B09P2YPF3H,0.0,0.0,0.010609,0.000000,0.000000,0.0,0.0,7
85332443,360203,UK,B09P2YPF3H,0.0,0.0,0.007566,0.000000,0.000000,0.0,0.0,7
