In [1]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [2]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def cal_next_item_freq(item_counter:Counter, session_df:pd.DataFrame):
    for sess in tqdm(session_df.itertuples(), total=session_df.shape[0]):
        next_item = sess.next_item
        item_counter[next_item] += 1 

# Merge test item frequency

In [4]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates_phase2/merged_candidates_150_test_feature.parquet'
raw_train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data_split/task13_4_task1_raw_train_sessions_phase2.csv'
raw_valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data_split/task13_4_task1_raw_valid_sessions_phase2.csv'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions_phase2.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'

In [5]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_raw_valid_sessions():
    return pd.read_csv(raw_valid_sessions_path)

@lru_cache(maxsize=1)
def read_raw_train_sessions():
    return pd.read_csv(raw_train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_train_sessions():
    return pd.read_csv(train_sessions_path)

In [6]:
merged_candidates_feature = read_merged_candidates_feature_test()

In [7]:
# valid_sessions_df = read_raw_valid_sessions()
# train_sessions_df = read_raw_train_sessions()

In [8]:
valid_sessions_df = read_valid_sessions()
train_sessions_df = read_train_sessions()

In [9]:
merged_candidates_feature['next_freq']

0            3.0
1           28.0
2           49.0
3            5.0
4            8.0
            ... 
96556030     0.0
96556031     1.0
96556032     7.0
96556033     2.0
96556034     6.0
Name: next_freq, Length: 96556035, dtype: float32

In [10]:
merged_candidates_feature['next_freq_']

0            3.0
1           37.0
2           99.0
3            6.0
4           10.0
            ... 
96556030     0.0
96556031     1.0
96556032     8.0
96556033     4.0
96556034     7.0
Name: next_freq_, Length: 96556035, dtype: float32

In [None]:
# test sessions are included in train and valid sessions
# item_counter = Counter()
# cal_next_item_freq(item_counter, train_sessions_df)
# cal_next_item_freq(item_counter, valid_sessions_df)


In [11]:
item_counter = Counter()
cal_next_item_freq(item_counter, train_sessions_df)
cal_next_item_freq(item_counter, valid_sessions_df)

100%|██████████| 3966659/3966659 [00:11<00:00, 334081.70it/s]
100%|██████████| 261816/261816 [00:01<00:00, 253234.13it/s]


In [12]:
item_counter.most_common(50)

[('B09QFPZ9B7', 229),
 ('B07QPV9Z7X', 217),
 ('B00NTCH52W', 213),
 ('B00CWNMV4G', 209),
 ('B07N8QY3YH', 188),
 ('B014I8SSD0', 185),
 ('B09QFPYX34', 180),
 ('B099DP3617', 173),
 ('B09QFJNDQX', 170),
 ('B019GNUT0C', 169),
 ('B0B2Q2VVGP', 165),
 ('B00HZV9WTM', 162),
 ('B01B8R6PF2', 162),
 ('B08GWS298V', 160),
 ('B07H27J698', 158),
 ('B08CN3G4N9', 155),
 ('B009ICDU2G', 153),
 ('B09QFN2DYJ', 152),
 ('B01N40PO2M', 152),
 ('B07CZ4DLCP', 150),
 ('B00MNV8E0C', 148),
 ('B07P95S37K', 146),
 ('B00NTCHCU2', 145),
 ('B0B1MPZWJG', 144),
 ('B08GYKNCCP', 144),
 ('B081FWVSG8', 142),
 ('B01MXLEVR7', 141),
 ('B078NPDRHL', 141),
 ('B015AOGP1S', 141),
 ('B0B2Q4ZRDW', 141),
 ('B09YCMWPF5', 140),
 ('B07QS4NMW6', 140),
 ('B07MLFBJG3', 140),
 ('B088FSHMQ3', 138),
 ('B01H1R0K68', 137),
 ('B09QFZ8KCB', 137),
 ('B082T6DHB6', 136),
 ('B09BTKNGN5', 136),
 ('B082T6P545', 135),
 ('B07H256MBK', 135),
 ('B09BQWLDXF', 135),
 ('B0931VRJT5', 134),
 ('B07PNL5STG', 133),
 ('B07TV22X9M', 132),
 ('B0875NB89J', 132),
 ('B01N75E

In [24]:
item_counter.most_common(50)

[('B014I8T0YQ', 88),
 ('B07QS4NMW6', 87),
 ('B01N75EALQ', 85),
 ('B09QFPZ9B7', 85),
 ('B003JKFEL8', 84),
 ('B00CWNMV4G', 84),
 ('B09YCMWPF5', 83),
 ('B07KXQX3S3', 83),
 ('B00MNV8E0C', 83),
 ('B014I8SSD0', 82),
 ('B09Z4RVLXN', 82),
 ('B017Q8ZVWK', 81),
 ('B08PY4KL8V', 80),
 ('B00NTCHCU2', 80),
 ('B014I8SIJY', 80),
 ('B08V4QCBNW', 79),
 ('B0B244R4KB', 78),
 ('B015OW3M1W', 78),
 ('B00NTCH52W', 78),
 ('B07232M876', 78),
 ('B082T6K8XX', 77),
 ('B07H256MBK', 77),
 ('B07QPV9Z7X', 76),
 ('B06XZH8SKF', 76),
 ('B07NWWLP5S', 75),
 ('B082T6DHB6', 75),
 ('B07RSCK4XS', 75),
 ('B093333693', 74),
 ('B07D9C8NP2', 74),
 ('B099K6PP31', 74),
 ('B0B23LW7NV', 73),
 ('B005FEGYCO', 72),
 ('B09YMV3TXX', 72),
 ('B095RTJH1M', 71),
 ('B00CWNMXQW', 71),
 ('B0B3CJHNZ5', 70),
 ('B09QFPYX34', 70),
 ('B082T6GVKJ', 70),
 ('B07XXZP2CK', 70),
 ('B00HZV9WTM', 69),
 ('B0B3CKH642', 69),
 ('B09BTKNGN5', 69),
 ('B078NPDRHL', 69),
 ('B07H27J698', 69),
 ('B07QQZD49D', 68),
 ('B09HYVGFZM', 68),
 ('B07D3M6892', 68),
 ('B00LH3DMUO

In [13]:
products, counts = zip(*item_counter.items())
item_freq_df = pd.DataFrame({'product' : products, 'next_freq_' : counts})

In [12]:
# item_freq_df_g = cudf.from_pandas(item_freq_df)
# merged_candidates_feature_g = cudf.from_pandas(merged_candidates_feature_test)

In [13]:
# merged_candidates_freq_g = merged_candidates_feature_g.merge(item_freq_df_g, how='left', left_on=['product'], right_on=['product'])
# merged_candidates_freq_g = merged_candidates_freq_g.sort_values(by=['sess_id', 'product']).reset_index(drop=True)
# merged_candidates_freq_g['product_freq'] = merged_candidates_freq_g['product_freq'].fillna(0)
# cast_dtype(merged_candidates_freq_g)

In [14]:
merged_candidates = merged_candidates_feature[['sess_id', 'product', 'sess_locale']]

In [15]:
merged_candidates_next_freq = merged_candidates.merge(item_freq_df, how='left', left_on=['product'], right_on=['product'])
merged_candidates_next_freq = merged_candidates_next_freq.sort_values(by=['sess_id', 'product']).reset_index(drop=True)
merged_candidates_next_freq['next_freq_'] = merged_candidates_next_freq['next_freq_'].fillna(0)

In [16]:
merged_candidates_feature['next_freq_'] = merged_candidates_next_freq['next_freq_']

In [55]:
# merged_candidates_freq = merged_candidates_freq_g.to_pandas()
cast_dtype(merged_candidates_feature, ['next_freq_'])
merged_candidates_feature.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [None]:
# del item_freq_df_g
# del merged_candidates_feature_g
# del merged_candidates_freq_g

In [18]:
merged_candidates_next_freq['next_freq_']

0             3.0
1            39.0
2           107.0
3             6.0
4            11.0
            ...  
96556030      0.0
96556031      1.0
96556032      9.0
96556033      4.0
96556034      7.0
Name: next_freq_, Length: 96556035, dtype: float64

In [56]:
merged_candidates_next_freq['next_freq_']

0            3.0
1           37.0
2           99.0
3            6.0
4           10.0
            ... 
96556030     0.0
96556031     1.0
96556032     8.0
96556033     4.0
96556034     7.0
Name: next_freq_, Length: 96556035, dtype: float64

In [57]:
merged_candidates_feature['next_freq_']

0            3.0
1           37.0
2           99.0
3            6.0
4           10.0
            ... 
96556030     0.0
96556031     1.0
96556032     8.0
96556033     4.0
96556034     7.0
Name: next_freq_, Length: 96556035, dtype: float32

In [51]:
merged_candidates_feature['next_freq_']

0             3.0
1            39.0
2           107.0
3             6.0
4            11.0
            ...  
96556030      0.0
96556031      1.0
96556032      9.0
96556033      4.0
96556034      7.0
Name: next_freq_, Length: 96556035, dtype: float32

In [30]:
merged_candidates_feature['next_freq']

0            3.0
1           28.0
2           49.0
3            5.0
4            8.0
            ... 
96556030     0.0
96556031     1.0
96556032     7.0
96556033     2.0
96556034     6.0
Name: next_freq, Length: 96556035, dtype: float64

In [13]:
merged_candidates_feature['next_freq']

0            3.0
1           30.0
2           57.0
3            5.0
4            9.0
            ... 
96556030     0.0
96556031     1.0
96556032     8.0
96556033     2.0
96556034     6.0
Name: next_freq, Length: 96556035, dtype: float32

In [19]:
merged_candidates_feature['next_freq_']

0             3.0
1            39.0
2           107.0
3             6.0
4            11.0
            ...  
96556030      0.0
96556031      1.0
96556032      9.0
96556033      4.0
96556034      7.0
Name: next_freq_, Length: 96556035, dtype: float32

In [20]:
merged_candidates_feature['product_freq']

0             11.0
1            123.0
2           1095.0
3             25.0
4             51.0
             ...  
96556030       4.0
96556031       5.0
96556032      44.0
96556033      23.0
96556034      24.0
Name: product_freq, Length: 96556035, dtype: float32

In [None]:
merged_candidates_next_freq

Unnamed: 0,sess_id,sess_locale,product,product_freq
0,0,DE,B000Q87D0Q,11.0
1,0,DE,B000QB30DW,114.0
2,0,DE,B004BIG55Q,1015.0
3,0,DE,B0053FTNQY,25.0
4,0,DE,B007QWII1S,44.0
...,...,...,...,...
96556030,316971,UK,B0B82N3CQQ,3.0
96556031,316971,UK,B0BB9NW3F3,5.0
96556032,316971,UK,B0BDMVKTQ3,41.0
96556033,316971,UK,B0BHW1D5VP,14.0


In [32]:
merged_candidates_feature['product_freq']

0             11.0
1            123.0
2           1095.0
3             25.0
4             51.0
             ...  
96556030       4.0
96556031       5.0
96556032      44.0
96556033      23.0
96556034      24.0
Name: product_freq, Length: 96556035, dtype: float32

In [33]:
merged_candidates_feature['next_freq']


0            3.0
1           28.0
2           49.0
3            5.0
4            8.0
            ... 
96556030     0.0
96556031     1.0
96556032     7.0
96556033     2.0
96556034     6.0
Name: next_freq, Length: 96556035, dtype: float32