In [1]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [19]:
def cast_dtype(df : pd.DataFrame, columns=None):
    """Downcast numeric columns of ``df`` to 32-bit types, in place.

    The kind of each column is inferred from the type of its first value:

    * type name contains ``'float'`` -> column cast to ``float32``
    * type name contains ``'int'``   -> column cast to ``int32``
    * exactly ``list``               -> every cell converted to a NumPy array
      of ``float32`` / ``int32``, chosen from the first element of the first
      non-empty list in the column
    * anything else is left untouched

    Args:
        df: frame to modify; the selected columns are reassigned on it.
        columns: iterable of column names to process. ``None`` means every
            column of ``df``.

    Returns:
        None. ``df`` is mutated.
    """
    if columns is None:
        columns = df.columns
    # An empty frame has no first row to sample a type from; nothing to cast.
    if len(df) == 0:
        return
    for k in columns:
        # Read the first cell from the column itself rather than materialising
        # the whole first row with df.iloc[0].
        first_value = df[k].iloc[0]
        type_name = str(type(first_value))
        if 'float' in type_name:
            df[k] = df[k].astype('float32')
        elif 'int' in type_name:
            df[k] = df[k].astype('int32')
        elif type(first_value) == list:
            # The first list may be empty, so look for the first list that
            # actually has an element to inspect.
            sample = next(
                (cell for cell in df[k] if type(cell) == list and len(cell) > 0),
                None,
            )
            if sample is None:
                # Only empty lists: the element type is unknown, leave as is.
                continue
            elem_name = str(type(sample[0]))
            if 'float' in elem_name:
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in elem_name:
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def cal_item_freq(item_counter:Counter, session_df:pd.DataFrame, test=False):
    """Add item occurrence counts from a sessions frame to ``item_counter``.

    For every row, each item parsed out of the ``prev_items`` string is
    counted once per occurrence. Unless ``test`` is true, the row's
    ``next_item`` is counted as well.

    Args:
        item_counter: counter updated in place.
        session_df: frame with a ``prev_items`` column and, when ``test`` is
            false, a ``next_item`` column.
        test: when true, ``next_item`` is not read.

    Returns:
        None.
    """
    n_sessions = len(session_df)
    for row in tqdm(session_df.itertuples(), total=n_sessions):
        # Spaces are turned into commas so the string evaluates as a Python
        # list (presumably the column holds a space-separated array repr).
        # NOTE(review): eval() runs whatever expression the column contains;
        # only use this on session files you trust.
        history = eval(row.prev_items.replace(' ', ','))
        for item_id in history:
            item_counter[item_id] += 1
        if test:
            continue
        item_counter[row.next_item] += 1

In [4]:
def cal_next_item_freq(item_counter:Counter, session_df:pd.DataFrame):
    """Count only the ``next_item`` of every row of ``session_df``.

    Args:
        item_counter: counter updated in place.
        session_df: frame with a ``next_item`` column.

    Returns:
        None.
    """
    n_sessions = len(session_df)
    for row in tqdm(session_df.itertuples(), total=n_sessions):
        item_counter[row.next_item] += 1

# Merge test item frequency

In [5]:
# Root of the project workspace. The default is the original machine-specific
# location; set the KDDCUP23_WORKSPACE environment variable to run the notebook
# from a different checkout without editing every path below.
workspace_dir = os.environ.get(
    'KDDCUP23_WORKSPACE',
    '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23',
).rstrip('/')

# Candidate feature table (read at the start and written back at the end).
merged_candidates_feature_test_path = f'{workspace_dir}/XGBoost/candidates_phase2/merged_candidates_150_test_feature.parquet'
# Session files used to compute the item frequencies.
valid_sessions_path = f'{workspace_dir}/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'
train_sessions_path = f'{workspace_dir}/data_for_recstudio/task1_data/task13_4_task1_train_sessions_phase2.csv'
test_sessions_path = f'{workspace_dir}/raw_data/sessions_test_task1_phase2.csv'

In [6]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    """Load the merged test-candidate feature table from parquet.

    The result is memoised with ``lru_cache``, so repeated calls return the
    same DataFrame object; in-place changes made by a caller are therefore
    visible to every later call.
    """
    feature_table = pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')
    return feature_table

@lru_cache(maxsize=1)
def read_valid_sessions():
    """Load the validation sessions CSV; memoised, so the file is read once."""
    valid_sessions = pd.read_csv(valid_sessions_path)
    return valid_sessions

@lru_cache(maxsize=1)
def read_train_sessions():
    """Load the training sessions CSV; memoised, so the file is read once."""
    train_sessions = pd.read_csv(train_sessions_path)
    return train_sessions

@lru_cache(maxsize=1)
def read_test_sessions():
    """Load the test sessions CSV; memoised, so the file is read once."""
    test_sessions = pd.read_csv(test_sessions_path)
    return test_sessions

In [8]:
# Load the candidate feature table and the three session files through the
# cached readers, so re-running this cell does not re-read the files.
merged_candidates_feature = read_merged_candidates_feature_test()
valid_sessions_df = read_valid_sessions()
train_sessions_df = read_train_sessions()
test_sessions_df = read_test_sessions()

In [9]:
# The test sessions are included in the train and valid sessions (author's note).
# Build one frequency table: for the train sessions count every item in
# prev_items plus next_item; for the valid sessions count only next_item.
item_counter = Counter()
cal_item_freq(item_counter, train_sessions_df, test=False)
cal_next_item_freq(item_counter, valid_sessions_df)
# Disabled alternatives for the valid sessions:
# cal_item_freq(item_counter, valid_sessions_df, test=True)
# cal_item_freq(item_counter, valid_sessions_df, test=False)

100%|██████████| 3966659/3966659 [01:56<00:00, 33989.45it/s]
100%|██████████| 261816/261816 [00:00<00:00, 295398.96it/s]


In [None]:
# cal_item_freq(item_counter, test_sessions_df, test=True)

In [10]:
# One row per distinct item with its accumulated count.
# Keys and values are taken separately instead of `zip(*item_counter.items())`,
# which cannot be unpacked into two names when the counter is empty.
products = tuple(item_counter.keys())
counts = tuple(item_counter.values())
item_freq_df = pd.DataFrame({'product' : products, 'product_freq' : counts})

In [None]:
# item_freq_df_g = cudf.from_pandas(item_freq_df)
# merged_candidates_feature_g = cudf.from_pandas(merged_candidates_feature_test)

In [None]:
# merged_candidates_freq_g = merged_candidates_feature_g.merge(item_freq_df_g, how='left', left_on=['product'], right_on=['product'])
# merged_candidates_freq_g = merged_candidates_freq_g.sort_values(by=['sess_id', 'product']).reset_index(drop=True)
# merged_candidates_freq_g['product_freq'] = merged_candidates_freq_g['product_freq'].fillna(0)
# cast_dtype(merged_candidates_freq_g)

In [11]:
# Keep only the session/product key columns for the frequency merge below
# (presumably so the merge does not carry every feature column along).
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]

In [12]:
# Attach the per-item frequency to every candidate row, then put the rows in
# (sess_id, product) order with a fresh 0..n-1 index.
merged_candidates_freq = (
    merged_candidates
    .merge(item_freq_df, how='left', on='product')
    .sort_values(by=['sess_id', 'product'])
    .reset_index(drop=True)
)
# Candidates that never appear in the counted sessions get frequency 0.
merged_candidates_freq['product_freq'] = merged_candidates_freq['product_freq'].fillna(0)

In [14]:
# Inspect the recomputed frequencies produced by the merge.
merged_candidates_freq['product_freq']

0             11.0
1            125.0
2           1103.0
3             25.0
4             52.0
             ...  
96556030       4.0
96556031       5.0
96556032      45.0
96556033      23.0
96556034      24.0
Name: product_freq, Length: 96556035, dtype: float64

In [13]:
# Inspect the product_freq column of the loaded feature table, for comparison
# with the recomputed values.
merged_candidates_feature['product_freq']

0             11.0
1            123.0
2           1095.0
3             25.0
4             51.0
             ...  
96556030       4.0
96556031       5.0
96556032      44.0
96556033      23.0
96556034      24.0
Name: product_freq, Length: 96556035, dtype: float32

In [15]:
# Assigning a Series to a column aligns on index labels, not on row position.
# merged_candidates_freq was sorted by (sess_id, product) and re-indexed, so the
# assignment is only correct when merged_candidates_feature has the same index
# and the same key order. Verify that instead of silently attaching
# frequencies to the wrong candidate rows.
_rows_aligned = (
    len(merged_candidates_freq) == len(merged_candidates_feature)
    and merged_candidates_freq.index.equals(merged_candidates_feature.index)
    and all(
        (merged_candidates_freq[key].to_numpy() == merged_candidates_feature[key].to_numpy()).all()
        for key in ('sess_id', 'product')
    )
)
if not _rows_aligned:
    raise ValueError(
        'merged_candidates_freq and merged_candidates_feature are not row-aligned '
        'on (sess_id, product); refusing to overwrite product_freq'
    )
# NOTE: merged_candidates_feature is the object cached by the lru_cache reader,
# so this in-place update is also what later reader calls will return.
merged_candidates_feature['product_freq'] = merged_candidates_freq['product_freq']

In [20]:
# merged_candidates_freq = merged_candidates_freq_g.to_pandas()
# Store product_freq as float32 before persisting.
cast_dtype(merged_candidates_feature, ['product_freq'])
# The destination is the same file the table was loaded from. Write to a
# sibling temporary file first and swap it in with os.replace, so an
# interrupted write cannot leave a truncated parquet in place of the input.
# (This needs room for a second copy of the file while writing.)
_tmp_feature_path = merged_candidates_feature_test_path + '.tmp'
merged_candidates_feature.to_parquet(_tmp_feature_path, engine='pyarrow')
os.replace(_tmp_feature_path, merged_candidates_feature_test_path)

In [23]:
# del item_freq_df_g
# del merged_candidates_feature_g
# del merged_candidates_freq_g

In [22]:
# Check the product_freq column of the feature table after the update and cast.
merged_candidates_feature['product_freq']

0             11.0
1            125.0
2           1103.0
3             25.0
4             52.0
             ...  
96556030       4.0
96556031       5.0
96556032      45.0
96556033      23.0
96556034      24.0
Name: product_freq, Length: 96556035, dtype: float32

In [24]:
# NOTE(review): the output recorded below lists all feature columns, while the
# frame built above has only sess_id, sess_locale, product and product_freq;
# the output looks stale. Re-run to refresh.
merged_candidates_freq

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,normalized_sasrec_scores_2,sasrec_scores_3,normalized_sasrec_scores_3,sess_avg_price,product_price,seqmlp_scores,...,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,cos_text_bert_scores,text_bert_scores,normalized_text_bert_scores,roberta_scores,normalized_roberta_scores,product_freq
0,0,DE,B000Q87D0Q,0.000000,3.282997e-10,0.000000,6.689660e-10,67.527199,36.761604,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,11.0
1,0,DE,B000QB30DW,0.501346,5.420036e-10,-0.588501,3.713825e-10,67.527199,9.990000,7.260942,...,0.600000,0.000826,0.0,0.0,0.924509,382.443390,1.724279e-04,278.861938,1.579214e-06,123.0
2,0,DE,B004BIG55Q,6.917523,3.315223e-07,5.737720,2.076175e-07,67.527199,8.990000,2.454817,...,0.311111,0.000428,0.0,0.0,0.906834,376.119781,3.092420e-07,280.436859,7.628168e-06,1095.0
3,0,DE,B0053FTNQY,-0.100895,2.967921e-10,1.507319,3.020121e-09,67.527199,36.761604,3.837643,...,0.090909,0.000125,0.0,0.0,0.885923,366.794250,2.755989e-11,279.552673,3.150818e-06,25.0
4,0,DE,B007QWII1S,3.768980,1.422714e-08,4.594047,6.615662e-08,67.527199,54.950001,4.923371,...,0.000000,0.000000,0.0,0.0,0.904845,377.558044,1.302938e-06,286.498260,3.272302e-03,51.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96556030,316971,UK,B0B82N3CQQ,-1.076433,6.007382e-08,-0.457645,1.105378e-07,19.459999,13.990000,6.433315,...,0.000000,0.000000,0.0,0.0,0.979710,421.320526,6.821542e-04,286.819031,3.196098e-03,4.0
96556031,316971,UK,B0BB9NW3F3,0.000000,1.762683e-07,0.000000,1.746882e-07,19.459999,22.097065,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,5.0
96556032,316971,UK,B0BDMVKTQ3,-1.079334,5.989980e-08,-1.901198,2.609658e-08,19.459999,41.990002,-1.094359,...,0.000000,0.000000,0.0,0.0,0.952480,410.857330,1.948851e-08,272.765411,2.518899e-09,44.0
96556033,316971,UK,B0BHW1D5VP,6.722834,1.465088e-04,6.111193,7.876277e-05,19.459999,26.990000,8.700006,...,0.000000,0.000000,0.0,0.0,0.973597,418.673431,4.833641e-05,285.864410,1.230364e-03,23.0


In [None]:
# NOTE(review): this cell has no execution count and its recorded frequencies
# differ from the values shown earlier in the notebook; presumably it comes
# from a different counting run. Confirm before relying on it.
merged_candidates_freq

Unnamed: 0,sess_id,sess_locale,product,product_freq
0,0,DE,B000Q87D0Q,11.0
1,0,DE,B000QB30DW,114.0
2,0,DE,B004BIG55Q,1015.0
3,0,DE,B0053FTNQY,25.0
4,0,DE,B007QWII1S,44.0
...,...,...,...,...
96556030,316971,UK,B0B82N3CQQ,3.0
96556031,316971,UK,B0BB9NW3F3,5.0
96556032,316971,UK,B0BDMVKTQ3,41.0
96556033,316971,UK,B0BHW1D5VP,14.0
