In [None]:
import os
import random
import gzip
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import html
import re
from sklearn.utils import shuffle
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)


import warnings
warnings.filterwarnings("ignore")

In [None]:
def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and CUDA), switches cuDNN to deterministic mode with benchmarking
    disabled, and stores the seed in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then the current and all CUDA devices.
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Prefer reproducible cuDNN kernels over auto-tuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


setup_seed(2025)

In [None]:
# Root directory against which the raw_data/, vocab/ and dataset/ paths used
# in this notebook are resolved.
base_path = ".."

In [None]:
# Load the item metadata table and preview it.
item_features = pd.read_pickle(f"{base_path}/raw_data/item_info.pkl")
item_features.head()
item_features.shape

# Per-item lookup: item_id -> dict of that row's columns. drop=False keeps
# 'item_id' as a regular field inside each dict as well.
item_vocab = item_features.set_index('item_id',drop=False).to_dict('index')
# Spot-check one entry (assumes an item with id 1 exists).
item_vocab[1]

# Number of distinct item ids next to the largest id.
item_features['item_id'].nunique()
item_features['item_id'].max()

<!-- ### Build BM25 index -->

In [None]:
# Preview the item table again before the text column is added.
item_features.head()
item_features.shape

In [None]:
def get_item_text(row):
    """Concatenate a row's title and description, separated by one space.

    Args:
        row: Mapping-like object exposing string 'title' and 'description'.

    Returns:
        The string "<title> <description>".
    """
    title, description = row['title'], row['description']
    return ' '.join((title, description))
    

# Build the 'text' column row by row; pandarallel's parallel_apply spreads the
# rows over worker processes.
item_features['text'] = item_features.parallel_apply(get_item_text, axis=1)

In [None]:
def count_null_text(row):
    """Tell whether a row's 'text' is empty once surrounding whitespace is removed.

    Args:
        row: Mapping-like object exposing a string 'text'.

    Returns:
        True if the stripped text has length zero, otherwise False.
    """
    stripped = row['text'].strip()
    return not stripped

# Flag the rows whose text is blank, show the flags, then count them
# (each True contributes 1 to the sum).
item_df_null_text = item_features.apply(count_null_text,axis=1)
item_df_null_text
item_df_null_text.sum()

In [None]:
# Two-column view of the items: 'item_id' becomes 'id', 'text' becomes
# 'contents', and both columns are cast to str.
item_index = (
    item_features[['item_id', 'text']]
    .rename(columns={'item_id': 'id', 'text': 'contents'})
    .astype({'id': 'str', 'contents': 'str'})
)
item_index.head()
item_index.shape

In [None]:
# Spot-check the vocabulary entry of one specific item (id 14885).
item_vocab[14885]

In [None]:
# Load the query vocabulary. The file is opened in a `with` block so the
# handle is closed as soon as loading finishes, and the path is built from
# `base_path` like the other data paths instead of hardcoding '..'.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f"{base_path}/vocab/query_vocab.pkl", 'rb') as query_vocab_file:
    query_vocab = pickle.load(query_vocab_file)
len(query_vocab)
# Spot-check one entry (assumes key 1 exists and carries a 'query' field).
query_vocab[1]['query']

## Load data

In [None]:
# Load the user profile table and preview it.
user_features = pd.read_pickle(f"{base_path}/raw_data/user_profile.pkl")
user_features.head()

# Per-user lookup: user_id -> dict of that row's columns. drop=False keeps
# 'user_id' as a regular field inside each dict as well.
user_vocab = user_features.set_index('user_id',drop=False).to_dict('index')
# Spot-check one entry (assumes a user with id 0 exists).
user_vocab[0]

# Number of distinct users in the profile table.
user_features['user_id'].nunique()

In [None]:
# Load the rec_inter interaction log (presumably recommendation interactions;
# later cells read its 'user_id', 'item_id', 'ts' and 'score' columns).
rec_inter = pd.read_pickle(f"{base_path}/raw_data/rec_inter.pkl")
rec_inter.head()
rec_inter.shape

In [None]:
# Load the src_inter interaction log (presumably search interactions; later
# cells read its 'user_id', 'item_id', 'ts' and 'search_session_id' columns).
src_inter = pd.read_pickle(f"{base_path}/raw_data/src_inter.pkl")
src_inter.head()
src_inter.shape

In [None]:
# Load the per-session table (later cells read its 'search_session_id',
# 'pos_items' and 'keyword' fields).
session_info = pd.read_pickle(f"{base_path}/raw_data/session_info.pkl")
session_info.head()
session_info.shape

In [None]:
# Per-session lookup: search_session_id -> dict of that row's columns
# (drop=False keeps 'search_session_id' inside each dict too).
session_vocab = session_info.set_index('search_session_id',drop=False).to_dict('index')

In [None]:
# Distinct items seen in each interaction log.
rec_item_set, src_item_set = (
    set(inter_log['item_id'].unique()) for inter_log in (rec_inter, src_inter)
)

# Sizes of each set, of their union and of their intersection.
len(rec_item_set)
len(src_item_set)
len(rec_item_set.union(src_item_set))
len(rec_item_set.intersection(src_item_set))

In [None]:
# Distinct users seen in each interaction log.
rec_user_set, src_user_set = (
    set(inter_log['user_id'].unique()) for inter_log in (rec_inter, src_inter)
)

# Sizes of each set, of their union and of their intersection.
len(rec_user_set)
len(src_user_set)
len(rec_user_set.union(src_user_set))
len(rec_user_set.intersection(src_user_set))

In [None]:
# rec_inter rows become behavior 1; they carry no search session, so the
# session id is filled with the placeholder string 'nan'.
sub_rec_inter = rec_inter[['user_id', 'item_id', 'ts', 'score']].assign(
    search_session_id='nan',
    behavior=1,
)

# src_inter rows become behavior 2; the item id is the placeholder string
# 'nan' and the score is the constant 10000.
sub_session_src_inter = src_inter[['user_id', 'ts', 'search_session_id']].assign(
    item_id='nan',
    behavior=2,
    score=10000,
)

# Stack both logs and order the merged log by user, then timestamp, then score.
sar_inter = (
    pd.concat([sub_rec_inter, sub_session_src_inter], axis=0)
    .sort_values(by=['user_id', 'ts', 'score'])
    .reset_index(drop=True)
)
sar_inter.head()

In [None]:
# Build per-user interaction histories by replaying the merged log in order.
#
# For every event we first record how long each of the user's histories was
# *before* the event, and only then append the event to the histories. The
# recorded lengths let later cells recover "history so far" by slicing the
# final lists.

_history_fields = ('rec_his', 'rec_his_ts',
                   'src_session_his', 'src_session_his_ts',
                   'src_his', 'src_his_ts', 'src_his_query',
                   'all_his', 'all_his_ts', 'all_his_query')

# Initialise the lists for every user that appears in the merged log. Looping
# over rec_inter users only would leave users that occur solely in src_inter
# without these keys and make the replay loop fail with a KeyError.
for key in sar_inter['user_id'].unique():
    for field in _history_fields:
        user_vocab[key][field] = []

new_sar_inter_list = []
# total= gives tqdm a known length so it can render a real progress bar.
for _, line in tqdm(sar_inter.iterrows(), total=len(sar_inter)):
    user_id = line['user_id']
    item_id = line['item_id']
    timestamp = line['ts']
    search_session_id = line['search_session_id']
    behavior = line['behavior']

    user_his = user_vocab[user_id]

    # History lengths as they stand before this event is applied.
    cur_rec_his_len = len(user_his['rec_his'])
    cur_src_session_his_len = len(user_his['src_session_his'])
    cur_src_his_len = len(user_his['src_his'])
    cur_all_his_len = len(user_his['all_his'])

    if behavior == 2:
        cur_session = session_vocab[search_session_id]
        cur_session_pos = cur_session['pos_items']
        # If the session's first positive item is also the most recent rec
        # history entry, leave that entry out of the recorded rec-history
        # length (presumably so the same click is not both history and
        # target — TODO confirm). Both sequences are checked for emptiness
        # first: a search event may precede any rec event, and a session may
        # have no positive items.
        if (len(cur_session_pos) > 0
                and user_his['rec_his']
                and cur_session_pos[0] == user_his['rec_his'][-1]):
            cur_rec_his_len -= 1

    new_sar_inter_list.append((user_id, item_id, timestamp, search_session_id, behavior,
                               cur_rec_his_len, cur_src_session_his_len,
                               cur_src_his_len, cur_all_his_len))

    if behavior == 1:
        user_his['rec_his'].append(item_id)
        user_his['rec_his_ts'].append(timestamp)
        user_his['all_his'].append(item_id)
        user_his['all_his_ts'].append(timestamp)
        # Rec events have no query; 0 is stored in the query slot.
        user_his['all_his_query'].append(0)
    elif behavior == 2:
        user_his['src_session_his'].append(search_session_id)
        user_his['src_session_his_ts'].append(timestamp)

        cur_query = cur_session['keyword']
        num_pos = len(cur_session_pos)

        # Every positive item of the session shares the session's timestamp
        # and query.
        user_his['src_his'].extend(cur_session_pos)
        user_his['src_his_ts'].extend([timestamp] * num_pos)
        user_his['src_his_query'].extend([cur_query] * num_pos)

        user_his['all_his'].extend(cur_session_pos)
        user_his['all_his_ts'].extend([timestamp] * num_pos)
        user_his['all_his_query'].extend([cur_query] * num_pos)



In [None]:
# Column layout of the replayed log. The last four columns hold history
# *lengths* recorded before each event, not the histories themselves.
sar_event_columns = ['user_id', 'item_id', 'ts', 'search_session_id', 'behavior']
sar_length_columns = ['rec_his', 'src_session_his', 'src_his', 'all_his']

new_sar_inter_df = pd.DataFrame(
    new_sar_inter_list,
    columns=sar_event_columns + sar_length_columns,
)
new_sar_inter_df.head()
new_sar_inter_df.shape

In [None]:
# Persist the three lookup tables. Each file is opened in a `with` block so
# it is flushed and closed as soon as the dump finishes, instead of leaving
# the handle open until garbage collection.
with open(f"{base_path}/vocab/item_vocab.pkl", 'wb') as vocab_file:
    pickle.dump(item_vocab, vocab_file)

with open(f"{base_path}/vocab/user_vocab.pkl", 'wb') as vocab_file:
    pickle.dump(user_vocab, vocab_file)

with open(f"{base_path}/vocab/src_session_vocab.pkl", 'wb') as vocab_file:
    pickle.dump(session_vocab, vocab_file)

In [None]:
def splitTrainTest(user_df):
    """Flag the last two rows of one user's frame as test and validation.

    Sets 'train' to 3 on the last row and to 2 on the second-to-last row;
    all other rows keep their current value. The frame is modified in place
    and also returned.

    Args:
        user_df: DataFrame with a 'train' column, rows in the order in which
            the split should be taken.

    Returns:
        The same ``user_df`` object with the flags applied.
    """
    # Write through a single .iloc call on the frame itself. The chained form
    # ``user_df['train'].iloc[-1] = 3`` assigns to a temporary Series and is
    # a silent no-op under pandas Copy-on-Write.
    train_col = user_df.columns.get_loc('train')
    n_rows = len(user_df)
    if n_rows >= 1:
        user_df.iloc[-1, train_col] = 3
    if n_rows >= 2:
        user_df.iloc[-2, train_col] = 2
    return user_df

## Rec Data

In [None]:
# Sort the rec log with the same keys that were used to order the merged log
# (`user_id`, `ts`, `score`). The history-length columns are copied over by
# row position in a later cell, so both frames must list rows that share a
# (user_id, ts) pair in the same order; sorting by (user_id, ts) alone can
# order such ties differently from the merged log.
rec_w_his_inter = rec_inter.copy()
rec_w_his_inter = rec_w_his_inter.sort_values(by=['user_id','ts','score']).reset_index(drop=True)
rec_w_his_inter.shape
rec_w_his_inter.head(1)

# Rec events (behavior == 1) of the replayed log; re-sorting by
# (user_id, ts) keeps the tie order established when the merged log was built.
rec_new_sar_inter_df = new_sar_inter_df[new_sar_inter_df.behavior==1]
rec_new_sar_inter_df = rec_new_sar_inter_df.sort_values(by=['user_id','ts']).reset_index(drop=True)
rec_new_sar_inter_df.shape
rec_new_sar_inter_df.head(1)

In [None]:
# Copy the four history-length columns over from the replayed log. The
# assignment aligns on the index, which both frames had reset beforehand.
history_length_cols = ['rec_his', 'src_session_his', 'src_his', 'all_his']
rec_w_his_inter[history_length_cols] = rec_new_sar_inter_df[history_length_cols]
rec_w_his_inter.head()

In [None]:
# Keep only rows that have both a non-zero rec-history length and a non-zero
# search-session-history length.
has_rec_history = rec_w_his_inter['rec_his'].ne(0)
has_src_history = rec_w_his_inter['src_session_his'].ne(0)
rec_w_his_inter = rec_w_his_inter[has_rec_history & has_src_history]
rec_w_his_inter.shape

In [None]:
# Per-user non-null counts of every column, with user_id moved back to a column.
per_user_counts = rec_w_his_inter.groupby(by=['user_id']).count()
rec_inter_num = per_user_counts.reset_index()

# Users with at least 3 remaining rows (counted through 'item_id').
has_enough_rows = rec_inter_num['item_id'] >= 3
filtered_users_rec = rec_inter_num[has_enough_rows]
filtered_users_rec.head(3), filtered_users_rec['item_id'].describe()

In [None]:
# Restrict the rec log to the users that passed the row-count filter and
# renumber the rows.
kept_user_ids = set(filtered_users_rec['user_id'].unique())
is_kept_user = rec_w_his_inter['user_id'].isin(kept_user_ids)
rec_w_his_inter = rec_w_his_inter[is_kept_user].reset_index(drop=True)
rec_w_his_inter.head()
rec_w_his_inter.shape

In [None]:
# Flag every row as train (1), then mark each user's second-to-last row as
# validation (2) and last row as test (3).
#
# This is done with a vectorised per-user countdown instead of
# groupby(...).apply(...) with a mutating function: the result keeps the
# frame's own index and all of its columns regardless of how the installed
# pandas version treats group keys / grouping columns in apply, and it avoids
# a Python-level call per user.
rec_w_his_inter['train'] = 1

# Position of each row counted from the end of its user's rows (0 = last row).
rows_from_end = rec_w_his_inter.groupby('user_id').cumcount(ascending=False)

rec_w_his_inter_train = rec_w_his_inter.copy()
rec_w_his_inter_train.loc[rows_from_end == 1, 'train'] = 2
rec_w_his_inter_train.loc[rows_from_end == 0, 'train'] = 3

In [None]:
# Carve the flagged frame into its three splits (1 = train, 2 = val,
# 3 = test); the helper 'train' column is removed from each split.
split_flag = rec_w_his_inter_train['train']

rec_train = rec_w_his_inter_train[split_flag == 1].reset_index(drop=True).drop(columns=['train'])
rec_train.shape

rec_val = rec_w_his_inter_train[split_flag == 2].reset_index(drop=True).drop(columns=['train'])
rec_val.shape

rec_test = rec_w_his_inter_train[split_flag == 3].reset_index(drop=True).drop(columns=['train'])
rec_test.shape

- Sample negatives for train, val and test

In [None]:
# Number of negatives per positive row, per the names: one value for the
# training split and one for evaluation.
num_train_neg_samples = 4
num_test_neg_samples = 99

# Candidate pool for negative sampling. Despite the name this is a list with
# one entry per remaining interaction row, so an item occurs once per
# interaction and uniform draws from the list favour items with more rows.
item_set = rec_w_his_inter['item_id'].to_list()

def SampleNegatives(row, cur_num_samples):
    """Draw distinct negative items for one interaction row.

    Candidates are drawn with ``random.choice`` from the module-level
    ``item_set`` and rejected when they are the row's positive item, already
    part of the user's history up to this row, or already drawn.

    Args:
        row: Mapping-like row with 'user_id', 'item_id' and 'all_his', where
            'all_his' is the number of leading entries of the user's
            ``all_his`` list that count as history for this row.
        cur_num_samples: Number of negatives to return.

    Returns:
        List of ``cur_num_samples`` distinct item ids, in draw order.

    Note:
        The loop only ends once enough acceptable candidates were drawn; it
        does not terminate if ``item_set`` holds fewer acceptable distinct
        items than requested.
    """
    user_id = int(row['user_id'])
    cur_pos = int(row['item_id'])

    # Everything a draw must not return, kept in a set so each membership
    # test is O(1) instead of a scan over the history list. The draw sequence
    # and the accept/reject decisions are the same as with list scans.
    excluded = set(user_vocab[user_id]['all_his'][:int(row['all_his'])])
    excluded.add(cur_pos)

    neg_samples = []
    while len(neg_samples) < cur_num_samples:
        cur_neg = random.choice(item_set)
        if cur_neg in excluded:
            continue
        # Remember accepted negatives so they are not drawn twice.
        excluded.add(cur_neg)
        neg_samples.append(cur_neg)
    return neg_samples

In [None]:
# Sample negatives for every training row. The count comes from the
# `num_train_neg_samples` constant rather than a repeated literal, so the
# configuration has a single source of truth.
rec_train['neg_items'] = rec_train.parallel_apply(SampleNegatives,cur_num_samples=num_train_neg_samples,axis=1)
rec_train.head()

In [None]:
# Sample negatives for every validation row; the count comes from the
# `num_test_neg_samples` constant rather than a repeated literal.
rec_val['neg_items'] = rec_val.parallel_apply(SampleNegatives,cur_num_samples=num_test_neg_samples,axis=1)

In [None]:
# Sample negatives for every test row; the count comes from the
# `num_test_neg_samples` constant rather than a repeated literal.
rec_test['neg_items'] = rec_test.parallel_apply(SampleNegatives,cur_num_samples=num_test_neg_samples,axis=1)

In [None]:
# Write the three splits, negatives included, to the dataset directory.
dataset_dir = f"{base_path}/dataset"

rec_train.to_pickle(f"{dataset_dir}/rec_train.pkl")
rec_val.to_pickle(f"{dataset_dir}/rec_val.pkl")
rec_test.to_pickle(f"{dataset_dir}/rec_test.pkl")