In [None]:
import os
import random
import gzip
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import html
import re
from sklearn.utils import shuffle
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds PyTorch (CPU, the current CUDA device and all CUDA devices), NumPy's
    global generator and Python's ``random`` module, puts cuDNN into
    deterministic mode with benchmarking disabled, and stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seed_torch in (torch.manual_seed,
                       torch.cuda.manual_seed,
                       torch.cuda.manual_seed_all):
        seed_torch(seed)

    np.random.seed(seed)
    random.seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # presumably only influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)


setup_seed(2025)

## Item Features

In [None]:
def clean_text(raw_text):
    """Normalize a metadata text field into plain text ending in one period.

    Each text fragment has its HTML entities decoded, HTML/XML tags removed,
    double quotes and line breaks removed, and surrounding whitespace trimmed.

    Args:
        raw_text: A string, a list of strings (fragments are cleaned
            individually and joined with single spaces), or a dict (its
            ``str()`` form without the outer braces is cleaned).

    Returns:
        The cleaned text with any run of trailing periods collapsed into
        exactly one period. Input that cleans down to nothing yields ``"."``.
        If the result is 2000 characters or longer, ``""`` is returned instead.
    """
    def _strip_markup(fragment):
        # Decode entities first so escaped tags (e.g. "&lt;b&gt;") are removed too.
        fragment = html.unescape(fragment)
        fragment = re.sub(r'</?\w+[^>]*>', '', fragment)
        fragment = re.sub(r'["\n\r]*', '', fragment)
        # Trim last: removing tags or line breaks can expose new edge whitespace.
        return fragment.strip()

    if isinstance(raw_text, list):
        # Final strip covers empty leading/trailing fragments after the join.
        cleaned_text = ' '.join(_strip_markup(raw) for raw in raw_text).strip()
    elif isinstance(raw_text, dict):
        cleaned_text = _strip_markup(str(raw_text)[1:-1])
    else:
        cleaned_text = _strip_markup(raw_text)

    # Collapse any run of trailing periods (including an all-period string)
    # into exactly one.
    cleaned_text = cleaned_text.rstrip('.') + '.'

    # Overly long texts are discarded entirely rather than truncated.
    if len(cleaned_text) >= 2000:
        cleaned_text = ''
    return cleaned_text

In [None]:
# Parse the product metadata dump into one flat feature record per line.
item_feats = []
with gzip.open('../orig_data/meta_Electronics.json.gz', "r") as fp:
    for idx, line in tqdm(enumerate(fp), desc="Load metas"):
        # NOTE(review): eval() executes whatever the file contains, so only run
        # this on a trusted copy of the dump. If every line is a plain literal,
        # ast.literal_eval on the decoded line would be a safer parser.
        data = eval(line) # 2014
        item = data["asin"]

        # Missing fields fall back to an empty string.
        title = clean_text(data["title"]) if 'title' in data else ''
        descriptions = clean_text(data["description"]) if 'description' in data else ''
        brand = data["brand"].replace("by\n", "").strip() if 'brand' in data else ''

        # 'category' takes precedence; 'categories' is a list of lists that is
        # flattened into one list.
        if 'category' in data:
            raw_categories = data['category']
        elif 'categories' in data:
            raw_categories = sum(data['categories'], [])
        else:
            raw_categories = []

        # Keep entries up to (not including) the first one containing "</span>".
        kept_categories = []
        for category in raw_categories:
            if "</span>" in category:
                break
            kept_categories.append(category.strip())
        categories = ",".join(kept_categories).strip()

        item_feats.append(dict(
            asin=item,
            title=title,
            description=descriptions,
            brand=brand,
            categories=categories,
        ))
item_df = pd.DataFrame(item_feats)
item_df.head()
item_df.shape

In [None]:
# Keep the first metadata record seen for each ASIN and renumber the rows.
item_df = item_df.drop_duplicates(subset=['asin']).reset_index(drop=True)
item_df.head()
item_df.shape

In [None]:
def get_item_text(row):
    """Return the row's title and description joined by a single space."""
    title, description = row['title'], row['description']
    return ' '.join((title, description))
    
# Add a 'text' column: each row's title followed by its description.
item_df['text'] = item_df.apply(get_item_text, axis=1)

In [None]:
def count_null_text(row):
    """Return True when the row's 'text' is empty or whitespace-only."""
    return not row['text'].strip()

# Boolean mask over item_df rows: True where the combined text is blank.
item_df_null_text = item_df.apply(count_null_text,axis=1)

In [None]:
# Preview: shape of the table once blank-text items are excluded.
item_df[~item_df_null_text].shape

In [None]:
# Drop blank-text items and renumber the remaining rows.
has_text = ~item_df_null_text
item_df = item_df.loc[has_text].reset_index(drop=True)
item_df.shape

## Recommendation Interactions (Rec Inter)

In [None]:
# Read one (user, item, rating, timestamp) tuple per review line.
review_rows = []
with gzip.open('../orig_data/reviews_Electronics_5.json.gz') as f:
    for l in tqdm(f, desc="Load rec inter"):
        review = json.loads(l.strip())
        review_rows.append((
            review['reviewerID'],
            review['asin'],
            review['overall'],
            review['unixReviewTime'],
        ))

# Order each user's interactions chronologically.
rec_inter = pd.DataFrame(review_rows, columns=['user_id','item_id','score','ts'])
rec_inter = rec_inter.sort_values(by=['user_id','ts']).reset_index(drop=True)
rec_inter.head()
rec_inter.shape

In [None]:
# Keep only interactions whose item survived the metadata cleaning.
known_asins = item_df['asin'].tolist()
in_catalog = rec_inter['item_id'].isin(known_asins)
filter_rec_inter = rec_inter[in_catalog].reset_index(drop=True)
filter_rec_inter.head()
filter_rec_inter.shape

## Search Interactions (Src Inter)

In [None]:
# Each qrels line is "<user>_<query> <ignored> <item> <label>", space-separated.
qrel_rows = []
with gzip.open('../orig_data/train.qrels.gz') as f:
    for l in tqdm(f):
        u_q_id, _, i_id, label = l.strip().decode().split(' ')
        u_id, q_id = u_q_id.split("_")
        # The query id becomes an int; the label stays a string.
        qrel_rows.append((u_id, int(q_id), i_id, label))

train_src = pd.DataFrame(qrel_rows, columns=['user_id','query_id','item_id','label'])
train_src.head()
train_src.shape

In [None]:
# Each qrels line is "<user>_<query> <ignored> <item> <label>", space-separated.
qrel_rows = []
with gzip.open('../orig_data/test.qrels.gz') as f:
    for l in tqdm(f):
        u_q_id, _, i_id, label = l.strip().decode().split(' ')
        u_id, q_id = u_q_id.split("_")
        # The query id becomes an int; the label stays a string.
        qrel_rows.append((u_id, int(q_id), i_id, label))

test_src = pd.DataFrame(qrel_rows, columns=['user_id','query_id','item_id','label'])
test_src.head()
test_src.shape

In [None]:
# Pool the train and test search judgements into a single table.
all_src_inter = pd.concat([train_src,test_src],axis=0).reset_index(drop=True)
all_src_inter.shape
all_src_inter.tail()

# Shuffle before de-duplicating so the query kept for each (user, item) pair is
# a random one rather than the first in file order. The explicit random_state
# makes this cell give the same result when re-run or run out of order; without
# it shuffle() draws from the global NumPy generator, whose state depends on
# everything executed before. 2025 matches the setup_seed(2025) call, so a
# fresh Restart & Run All should produce the same permutation as before.
df = shuffle(all_src_inter, random_state=2025)

new_src_Data = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
new_src_Data.head()
new_src_Data.shape

In [None]:
# Attach the search judgement to each review of the same (user, item) pair;
# reviews without a judgement are dropped by the inner join.
merge_src_inter = pd.merge(rec_inter, new_src_Data, how='inner', on=['user_id','item_id'])
merge_src_inter.shape
merge_src_inter.head()

In [None]:
# Keep only search interactions whose item survived the metadata cleaning.
known_asins = item_df['asin'].tolist()
in_catalog = merge_src_inter['item_id'].isin(known_asins)
filter_merge_src_inter = merge_src_inter[in_catalog].reset_index(drop=True)
filter_merge_src_inter.shape
filter_merge_src_inter.head()

## Get Item Set

In [None]:
# Items seen in the review data and in the merged search data.
rec_item_set = set(rec_inter['item_id'].unique())
src_item_set = set(merge_src_inter['item_id'].unique())
len(rec_item_set)
len(src_item_set)
len(rec_item_set.intersection(src_item_set))
len(rec_item_set.union(src_item_set))

# Despite the name this is a list: every item appearing in either source.
all_item_set = list(rec_item_set.union(src_item_set))
len(all_item_set)

In [None]:
# Restrict the metadata table to items that occur in the interaction data.
is_used_item = item_df['asin'].isin(all_item_set)
item_df = item_df.loc[is_used_item].reset_index(drop=True)
item_df.head()
item_df.shape

In [None]:
# Derive integer item ids from the categorical ordering of the ASINs.
item_df = item_df.astype({"asin":'category'})
id2item = item_df['asin'].cat.categories.to_list()
# Ids start at 1; 0 is left free for the padding item.
item2id = {asin: position for position, asin in enumerate(id2item, start=1)}
item_df['item_id'] = item_df['asin'].map(item2id)

In [None]:
# Padding item with id 0 and empty strings in every text field. 'text' is
# included because item_df carries that column; without it concat would fill
# the pad row's 'text' with NaN instead of a string.
pad_item_df = pd.DataFrame({"item_id":[0], "asin":["<pad>"], "title":[""], "description":[""], "brand":[""], "categories":[""], "text":[""]})
pad_item_df

# Put the pad row first, force integer ids (the concatenated column is not
# guaranteed to be an integer dtype), then sort by item_id and renumber rows.
all_item_df = pd.concat([pad_item_df, item_df],axis=0)
all_item_df['item_id'] = all_item_df['item_id'].astype('int')
all_item_df = all_item_df.sort_values(by=['item_id'])
all_item_df = all_item_df.reset_index(drop=True)

all_item_df['item_id'].nunique()
all_item_df.head()
all_item_df.shape

all_item_df.to_pickle('../raw_data/item_info.pkl')

## Get User Set

In [None]:
# Users present in the filtered review data and in the filtered search data.
rec_user_set = set(filter_rec_inter['user_id'].unique())
src_user_set = set(filter_merge_src_inter['user_id'].unique())
len(rec_user_set)
len(src_user_set)
len(rec_user_set.intersection(src_user_set))
len(rec_user_set.union(src_user_set))

# Despite the name this is a list: only users that appear in BOTH sources.
all_user_set = list(rec_user_set.intersection(src_user_set))
len(all_user_set)

In [None]:
# all_user_set is built from a Python set of strings, whose iteration order can
# change between kernel sessions. Sorting makes the row order reproducible and,
# because the categories below are sorted as well, should place user_id i on
# row i, mirroring the item table, which is sorted by item_id.
user_info = pd.DataFrame(data=sorted(all_user_set), columns=['user'])

user_info = user_info.astype({"user":'category'})
id2user = user_info['user'].cat.categories.to_list()
# User ids start at 0 (no padding slot is reserved, unlike item ids).
user2id = {id2user[k]:k for k in range(len(id2user))}

# Cast to plain integers so the id column does not keep the categorical dtype
# of the 'user' column it was mapped from, as is already done for item ids.
user_info['user_id'] = user_info['user'].map(user2id).astype('int')
user_info.head()
user_info.shape

In [None]:
# Restrict both interaction tables to the users kept in all_user_set.
rec_user_kept = filter_rec_inter['user_id'].isin(all_user_set)
filter_rec_inter = filter_rec_inter.loc[rec_user_kept].reset_index(drop=True)
filter_rec_inter.shape

src_user_kept = filter_merge_src_inter['user_id'].isin(all_user_set)
filter_merge_src_inter = filter_merge_src_inter.loc[src_user_kept].reset_index(drop=True)
filter_merge_src_inter.shape

## Map user/item to ID

In [None]:
# Replace raw user/item identifiers with their integer ids.
# Running this cell twice maps the already-mapped ids again, so run it once.
filter_rec_inter = filter_rec_inter.assign(
    user_id=filter_rec_inter['user_id'].map(user2id),
    item_id=filter_rec_inter['item_id'].map(item2id),
)
filter_rec_inter.head()
filter_rec_inter.shape

In [None]:
# Replace raw user/item identifiers with their integer ids.
# Running this cell twice maps the already-mapped ids again, so run it once.
filter_merge_src_inter = filter_merge_src_inter.assign(
    user_id=filter_merge_src_inter['user_id'].map(user2id),
    item_id=filter_merge_src_inter['item_id'].map(item2id),
)
filter_merge_src_inter.head()
filter_merge_src_inter.shape

## Build Search Sessions (Src Session)

In [None]:
# One query string per line; a query's position in the list is used as its
# id when rows are looked up by 'query_id' later on.
with gzip.open('../orig_data/query_text.txt.gz') as f:
    query_text = [raw_line.strip().decode() for raw_line in tqdm(f)]

In [None]:
# Word and query vocabularies; index 0 of each is the '<pad>' entry.
word2id = {'<pad>': 0}
id2word = ['<pad>']

query2id = {'<pad>': 0}
id2query = [{"query":"<pad>", "words":["<pad>"], "words_id":[0]}]

session_idx = 0

search_session_info_list = []
src_inter_list = []
# Every search interaction becomes its own single-click session, numbered from 1.
for _, line in tqdm(filter_merge_src_inter.iterrows()):
    session_idx += 1

    clicked_item = line['item_id']
    click_ts = line['ts']

    # Look up the query text by its position and tokenise on single spaces.
    query = query_text[line['query_id']]
    query_words = query.split(' ')

    # Give unseen words the next free id while encoding the query.
    query_words_id = []
    for word in query_words:
        if word not in word2id:
            word2id[word] = len(word2id)
            id2word.append(word)
        query_words_id.append(word2id[word])

    # Register the query text itself the first time it is seen.
    if query not in query2id:
        query2id[query] = len(query2id)
        id2query.append({"query":query,
                         "words":query_words,
                         "words_id":query_words_id})
    mapped_query_id = query2id[query]

    search_session_info_list.append({
        "search_session_id": session_idx,
        "pos_items": [clicked_item],
        "keyword": query_words_id,
        'query_id': mapped_query_id,
        'click_list': [1],
        'time_list': [click_ts]
    })

    src_inter_list.append({
        "user_id": line['user_id'],
        "item_id": clicked_item,
        "search_session_id": session_idx,
        "ts": click_ts,
        "keyword": query_words_id,
        'query_id': mapped_query_id
    })


search_session_info = pd.DataFrame(search_session_info_list)
search_session_info.head()
search_session_info.shape

src_inter = pd.DataFrame(src_inter_list)
src_inter.head()
src_inter.shape

In [None]:
# Sanity check: show the padding entry and the first real entry of each
# vocabulary (index 1 exists only if at least one query was processed).
id2query[0]
id2query[1]

id2word[0]
id2word[1]

## Save Data

In [None]:
# Save the filtered recommendation interactions (user/item ids already mapped).
filter_rec_inter.to_pickle("../raw_data/rec_inter.pkl")

In [None]:
# Save the per-interaction search records and the per-session table.
src_inter.to_pickle('../raw_data/src_inter.pkl')
search_session_info.to_pickle('../raw_data/session_info.pkl')

In [None]:
# Save the user table and the item table (including the padding item).
# NOTE(review): item_info.pkl is also written in the cell that builds
# all_item_df; this second write looks redundant — confirm before removing.
user_info.to_pickle('../raw_data/user_profile.pkl')
all_item_df.to_pickle('../raw_data/item_info.pkl')

In [None]:
# Save the query and word vocabularies. The context managers make sure each
# file is flushed and closed as soon as its dump finishes, instead of leaving
# an open handle for the garbage collector.
with open("../vocab/query_vocab.pkl", 'wb') as query_vocab_file:
    pickle.dump(id2query, query_vocab_file)
with open("../vocab/word_vocab.pkl", 'wb') as word_vocab_file:
    pickle.dump(id2word, word_vocab_file)