In [1]:
import os
import random
import gzip
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import html
import re
from sklearn.utils import shuffle
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    os.environ['PYTHONHASHSEED'] = str(seed)  # 为了禁止hash随机化，使得实验可复现

setup_seed(2024)

## Item Features

In [3]:
def clean_text(raw_text):
    if isinstance(raw_text, list):
        new_raw_text = []
        for raw in raw_text:
            raw = html.unescape(raw)
            raw = re.sub(r'</?\w+[^>]*>', '', raw)
            raw = re.sub(r'["\n\r]*', '', raw)
            new_raw_text.append(raw.strip())
        cleaned_text = ' '.join(new_raw_text)
    else:
        if isinstance(raw_text, dict):
            cleaned_text = str(raw_text)[1:-1].strip()
        else:
            cleaned_text = raw_text.strip()
        cleaned_text = html.unescape(cleaned_text)
        cleaned_text = re.sub(r'</?\w+[^>]*>', '', cleaned_text)
        cleaned_text = re.sub(r'["\n\r]*', '', cleaned_text)
    index = -1
    while -index < len(cleaned_text) and cleaned_text[index] == '.':
        index -= 1
    index += 1
    if index == 0:
        cleaned_text = cleaned_text + '.'
    else:
        cleaned_text = cleaned_text[:index] + '.'
    if len(cleaned_text) >= 2000:
        cleaned_text = ''
    return cleaned_text

In [4]:
item_feats = []
with gzip.open('../orig_data/meta_Electronics_2014.json.gz', "r") as fp:
    for idx, line in tqdm(enumerate(fp), desc="Load metas"):
        
        # data = json.loads(line)
        data = eval(line) # 2014
        item = data["asin"]

        if 'title' in data.keys():
            title = clean_text(data["title"])
        else:
            title = ''

        if 'description' in data.keys():
            descriptions = data["description"]
            descriptions = clean_text(descriptions)
        else:
            descriptions = ''

        if 'brand' in data.keys():
            brand = data["brand"].replace("by\n", "").strip()
        else:
            brand = ''

        if 'category' in data.keys():
            category_key = 'category'
        elif 'categories' in data.keys():
            category_key = 'categories'
        else:
            category_key = None

        if category_key:
            categories = data[category_key]
            if category_key == 'categories':
                categories = sum(categories,[])
            
            new_categories = []
            for category in categories:
                if "</span>" in category:
                    break
                new_categories.append(category.strip())
            categories = ",".join(new_categories).strip()
        else:
            categories = ''

        item_feats.append({
            "asin": item,
            "title": title,
            "description": descriptions,
            "brand": brand,
            "categories": categories
        })
item_df = pd.DataFrame(item_feats)
item_df.head()
item_df.shape

Load metas: 498196it [02:48, 2955.98it/s]


Unnamed: 0,asin,title,description,brand,categories
0,132793040,Kelby Training DVD: Mastering Blend Modes in A...,The Kelby Training DVD Mastering Blend Modes i...,,"Electronics,Computers & Accessories,Cables & A..."
1,321732944,Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,"Electronics,Computers & Accessories,Cables & A..."
2,439886341,Digital Organizer and Messenger.,Digital Organizer and Messenger.,,"Electronics,Computers & Accessories,PDAs, Hand..."
3,511189877,CLIKR-5 Time Warner Cable Remote Control UR5U-...,The CLIKR-5 UR5U-8780L remote control is desig...,,"Electronics,Accessories & Supplies,Audio & Vid..."
4,528881469,Rand McNally 528881469 7-inch Intelliroute TND...,,,"Electronics,GPS & Navigation,Vehicle GPS,Truck..."


(498196, 5)

In [5]:
item_df = item_df.drop_duplicates(subset=['asin'])
item_df = item_df.reset_index(drop=True)
item_df.head()
item_df.shape

Unnamed: 0,asin,title,description,brand,categories
0,132793040,Kelby Training DVD: Mastering Blend Modes in A...,The Kelby Training DVD Mastering Blend Modes i...,,"Electronics,Computers & Accessories,Cables & A..."
1,321732944,Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,"Electronics,Computers & Accessories,Cables & A..."
2,439886341,Digital Organizer and Messenger.,Digital Organizer and Messenger.,,"Electronics,Computers & Accessories,PDAs, Hand..."
3,511189877,CLIKR-5 Time Warner Cable Remote Control UR5U-...,The CLIKR-5 UR5U-8780L remote control is desig...,,"Electronics,Accessories & Supplies,Audio & Vid..."
4,528881469,Rand McNally 528881469 7-inch Intelliroute TND...,,,"Electronics,GPS & Navigation,Vehicle GPS,Truck..."


(498196, 5)

In [6]:
def get_item_text(row):
    return ' '.join([row['title'], row['description']])
    
item_df['text'] = item_df.apply(get_item_text, axis=1)

In [7]:
def count_null_text(row):
    row_text = row['text'].strip()
    return len(row_text)==0

item_df_null_text = item_df.apply(count_null_text,axis=1)

In [8]:
item_df[~item_df_null_text].shape

(497343, 6)

In [9]:
# item_df = item_df[~item_df_null_text].reset_index(drop=True)
# item_df.shape

## Rec Inter

In [10]:
userID, itemID, score, ts = [],[],[],[]
with gzip.open('../orig_data/reviews_Electronics_5.json.gz') as f:
    for l in tqdm(f, desc="Load rec inter"):
        line = json.loads(l.strip())
        userID.append(line['reviewerID'])
        itemID.append(line['asin'])
        score.append(line['overall'])
        ts.append(line['unixReviewTime'])
    
rec_inter = pd.DataFrame(
    data = list(zip(userID, itemID, score, ts)), columns=['user_id','item_id','score','ts']
).sort_values(by=['user_id','ts']).reset_index(drop=True)
rec_inter.head()
rec_inter.shape

Load rec inter: 1689188it [00:37, 44970.07it/s]


Unnamed: 0,user_id,item_id,score,ts
0,A000715434M800HLCENK9,B000UYYZ0M,1.0,1400457600
1,A000715434M800HLCENK9,B001EHAI6Y,5.0,1400457600
2,A000715434M800HLCENK9,B003AFONFU,3.0,1400457600
3,A000715434M800HLCENK9,B003ES5ZUU,2.0,1400457600
4,A000715434M800HLCENK9,B00HMZG3YS,5.0,1400457600


(1689188, 4)

In [11]:
filter_rec_inter = rec_inter[rec_inter['item_id'].isin(item_df['asin'].tolist())].reset_index(drop=True)
filter_rec_inter.head()
filter_rec_inter.shape

Unnamed: 0,user_id,item_id,score,ts
0,A000715434M800HLCENK9,B000UYYZ0M,1.0,1400457600
1,A000715434M800HLCENK9,B001EHAI6Y,5.0,1400457600
2,A000715434M800HLCENK9,B003AFONFU,3.0,1400457600
3,A000715434M800HLCENK9,B003ES5ZUU,2.0,1400457600
4,A000715434M800HLCENK9,B00HMZG3YS,5.0,1400457600


(1689188, 4)

## Src Inter

In [12]:
u_id_list, q_id_list, i_id_list, label_list = [],[],[],[]
with gzip.open('../orig_data/train.qrels.gz') as f:
    for l in tqdm(f):
        u_q_id, _, i_id, label = l.strip().decode().split(' ')
        u_id, q_id = u_q_id.split("_")
        u_id_list.append(u_id)
        q_id_list.append(int(q_id))
        i_id_list.append(i_id)
        label_list.append(label)

train_src = pd.DataFrame(
    data = list(zip(u_id_list, q_id_list, i_id_list, label_list)), columns=['user_id','query_id','item_id','label']
)
train_src.head()
train_src.shape

1344310it [00:05, 232776.54it/s]


Unnamed: 0,user_id,query_id,item_id,label
0,A26ZDKC53OP6JD,22,B000M51NRM,1
1,A26ZDKC53OP6JD,422,B007KFVQXI,1
2,A26ZDKC53OP6JD,338,B0033UPL3A,1
3,A26ZDKC53OP6JD,30,B001D0E4EE,1
4,A3PU0U2IOF9IKV,42,B000HPV3RW,1


(1344310, 4)

In [13]:
u_id_list, q_id_list, i_id_list, label_list = [],[],[],[]
with gzip.open('../orig_data/test.qrels.gz') as f:
    for l in tqdm(f):
        u_q_id, _, i_id, label = l.strip().decode().split(' ')
        u_id, q_id = u_q_id.split("_")
        u_id_list.append(u_id)
        q_id_list.append(int(q_id))
        i_id_list.append(i_id)
        label_list.append(label)

test_src = pd.DataFrame(
    data = list(zip(u_id_list, q_id_list, i_id_list, label_list)), columns=['user_id','query_id','item_id','label']
)
test_src.head()
test_src.shape

5541it [00:00, 274151.42it/s]


Unnamed: 0,user_id,query_id,item_id,label
0,A35D452UPESD7P,259,B000S5Q9CA,1
1,A2IPVT2ZRN0VX1,640,B004R9OVEG,1
2,A3VEGTSHTWDZNI,630,B0079UAT0A,1
3,A1X71ZRAH72HEH,155,B005DOK8NW,1
4,A2OZ6PW2W5FQTP,520,B0094CX5M0,1


(5541, 4)

In [14]:
all_src_inter = pd.concat([train_src,test_src],axis=0).reset_index(drop=True)
all_src_inter.shape
all_src_inter.tail()

df = shuffle(all_src_inter)

new_src_Data = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
new_src_Data.head()
new_src_Data.shape

(1349851, 4)

Unnamed: 0,user_id,query_id,item_id,label
1349846,A15CCQLICLK4KF,585,B006GWO5WK,1
1349847,A35UO4FBTUV47Y,908,B004XZHY34,1
1349848,AHXSLU9C78I2G,585,B006GWO5WK,1
1349849,A3C20ENKWB1NZ3,155,B005DOK8NW,1
1349850,A3C20ENKWB1NZ3,906,B0026RHUQC,1


Unnamed: 0,user_id,query_id,item_id,label
1344339,APZ050ESISCN6,986,B000B9RI14,1
991784,A1TR1R2QKWRSRA,26,B005A0B3FQ,1
131977,A1EV3O3KX1N39L,21,B00CGVF8E4,1
214006,A2OIF6ZL17EQSM,59,B001QY1KSK,1
1296126,A1RIZP61N768LW,158,B003L1ZYZ6,1


(1280465, 4)

In [15]:
merge_src_inter = rec_inter.merge(new_src_Data,how='inner',on=['user_id','item_id'])
merge_src_inter.shape
merge_src_inter.head()

(1280465, 6)

Unnamed: 0,user_id,item_id,score,ts,query_id,label
0,A000715434M800HLCENK9,B000UYYZ0M,1.0,1400457600,433,1
1,A000715434M800HLCENK9,B001EHAI6Y,5.0,1400457600,834,1
2,A000715434M800HLCENK9,B003ES5ZUU,2.0,1400457600,22,1
3,A000715434M800HLCENK9,B00HMZG3YS,5.0,1400457600,104,1
4,A00101847G3FJTWYGNQA,B00C7NSIO8,5.0,1379548800,57,1


In [16]:
filter_merge_src_inter = merge_src_inter[merge_src_inter['item_id'].isin(item_df['asin'].tolist())].reset_index(drop=True)
filter_merge_src_inter.shape
filter_merge_src_inter.head()

(1275989, 6)

Unnamed: 0,user_id,item_id,score,ts,query_id,label
0,A000715434M800HLCENK9,B000UYYZ0M,1.0,1400457600,433,1
1,A000715434M800HLCENK9,B001EHAI6Y,5.0,1400457600,834,1
2,A000715434M800HLCENK9,B003ES5ZUU,2.0,1400457600,22,1
3,A000715434M800HLCENK9,B00HMZG3YS,5.0,1400457600,104,1
4,A00101847G3FJTWYGNQA,B00C7NSIO8,5.0,1379548800,57,1


## Get Item Set

In [16]:
rec_item_set = set(rec_inter['item_id'].unique())
src_item_set = set(merge_src_inter['item_id'].unique())
len(rec_item_set)
len(src_item_set)
len(rec_item_set & src_item_set)
len(rec_item_set | src_item_set)

all_item_set = list(rec_item_set | src_item_set)
len(all_item_set)

63001

62993

62993

63001

63001

In [17]:
item_df = item_df[item_df['asin'].isin(all_item_set)].reset_index(drop=True)
item_df.head()
item_df.shape

Unnamed: 0,asin,title,description,brand,categories,text
0,528881469,Rand McNally 528881469 7-inch Intelliroute TND...,,,"Electronics,GPS & Navigation,Vehicle GPS,Truck...",Rand McNally 528881469 7-inch Intelliroute TND...
1,594451647,Barnes & Noble HDTV Adapter Kit for NOOK HD an...,HDTV Adapter Kit for NOOK HD and NOOK HD+This ...,,"Electronics,Computers & Accessories,Touch Scre...",Barnes & Noble HDTV Adapter Kit for NOOK HD an...
2,594481813,Barnes & Noble OV/HB-ADP Universal Power Kit.,Power up your device with this Barnes & Noble ...,Barnes &amp; Noble,"Electronics,eBook Readers & Accessories,Power ...",Barnes & Noble OV/HB-ADP Universal Power Kit. ...
3,972683275,VideoSecu 24 Long Arm TV Wall Mount Low Profil...,The VideoSecu TV mount is a mounting solution ...,VideoSecu,"Electronics,Accessories & Supplies,Audio & Vid...",VideoSecu 24 Long Arm TV Wall Mount Low Profil...
4,1400532620,Barnes & Noble Nook eReader - no 3G.,Barnes & Noble Nook eReader - no 3GMeet nook. ...,Barnes &amp; Noble,"Electronics,eBook Readers & Accessories",Barnes & Noble Nook eReader - no 3G. Barnes & ...


(63001, 6)

In [18]:
item_df_null_text = item_df.apply(count_null_text,axis=1)

In [19]:
item_df[~item_df_null_text].shape

(62882, 6)

In [20]:
(63001 - 62882) / 63001

0.001888858907001476

In [19]:
item_df = item_df.astype({"asin":'category'})
id2item = item_df['asin'].cat.categories.to_list()
# id2item[0]
item2id = {id2item[k]:k+1 for k in range(len(id2item))} # +1 for padding
# item2id[0]
item_df['item_id'] = item_df['asin'].map(item2id)

In [20]:
pad_item_df = pd.DataFrame({"item_id":[0], "asin":["<pad>"], "title":[""], "description":[""], "brand":[""], "categories":[""]})
pad_item_df

all_item_df = pd.concat([pad_item_df, item_df],axis=0)
all_item_df['item_id'] = all_item_df['item_id'].astype('int')
all_item_df = all_item_df.sort_values(by=['item_id'])
all_item_df = all_item_df.reset_index(drop=True)

all_item_df['item_id'].nunique()
all_item_df.head()
all_item_df.shape

all_item_df.to_pickle('../raw_data/item_info.pkl')

Unnamed: 0,item_id,asin,title,description,brand,categories
0,0,<pad>,,,,


62883

Unnamed: 0,item_id,asin,title,description,brand,categories,text
0,0,<pad>,,,,,
1,1,0528881469,Rand McNally 528881469 7-inch Intelliroute TND...,,,"Electronics,GPS & Navigation,Vehicle GPS,Truck...",Rand McNally 528881469 7-inch Intelliroute TND...
2,2,0594451647,Barnes & Noble HDTV Adapter Kit for NOOK HD an...,HDTV Adapter Kit for NOOK HD and NOOK HD+This ...,,"Electronics,Computers & Accessories,Touch Scre...",Barnes & Noble HDTV Adapter Kit for NOOK HD an...
3,3,0594481813,Barnes & Noble OV/HB-ADP Universal Power Kit.,Power up your device with this Barnes & Noble ...,Barnes &amp; Noble,"Electronics,eBook Readers & Accessories,Power ...",Barnes & Noble OV/HB-ADP Universal Power Kit. ...
4,4,0972683275,VideoSecu 24 Long Arm TV Wall Mount Low Profil...,The VideoSecu TV mount is a mounting solution ...,VideoSecu,"Electronics,Accessories & Supplies,Audio & Vid...",VideoSecu 24 Long Arm TV Wall Mount Low Profil...


(62883, 7)

## Get User Set

In [21]:
rec_user_set = set(filter_rec_inter['user_id'].unique())
src_user_set = set(filter_merge_src_inter['user_id'].unique())
len(rec_user_set)
len(src_user_set) 
len(rec_user_set & src_user_set)
len(rec_user_set | src_user_set)

all_user_set = list(rec_user_set & src_user_set)
len(all_user_set)

192403

192403

192403

192403

192403

In [22]:
user_info = pd.DataFrame(data=all_user_set, columns=['user'])
# user_info.head()

user_info = user_info.astype({"user":'category'})
id2user = user_info['user'].cat.categories.to_list()
# id2user[0]
user2id = {id2user[k]:k for k in range(len(id2user))}

user_info['user_id'] = user_info['user'].map(user2id)
user_info.head()
user_info.shape

Unnamed: 0,user,user_id
0,A26ITT1SYI2M80,60375
1,A1Y3TXS72GQQ4W,48445
2,ATDC6GCUBYPVJ,183215
3,A1RWIXUIPWK5RS,39731
4,A2IXJ6Q88KCH2I,77600


(192403, 2)

In [23]:
filter_rec_inter = filter_rec_inter[filter_rec_inter['user_id'].isin(all_user_set)].reset_index(drop=True)
filter_rec_inter.shape

filter_merge_src_inter = filter_merge_src_inter[filter_merge_src_inter['user_id'].isin(all_user_set)].reset_index(drop=True)
filter_merge_src_inter.shape

(1684135, 4)

(1275989, 6)

## Map user/item to ID

In [24]:
filter_rec_inter['user_id'] = filter_rec_inter['user_id'].map(user2id)
filter_rec_inter['item_id'] = filter_rec_inter['item_id'].map(item2id)
filter_rec_inter.head()
filter_rec_inter.shape

Unnamed: 0,user_id,item_id,score,ts
0,0,13169,1.0,1400457600
1,0,17983,5.0,1400457600
2,0,28305,3.0,1400457600
3,0,29220,2.0,1400457600
4,0,62158,5.0,1400457600


(1684135, 4)

In [25]:
filter_merge_src_inter['user_id'] = filter_merge_src_inter['user_id'].map(user2id)
filter_merge_src_inter['item_id'] = filter_merge_src_inter['item_id'].map(item2id)
filter_merge_src_inter.head()
filter_merge_src_inter.shape

Unnamed: 0,user_id,item_id,score,ts,query_id,label
0,0,13169,1.0,1400457600,433,1
1,0,17983,5.0,1400457600,834,1
2,0,29220,2.0,1400457600,22,1
3,0,62158,5.0,1400457600,104,1
4,1,58032,5.0,1379548800,57,1


(1275989, 6)

## Get Src Session

In [26]:
query_text = []
with gzip.open('../orig_data/query_text.txt.gz') as f:
    for l in tqdm(f):
        query_text.append(l.strip().decode())

989it [00:00, 393713.62it/s]


In [27]:
word2id = {'<pad>': 0}
id2word = ['<pad>']

query2id = {'<pad>': 0}
id2query = [{"query":"<pad>", "words":["<pad>"], "words_id":[0]}]

session_idx = 0

search_session_info_list = []
src_inter_list = []
for _, line in tqdm(filter_merge_src_inter.iterrows()):
    session_idx += 1

    user_id = line['user_id']
    session_id = session_idx
    click_item_ls = [line['item_id']]
    this_ts = line['ts']
    
    query = query_text[line['query_id']]
    query_words = query.split(' ')

    for word in query_words:
        if word not in word2id.keys():
            word2id[word] = len(word2id)
            id2word.append(word)
    
    query_words_id = [word2id[x] for x in query_words]

    if query not in query2id.keys():
        query2id[query] = len(query2id)
        id2query.append({"query":query, 
                         "words":query_words, 
                         "words_id":query_words_id})
        

    search_session_info_list.append({
        "search_session_id": session_id,
        "pos_items": click_item_ls,
        "keyword": query_words_id,
        'query_id':query2id[query],
        'click_list':[1],
        'time_list':[this_ts]
    })

    src_inter_list.append({
        "user_id": user_id,
        "item_id": click_item_ls[0],
        "search_session_id": session_id,
        "ts": this_ts,
        "keyword": query_words_id,
        'query_id':query2id[query]
    })

    
search_session_info = pd.DataFrame(search_session_info_list)
search_session_info.head()
search_session_info.shape

src_inter = pd.DataFrame(src_inter_list)
src_inter.head()
src_inter.shape

1275989it [02:15, 9419.36it/s] 


Unnamed: 0,search_session_id,pos_items,keyword,query_id,click_list,time_list
0,1,[13169],"[1, 2, 3, 4, 5, 6]",1,[1],[1400457600]
1,2,[17983],"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,[1],[1400457600]
2,3,[29220],"[3, 1, 8, 4, 14, 9, 15, 16]",3,[1],[1400457600]
3,4,[62158],"[3, 17, 18, 19, 20]",4,[1],[1400457600]
4,5,[58032],"[3, 21, 4, 22]",5,[1],[1379548800]


(1275989, 6)

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id
0,0,13169,1,1400457600,"[1, 2, 3, 4, 5, 6]",1
1,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2
2,0,29220,3,1400457600,"[3, 1, 8, 4, 14, 9, 15, 16]",3
3,0,62158,4,1400457600,"[3, 17, 18, 19, 20]",4
4,1,58032,5,1379548800,"[3, 21, 4, 22]",5


(1275989, 6)

In [28]:
id2query[0]
id2query[1]

id2word[0]
id2word[1]

{'query': '<pad>', 'words': ['<pad>'], 'words_id': [0]}

{'query': 'supplies office electronics accessory projection screen',
 'words': ['supplies',
  'office',
  'electronics',
  'accessory',
  'projection',
  'screen'],
 'words_id': [1, 2, 3, 4, 5, 6]}

'<pad>'

'supplies'

## Save Data

In [29]:
filter_rec_inter.to_pickle("../raw_data/rec_inter.pkl")

In [30]:
src_inter.to_pickle('../raw_data/src_inter.pkl')
search_session_info.to_pickle('../raw_data/session_info.pkl')

In [31]:
user_info.to_pickle('../raw_data/user_profile.pkl')
all_item_df.to_pickle('../raw_data/item_info.pkl')

In [32]:
pickle.dump(id2query, open("../vocab/query_vocab.pkl",'wb'))
pickle.dump(id2word, open("../vocab/word_vocab.pkl",'wb'))