In [3]:
import os
import random
import gzip
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import html
import re
from sklearn.utils import shuffle
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)


import warnings
warnings.filterwarnings("ignore")

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    os.environ['PYTHONHASHSEED'] = str(seed)  # 为了禁止hash随机化，使得实验可复现

setup_seed(2024)

In [3]:
base_path = ".."

In [4]:
item_features = pd.read_pickle(f"{base_path}/raw_data/item_info.pkl")
item_features.head()
item_features.shape

item_vocab = item_features.set_index('item_id',drop=False).to_dict('index')
item_vocab[1]

item_features['item_id'].nunique()
item_features['item_id'].max()

Unnamed: 0,item_id,asin,title,description,brand,categories,text
0,0,<pad>,,,,,
1,1,0528881469,Rand McNally 528881469 7-inch Intelliroute TND...,,,"Electronics,GPS & Navigation,Vehicle GPS,Truck...",Rand McNally 528881469 7-inch Intelliroute TND...
2,2,0594451647,Barnes & Noble HDTV Adapter Kit for NOOK HD an...,HDTV Adapter Kit for NOOK HD and NOOK HD+This ...,,"Electronics,Computers & Accessories,Touch Scre...",Barnes & Noble HDTV Adapter Kit for NOOK HD an...
3,3,0594481813,Barnes & Noble OV/HB-ADP Universal Power Kit.,Power up your device with this Barnes & Noble ...,Barnes &amp; Noble,"Electronics,eBook Readers & Accessories,Power ...",Barnes & Noble OV/HB-ADP Universal Power Kit. ...
4,4,0972683275,VideoSecu 24 Long Arm TV Wall Mount Low Profil...,The VideoSecu TV mount is a mounting solution ...,VideoSecu,"Electronics,Accessories & Supplies,Audio & Vid...",VideoSecu 24 Long Arm TV Wall Mount Low Profil...


(62883, 7)

{'item_id': 1,
 'asin': '0528881469',
 'title': 'Rand McNally 528881469 7-inch Intelliroute TND 700 Truck GPS.',
 'description': '',
 'brand': '',
 'categories': 'Electronics,GPS & Navigation,Vehicle GPS,Trucking GPS',
 'text': 'Rand McNally 528881469 7-inch Intelliroute TND 700 Truck GPS. '}

62883

np.int64(62882)

### Build BM25 index

In [5]:
item_features.head()
item_features.shape

Unnamed: 0,item_id,asin,title,description,brand,categories,text
0,0,<pad>,,,,,
1,1,0528881469,Rand McNally 528881469 7-inch Intelliroute TND...,,,"Electronics,GPS & Navigation,Vehicle GPS,Truck...",Rand McNally 528881469 7-inch Intelliroute TND...
2,2,0594451647,Barnes & Noble HDTV Adapter Kit for NOOK HD an...,HDTV Adapter Kit for NOOK HD and NOOK HD+This ...,,"Electronics,Computers & Accessories,Touch Scre...",Barnes & Noble HDTV Adapter Kit for NOOK HD an...
3,3,0594481813,Barnes & Noble OV/HB-ADP Universal Power Kit.,Power up your device with this Barnes & Noble ...,Barnes &amp; Noble,"Electronics,eBook Readers & Accessories,Power ...",Barnes & Noble OV/HB-ADP Universal Power Kit. ...
4,4,0972683275,VideoSecu 24 Long Arm TV Wall Mount Low Profil...,The VideoSecu TV mount is a mounting solution ...,VideoSecu,"Electronics,Accessories & Supplies,Audio & Vid...",VideoSecu 24 Long Arm TV Wall Mount Low Profil...


(62883, 7)

In [6]:
# import jieba

def get_item_text(row):
    return ' '.join([row['title'], row['description']])
    
# get_item_text(all_item_df.iloc[1])

item_features['text'] = item_features.parallel_apply(get_item_text, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2246), Label(value='0 / 2246'))), …

In [7]:
def count_null_text(row):
    row_text = row['text'].strip()
    return len(row_text)==0

item_df_null_text = item_features.apply(count_null_text,axis=1)
item_df_null_text
item_df_null_text.sum()

0         True
1        False
2        False
3        False
4        False
         ...  
62878    False
62879    False
62880    False
62881    False
62882    False
Length: 62883, dtype: bool

np.int64(1)

In [8]:
item_index = item_features[['item_id','text']].rename(columns={'item_id':'id', 'text':'contents'})
item_index = item_index.astype({'id':'str', 'contents':'str'})
item_index.head()
item_index.shape

Unnamed: 0,id,contents
0,0,
1,1,Rand McNally 528881469 7-inch Intelliroute TND...
2,2,Barnes & Noble HDTV Adapter Kit for NOOK HD an...
3,3,Barnes & Noble OV/HB-ADP Universal Power Kit. ...
4,4,VideoSecu 24 Long Arm TV Wall Mount Low Profil...


(62883, 2)

In [10]:
json.dump(item_index.to_dict('records'), 
        open(('../item_corpus/corpus.json'), "w"), indent=4, ensure_ascii=False)

In [11]:
print(item_index.iloc[:10].to_dict(orient='records'))

[{'id': '0', 'contents': ' '}, {'id': '1', 'contents': 'Rand McNally 528881469 7-inch Intelliroute TND 700 Truck GPS. '}, {'id': '2', 'contents': "Barnes & Noble HDTV Adapter Kit for NOOK HD and NOOK HD+. HDTV Adapter Kit for NOOK HD and NOOK HD+This handy kit enables you to stream content from your NOOK HD or NOOK HD+ to a high-definition TV, via the included adapter and High Speed HDMI Cable. The kit also includes a pass-through that allows you to charge your NOOK while streaming, so there's no danger of running out of battery power before you're done. (A compatible NOOK Power Kit is required for this function and is not included with the adapter kit.)."}, {'id': '3', 'contents': 'Barnes & Noble OV/HB-ADP Universal Power Kit. Power up your device with this Barnes & Noble OV/HB-ADP Universal Power Kit._x000D_ _x000D_ This Universal Power Kit is specifically designed for your Nook HD and Nook HD+. The kit comes in handy in case you need to replace an old AV adapter or just happen to ne

python -m pyserini.index.lucene --collection JsonCollection --input item_corpus --index item_index --generator DefaultLuceneDocumentGenerator --threads 1 --storePositions --storeDocvectors --storeRaw

In [None]:
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher('../item_index')

In [13]:
hits = searcher.search('Rand McNally', k=10)

for i in range(len(hits)):
    print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')

 1 46328 12.88880
 2 56506 10.71190
 3 37479 10.67840
 4 37608 10.67840
 5 39277 10.61220
 6 58029 10.61220
 7 1    10.57940
 8 43342 10.57940
 9 51788 10.54670
10 46327 10.51430


In [14]:
item_vocab[46328]

{'item_id': 46328,
 'asin': 'B006ZOI9YY',
 'title': 'Rand McNally 7-Inch GPS Hard Case.',
 'description': 'Rand Mcnally 0528005197 7 GPS Hard Case.',
 'brand': 'Rand McNally',
 'categories': 'Electronics,GPS & Navigation,GPS System Accessories,Cases',
 'text': 'Rand McNally 7-Inch GPS Hard Case. Rand Mcnally 0528005197 7 GPS Hard Case.'}

In [15]:
query_vocab = pickle.load(open('../vocab/query_vocab.pkl', 'rb'))
len(query_vocab)
query_vocab[1]['query']

983

'supplies office electronics accessory projection screen'

## Load data

In [16]:
user_features = pd.read_pickle(f"{base_path}/raw_data/user_profile.pkl")
user_features.head()

user_vocab = user_features.set_index('user_id',drop=False).to_dict('index')
user_vocab[0]

user_features['user_id'].nunique()

Unnamed: 0,user,user_id
0,A26ITT1SYI2M80,60375
1,A1Y3TXS72GQQ4W,48445
2,ATDC6GCUBYPVJ,183215
3,A1RWIXUIPWK5RS,39731
4,A2IXJ6Q88KCH2I,77600


{'user': 'A000715434M800HLCENK9', 'user_id': 0}

192403

In [17]:
rec_inter = pd.read_pickle(f"{base_path}/raw_data/rec_inter.pkl")
rec_inter.head()
rec_inter.shape

Unnamed: 0,user_id,item_id,score,ts
0,0,13169,1.0,1400457600
1,0,17983,5.0,1400457600
2,0,28305,3.0,1400457600
3,0,29220,2.0,1400457600
4,0,62158,5.0,1400457600


(1684135, 4)

In [18]:
src_inter = pd.read_pickle(f"{base_path}/raw_data/src_inter.pkl")
src_inter.head()
src_inter.shape

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id
0,0,13169,1,1400457600,"[1, 2, 3, 4, 5, 6]",1
1,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2
2,0,29220,3,1400457600,"[3, 1, 8, 4, 14, 9, 15, 16]",3
3,0,62158,4,1400457600,"[3, 17, 18, 19, 20]",4
4,1,58032,5,1379548800,"[3, 21, 4, 22]",5


(1275989, 6)

In [19]:
session_info = pd.read_pickle(f"{base_path}/raw_data/session_info.pkl")
session_info.head()
session_info.shape

Unnamed: 0,search_session_id,pos_items,keyword,query_id,click_list,time_list
0,1,[13169],"[1, 2, 3, 4, 5, 6]",1,[1],[1400457600]
1,2,[17983],"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,[1],[1400457600]
2,3,[29220],"[3, 1, 8, 4, 14, 9, 15, 16]",3,[1],[1400457600]
3,4,[62158],"[3, 17, 18, 19, 20]",4,[1],[1400457600]
4,5,[58032],"[3, 21, 4, 22]",5,[1],[1379548800]


(1275989, 6)

In [20]:
session_vocab = session_info.set_index('search_session_id',drop=False).to_dict('index')

In [21]:
rec_item_set = set(rec_inter['item_id'].unique())
src_item_set = set(src_inter['item_id'].unique())

len(rec_item_set)
len(src_item_set)
len(rec_item_set | src_item_set)
len(rec_item_set & src_item_set)

62882

62874

62882

62874

In [22]:
rec_user_set = set(rec_inter['user_id'].unique())
src_user_set = set(src_inter['user_id'].unique())

len(rec_user_set)
len(src_user_set)
len(rec_user_set | src_user_set)
len(rec_user_set & src_user_set)

192403

192403

192403

192403

In [23]:
sub_rec_inter = rec_inter[['user_id','item_id','ts','score']].copy()
sub_rec_inter['search_session_id'] = 'nan'
sub_rec_inter['behavior'] = 1

sub_session_src_inter = src_inter[['user_id','ts','search_session_id']].copy()
sub_session_src_inter['item_id'] = 'nan'
sub_session_src_inter['behavior'] = 2
sub_session_src_inter['score'] = 10000

sar_inter = pd.concat([sub_rec_inter,sub_session_src_inter],axis=0)
sar_inter = sar_inter.sort_values(by=['user_id','ts','score']).reset_index(drop=True)
sar_inter.head()

Unnamed: 0,user_id,item_id,ts,score,search_session_id,behavior
0,0,13169,1400457600,1.0,,1
1,0,29220,1400457600,2.0,,1
2,0,28305,1400457600,3.0,,1
3,0,17983,1400457600,5.0,,1
4,0,62158,1400457600,5.0,,1


In [24]:
# user_vocab = {}
for key in rec_inter['user_id'].unique():
    # user_vocab[key] = {}

    user_vocab[key]['rec_his'] = []
    user_vocab[key]['rec_his_ts'] = []
    user_vocab[key]['src_session_his'] = []
    user_vocab[key]['src_session_his_ts'] = []
    user_vocab[key]['src_his'] = []
    user_vocab[key]['src_his_ts'] = []
    user_vocab[key]['src_his_query'] = []
    user_vocab[key]['all_his'] = []
    user_vocab[key]['all_his_ts'] = []
    user_vocab[key]['all_his_query'] = []

new_sar_inter_list = []
for _, line in tqdm(sar_inter.iterrows()):
    user_id, item_id, timestamp,\
        search_session_id, behavior = line['user_id'], line['item_id'], \
            line['ts'], line['search_session_id'], line['behavior']
    
    cur_rec_his_len = len(user_vocab[user_id]['rec_his'])
    cur_src_session_his_len = len(user_vocab[user_id]['src_session_his'])
    cur_src_his_len = len(user_vocab[user_id]['src_his'])
    cur_all_his_len = len(user_vocab[user_id]['all_his'])

    if behavior == 2:
        if session_vocab[search_session_id]['pos_items'][0] == user_vocab[user_id]['rec_his'][-1]:
            cur_rec_his_len -= 1
    
    # if (cur_rec_his_len > 0) and (cur_src_session_his_len > 0):
    new_sar_inter_list.append((user_id,item_id,timestamp, search_session_id,behavior,\
                               cur_rec_his_len,cur_src_session_his_len,cur_src_his_len,cur_all_his_len))

    if behavior == 1:
        user_vocab[user_id]['rec_his'].append(item_id)
        user_vocab[user_id]['rec_his_ts'].append(timestamp)
        user_vocab[user_id]['all_his'].append(item_id)
        user_vocab[user_id]['all_his_ts'].append(timestamp)
        user_vocab[user_id]['all_his_query'].append(0)
    elif behavior == 2:
        user_vocab[user_id]['src_session_his'].append(search_session_id)
        user_vocab[user_id]['src_session_his_ts'].append(timestamp)

        cur_query = session_vocab[search_session_id]['keyword']
        # cur_query = session_vocab[search_session_id]['query_id']

        cur_session_pos = session_vocab[search_session_id]['pos_items']
        user_vocab[user_id]['src_his'].extend(cur_session_pos)
        user_vocab[user_id]['src_his_ts'].extend([timestamp]*len(cur_session_pos))
        user_vocab[user_id]['src_his_query'].extend([cur_query]*len(cur_session_pos))

        user_vocab[user_id]['all_his'].extend(cur_session_pos)
        user_vocab[user_id]['all_his_ts'].extend([timestamp]*len(cur_session_pos))
        user_vocab[user_id]['all_his_query'].extend([cur_query]*len(cur_session_pos))

        # user_vocab[user_id]['all_his'].append(search_session_id)
        # user_vocab[user_id]['all_his_ts'].append(timestamp)

2960124it [04:44, 10412.19it/s]


In [25]:
new_sar_inter_df = pd.DataFrame(data=new_sar_inter_list,
                                columns=['user_id','item_id','ts','search_session_id','behavior',
                                         'rec_his','src_session_his','src_his','all_his'])
new_sar_inter_df.head()
new_sar_inter_df.shape

Unnamed: 0,user_id,item_id,ts,search_session_id,behavior,rec_his,src_session_his,src_his,all_his
0,0,13169,1400457600,,1,0,0,0,0
1,0,29220,1400457600,,1,1,0,0,1
2,0,28305,1400457600,,1,2,0,0,2
3,0,17983,1400457600,,1,3,0,0,3
4,0,62158,1400457600,,1,4,0,0,4


(2960124, 9)

In [26]:
pickle.dump(item_vocab,open(f"{base_path}/vocab/item_vocab.pkl",'wb'))

pickle.dump(user_vocab,open(f"{base_path}/vocab/user_vocab.pkl",'wb'))

pickle.dump(session_vocab,open(f"{base_path}/vocab/src_session_vocab.pkl",'wb'))

In [27]:
def splitTrainTest(user_df):
    user_df['train'].iloc[-1] = 3
    user_df['train'].iloc[-2] = 2
    return user_df

## Rec Data

In [28]:
rec_w_his_inter = rec_inter.copy()
rec_w_his_inter = rec_w_his_inter.sort_values(by=['user_id','ts']).reset_index(drop=True)
rec_w_his_inter.shape
rec_w_his_inter.head(1)

rec_new_sar_inter_df = new_sar_inter_df[new_sar_inter_df.behavior==1]
rec_new_sar_inter_df = rec_new_sar_inter_df.sort_values(by=['user_id','ts']).reset_index(drop=True)
rec_new_sar_inter_df.shape
rec_new_sar_inter_df.head(1)

(1684135, 4)

Unnamed: 0,user_id,item_id,score,ts
0,0,13169,1.0,1400457600


(1684135, 9)

Unnamed: 0,user_id,item_id,ts,search_session_id,behavior,rec_his,src_session_his,src_his,all_his
0,0,13169,1400457600,,1,0,0,0,0


In [29]:
rec_w_his_inter[['rec_his','src_session_his',
                 'src_his','all_his']] = rec_new_sar_inter_df[['rec_his','src_session_his','src_his','all_his']]
rec_w_his_inter.head()

Unnamed: 0,user_id,item_id,score,ts,rec_his,src_session_his,src_his,all_his
0,0,13169,1.0,1400457600,0,0,0,0
1,0,17983,5.0,1400457600,1,0,0,1
2,0,28305,3.0,1400457600,2,0,0,2
3,0,29220,2.0,1400457600,3,0,0,3
4,0,62158,5.0,1400457600,4,0,0,4


In [30]:
rec_w_his_inter = rec_w_his_inter[(rec_w_his_inter['rec_his']!=0) & (rec_w_his_inter['src_session_his']!=0)]
rec_w_his_inter.shape

(1288753, 8)

In [31]:
rec_inter_num = rec_w_his_inter.groupby(by=['user_id']).count().reset_index()
filtered_users_rec = rec_inter_num[rec_inter_num['item_id'] >= 3]
filtered_users_rec.head(3), filtered_users_rec['item_id'].describe()

(   user_id  item_id  score  ts  rec_his  src_session_his  src_his  all_his
 0        1        4      4   4        4                4        4        4
 2        3        4      4   4        4                4        4        4
 3        4        6      6   6        6                6        6        6,
 count    168258.000000
 mean          7.529526
 std           8.634085
 min           3.000000
 25%           4.000000
 50%           5.000000
 75%           8.000000
 max         426.000000
 Name: item_id, dtype: float64)

In [32]:
rec_w_his_inter = rec_w_his_inter[rec_w_his_inter['user_id'].isin(set(filtered_users_rec['user_id'].unique()))]
rec_w_his_inter = rec_w_his_inter.reset_index(drop=True)
rec_w_his_inter.head()
rec_w_his_inter.shape

Unnamed: 0,user_id,item_id,score,ts,rec_his,src_session_his,src_his,all_his
0,1,41809,5.0,1384041600,2,1,1,3
1,1,45935,5.0,1385769600,3,2,2,5
2,1,54080,5.0,1385769600,4,2,2,6
3,1,56442,4.0,1385769600,5,2,2,7
4,3,21975,1.0,1375142400,1,1,1,2


(1266903, 8)

In [33]:
rec_w_his_inter['train'] = 1
rec_w_his_inter_train = rec_w_his_inter.groupby('user_id').apply(splitTrainTest)

In [34]:
rec_train = rec_w_his_inter_train[rec_w_his_inter_train.train==1].reset_index(drop=True)
rec_train.drop(['train'],axis=1,inplace=True)
rec_train.shape

rec_val = rec_w_his_inter_train[rec_w_his_inter_train.train==2].reset_index(drop=True)
rec_val.drop(['train'],axis=1,inplace=True)
rec_val.shape

rec_test = rec_w_his_inter_train[rec_w_his_inter_train.train==3].reset_index(drop=True)
rec_test.drop(['train'],axis=1,inplace=True)
rec_test.shape

(930387, 8)

(168258, 8)

(168258, 8)

In [1]:
930387+168258+168258

1266903

- Sample negative for val and test 

In [35]:
num_train_neg_samples = 4
num_test_neg_samples = 99

item_set = rec_w_his_inter['item_id'].to_list()
# sample with item popularity
# item_set = item_features['item_id'].unique()

def SampleNegatives(row, cur_num_samples):
    count = 0 
    user_id = int(row['user_id'])
    cur_pos = int(row['item_id'])
    # cur_rec_his = user_vocab[user_id]['rec_his'][:int(row['rec_his'])]
    # cur_src_his = user_vocab[user_id]['src_his'][:int(row['src_his'])]
    cur_all_his = user_vocab[user_id]['all_his'][:int(row['all_his'])]

    # cur_num_samples = num_train_neg_samples if row['train'] == 1 else num_test_neg_samples

    neg_samples = []
    while count < cur_num_samples:
        cur_neg = random.choice(item_set)
        if (cur_neg in cur_all_his) or (cur_neg in neg_samples) or (cur_neg == cur_pos):
            continue
        count += 1
        neg_samples.append(cur_neg)
    return neg_samples

In [36]:
rec_train['neg_items'] = rec_train.parallel_apply(SampleNegatives,cur_num_samples=4,axis=1)
rec_train.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=33229), Label(value='0 / 33229')))…

Unnamed: 0,user_id,item_id,score,ts,rec_his,src_session_his,src_his,all_his,neg_items
0,1,41809,5.0,1384041600,2,1,1,3,"[17639, 10132, 57740, 35737]"
1,1,45935,5.0,1385769600,3,2,2,5,"[50455, 28528, 60124, 52188]"
2,3,21975,1.0,1375142400,1,1,1,2,"[41765, 25374, 22250, 2729]"
3,3,58342,5.0,1402876800,2,2,2,4,"[53083, 49845, 3275, 23152]"
4,4,3113,4.0,1361145600,1,1,1,2,"[46462, 11778, 43604, 21629]"


In [37]:
rec_val['neg_items'] = rec_val.parallel_apply(SampleNegatives,cur_num_samples=99,axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6010), Label(value='0 / 6010'))), …

In [38]:
rec_test['neg_items'] = rec_test.parallel_apply(SampleNegatives,cur_num_samples=99,axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6010), Label(value='0 / 6010'))), …

In [39]:
rec_train.to_pickle(f"{base_path}/dataset/rec_train.pkl")

rec_val.to_pickle(f"{base_path}/dataset/rec_val.pkl")

rec_test.to_pickle(f"{base_path}/dataset/rec_test.pkl")

## Search Data

### Get Processed Search data

In [40]:
session_w_his = new_sar_inter_df[new_sar_inter_df.behavior==2]
session_w_his.drop(['item_id','behavior'],axis=1,inplace=True)
session_w_his.head()
session_w_his.shape

Unnamed: 0,user_id,ts,search_session_id,rec_his,src_session_his,src_his,all_his
5,0,1400457600,1,5,0,0,5
6,0,1400457600,2,5,1,1,6
7,0,1400457600,3,5,2,2,7
8,0,1400457600,4,4,3,3,8
11,1,1379548800,5,1,0,0,2


(1275989, 7)

In [41]:
session_his_vocab = session_w_his[['search_session_id','rec_his',
                                   'src_session_his','src_his','all_his']].set_index('search_session_id').to_dict('index')

In [42]:
src_w_his_inter = src_inter.copy()

In [43]:
src_w_his_inter['rec_his'] = src_inter['search_session_id'].apply(lambda x:session_his_vocab[x]['rec_his'])
src_w_his_inter['src_session_his'] = src_inter['search_session_id'].apply(lambda x:session_his_vocab[x]['src_session_his'])
src_w_his_inter['src_his'] = src_inter['search_session_id'].apply(lambda x:session_his_vocab[x]['src_his'])
src_w_his_inter['all_his'] = src_inter['search_session_id'].apply(lambda x:session_his_vocab[x]['all_his'])

In [44]:
src_w_his_inter = src_w_his_inter.sort_values(by=['user_id','ts']).reset_index(drop=True)

src_w_his_inter.head()
src_w_his_inter.shape

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his
0,0,13169,1,1400457600,"[1, 2, 3, 4, 5, 6]",1,5,0,0,5
1,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,5,1,1,6
2,0,29220,3,1400457600,"[3, 1, 8, 4, 14, 9, 15, 16]",3,5,2,2,7
3,0,62158,4,1400457600,"[3, 17, 18, 19, 20]",4,4,3,3,8
4,1,58032,5,1379548800,"[3, 21, 4, 22]",5,1,0,0,2


(1275989, 10)

In [45]:
src_w_his_inter = src_w_his_inter[(src_w_his_inter['rec_his']!=0) & (src_w_his_inter['src_session_his']!=0)]
src_w_his_inter.shape

(1083586, 10)

In [46]:
src_inter_num = src_w_his_inter.groupby(by=['user_id']).count().reset_index()
filtered_users_src = src_inter_num[src_inter_num['item_id'] >= 3]
filtered_users_src.head(3), filtered_users_src['item_id'].describe()

(   user_id  item_id  search_session_id  ts  keyword  query_id  rec_his  \
 0        0        3                  3   3        3         3        3   
 1        1        4                  4   4        4         4        4   
 2        2        3                  3   3        3         3        3   
 
    src_session_his  src_his  all_his  
 0                3        3        3  
 1                4        4        4  
 2                3        3        3  ,
 count    191566.000000
 mean          5.647839
 std           5.800029
 min           3.000000
 25%           3.000000
 50%           4.000000
 75%           6.000000
 max         303.000000
 Name: item_id, dtype: float64)

In [47]:
src_w_his_inter = src_w_his_inter[src_w_his_inter['user_id'].isin(set(filtered_users_src['user_id'].unique()))]
src_w_his_inter = src_w_his_inter.reset_index(drop=True)
src_w_his_inter.head()
src_w_his_inter.shape

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his
0,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,5,1,1,6
1,0,29220,3,1400457600,"[3, 1, 8, 4, 14, 9, 15, 16]",3,5,2,2,7
2,0,62158,4,1400457600,"[3, 17, 18, 19, 20]",4,4,3,3,8
3,1,41809,6,1384041600,"[3, 4, 21, 23, 24, 1]",6,2,1,1,4
4,1,45935,7,1385769600,"[3, 21, 16, 4, 25, 26, 27]",7,6,2,2,8


(1081934, 10)

In [48]:
src_w_his_inter['train'] = 1
src_w_his_inter_train = src_w_his_inter.groupby('user_id').apply(splitTrainTest)
src_w_his_inter_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his,train
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,5,1,1,6,1
0,1,0,29220,3,1400457600,"[3, 1, 8, 4, 14, 9, 15, 16]",3,5,2,2,7,2
0,2,0,62158,4,1400457600,"[3, 17, 18, 19, 20]",4,4,3,3,8,3
1,3,1,41809,6,1384041600,"[3, 4, 21, 23, 24, 1]",6,2,1,1,4,1
1,4,1,45935,7,1385769600,"[3, 21, 16, 4, 25, 26, 27]",7,6,2,2,8,1


In [49]:
src_train = src_w_his_inter_train[src_w_his_inter_train.train==1].reset_index(drop=True)
src_train.drop(['train'],axis=1,inplace=True)
src_train.shape
src_train.head(1)

src_val = src_w_his_inter_train[src_w_his_inter_train.train==2].reset_index(drop=True)
src_val.drop(['train'],axis=1,inplace=True)
src_val.shape
src_val.head(1)

src_test = src_w_his_inter_train[src_w_his_inter_train.train==3].reset_index(drop=True)
src_test.drop(['train'],axis=1,inplace=True)
src_test.shape
src_test.head(1)

(698802, 10)

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his
0,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,5,1,1,6


(191566, 10)

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his
0,0,29220,3,1400457600,"[3, 1, 8, 4, 14, 9, 15, 16]",3,5,2,2,7


(191566, 10)

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his
0,0,62158,4,1400457600,"[3, 17, 18, 19, 20]",4,4,3,3,8


In [2]:
698802 + 191566 + 191566

1081934

In [50]:
# num_train_neg_samples = 4
# num_test_neg_samples = 99

# src_item_set = src_w_his_inter['item_id'].to_list()
# # sample with item popularity
# # item_set = item_features['item_id'].unique()

# def SampleNegatives_src(row, cur_num_samples):
#     count = 0 
#     user_id = int(row['user_id'])
#     cur_pos = int(row['item_id'])
#     # cur_rec_his = user_vocab[user_id]['rec_his'][:int(row['rec_his'])]
#     # cur_src_his = user_vocab[user_id]['src_his'][:int(row['src_his'])]
#     cur_all_his = user_vocab[user_id]['all_his'][:int(row['all_his'])]

#     # cur_num_samples = num_train_neg_samples if row['train'] == 1 else num_test_neg_samples

#     neg_samples = []
#     while count < cur_num_samples:
#         cur_neg = random.choice(src_item_set)
#         if (cur_neg in cur_all_his) or (cur_neg in neg_samples) or (cur_neg == cur_pos):
#             continue
#         count += 1
#         neg_samples.append(cur_neg)
#     return neg_samples

In [51]:
num_train_neg_samples = 4
num_test_neg_samples = 99

src_item_set = src_w_his_inter['item_id'].to_list()
# sample with item popularity
# item_set = item_features['item_id'].unique()

def SampleNegatives_src(row, cur_num_samples):
    count = 0 
    user_id = int(row['user_id'])
    cur_pos = int(row['item_id'])

    cur_hits = searcher.search(query_vocab[row['query_id']]['query'], k=cur_num_samples+10)
    neg_samples = [int(x.docid) for x in cur_hits if int(x.docid)!=cur_pos][:cur_num_samples]
    count += len(neg_samples)

    # cur_rec_his = user_vocab[user_id]['rec_his'][:int(row['rec_his'])]
    # cur_src_his = user_vocab[user_id]['src_his'][:int(row['src_his'])]
    cur_all_his = user_vocab[user_id]['all_his'][:int(row['all_his'])]

    # cur_num_samples = num_train_neg_samples if row['train'] == 1 else num_test_neg_samples
    # neg_samples = []
    
    while count < cur_num_samples:
        cur_neg = random.choice(src_item_set)
        if (cur_neg in cur_all_his) or (cur_neg in neg_samples) or (cur_neg == cur_pos):
            continue
        count += 1
        neg_samples.append(cur_neg)
    return neg_samples

    
    # return cur_hits_items
    

In [52]:
neg_sample_list = []
for _, line in tqdm(src_train.iterrows()):
    neg_sample_list.append(SampleNegatives_src(line, cur_num_samples=4))
src_train['neg_items'] = neg_sample_list
src_train.head()

698802it [23:13, 501.36it/s]


Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his,neg_items
0,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,5,1,1,6,"[11690, 15890, 25110, 1547]"
1,1,41809,6,1384041600,"[3, 4, 21, 23, 24, 1]",6,2,1,1,4,"[31823, 59578, 6597, 14107]"
2,1,45935,7,1385769600,"[3, 21, 16, 4, 25, 26, 27]",7,6,2,2,8,"[33884, 34327, 31772, 59462]"
3,2,50597,11,1366156800,"[3, 38, 27, 4, 39, 40]",11,3,1,1,4,"[24385, 24354, 37478, 38237]"
4,3,21975,15,1375142400,"[3, 4, 21, 16, 46]",14,1,1,1,3,"[59462, 1547, 23288, 12270]"


In [53]:
# src_train['neg_items'] = src_train.parallel_apply(SampleNegatives_src,cur_num_samples=4,axis=1)
# src_train.head()

In [54]:
src_train.to_pickle(f"{base_path}/dataset/src_train.pkl")

In [55]:
neg_sample_list = []
for _, line in tqdm(src_val.iterrows()):
    neg_sample_list.append(SampleNegatives_src(line, cur_num_samples=99))
src_val['neg_items'] = neg_sample_list
src_val.head()

191566it [25:52, 123.37it/s]


Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his,neg_items
0,0,29220,3,1400457600,"[3, 1, 8, 4, 14, 9, 15, 16]",3,5,2,2,7,"[12745, 17526, 3375, 24464, 31583, 49481, 4220..."
1,1,54080,8,1385769600,"[3, 21, 4, 28, 29, 30, 31, 32, 33]",8,5,3,3,9,"[17484, 46956, 48206, 59817, 60052, 22939, 517..."
2,2,42337,12,1370563200,"[3, 21, 41, 6, 37, 4, 42, 20]",12,5,2,2,7,"[22861, 32067, 59381, 41598, 51965, 45572, 373..."
3,3,59964,16,1402876800,"[3, 21, 4, 28, 29, 47, 48, 33]",15,5,2,2,7,"[27760, 12136, 17631, 3717, 15509, 24564, 5247..."
4,4,30244,21,1389744000,"[3, 1, 58, 4, 59]",20,4,3,3,8,"[30881, 44332, 59462, 50403, 51850, 33115, 287..."


In [56]:
# src_val['neg_items'] = src_val.parallel_apply(SampleNegatives_src,cur_num_samples=99,axis=1)

In [57]:
src_val.to_pickle(f"{base_path}/dataset/src_val.pkl")

In [58]:
neg_sample_list = []
for _, line in tqdm(src_test.iterrows()):
    neg_sample_list.append(SampleNegatives_src(line, cur_num_samples=99))
src_test['neg_items'] = neg_sample_list
src_test.head()

191566it [25:26, 125.51it/s]


Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his,neg_items
0,0,62158,4,1400457600,"[3, 17, 18, 19, 20]",4,4,3,3,8,"[61907, 52357, 24890, 24891, 41243, 59035, 366..."
1,1,56442,9,1385769600,"[3, 4, 23, 21, 20]",9,6,4,4,10,"[59462, 33693, 15295, 8619, 8690, 16586, 28729..."
2,2,47278,13,1370563200,"[3, 21, 16, 4, 34, 35, 36, 37]",10,4,3,3,8,"[49379, 62546, 57397, 51594, 56339, 21351, 514..."
3,3,62157,17,1402876800,"[3, 21, 16, 4, 49]",16,4,3,3,8,"[59462, 44938, 8619, 8690, 34388, 32536, 40119..."
4,4,61907,22,1402358400,"[3, 17, 18, 19, 20]",4,7,4,4,11,"[52357, 24890, 24891, 41243, 59035, 36680, 578..."


In [59]:
# src_test['neg_items'] = src_test.parallel_apply(SampleNegatives_src,cur_num_samples=99,axis=1)

In [60]:
src_test.to_pickle(f"{base_path}/dataset/src_test.pkl")

## Get rec/src item set

In [4]:
src_train = pd.read_pickle('../dataset/src_train.pkl')
src_train.shape

src_val = pd.read_pickle('../dataset/src_val.pkl')
src_val.shape

src_test = pd.read_pickle('../dataset/src_test.pkl')
src_test.shape

(698802, 11)

(191566, 11)

(191566, 11)

In [5]:
src_train.head()

Unnamed: 0,user_id,item_id,search_session_id,ts,keyword,query_id,rec_his,src_session_his,src_his,all_his,neg_items
0,0,17983,2,1400457600,"[3, 7, 8, 9, 10, 11, 4, 12, 13]",2,5,1,1,6,"[11690, 15890, 25110, 1547]"
1,1,41809,6,1384041600,"[3, 4, 21, 23, 24, 1]",6,2,1,1,4,"[31823, 59578, 6597, 14107]"
2,1,45935,7,1385769600,"[3, 21, 16, 4, 25, 26, 27]",7,6,2,2,8,"[33884, 34327, 31772, 59462]"
3,2,50597,11,1366156800,"[3, 38, 27, 4, 39, 40]",11,3,1,1,4,"[24385, 24354, 37478, 38237]"
4,3,21975,15,1375142400,"[3, 4, 21, 16, 46]",14,1,1,1,3,"[59462, 1547, 23288, 12270]"


In [6]:
src_item_set = set(src_train['item_id'].tolist()) | set(src_val['item_id'].tolist()) | set(src_test['item_id'].tolist())
src_item_set = list(src_item_set)
len(src_item_set)

62649

In [7]:
with open('../vocab/src_item_set.pkl', 'wb') as fp:
    pickle.dump(src_item_set,fp)

In [8]:
rec_train = pd.read_pickle('../dataset/rec_train.pkl')
rec_train.shape

rec_val = pd.read_pickle('../dataset/rec_val.pkl')
rec_val.shape

rec_test = pd.read_pickle('../dataset/rec_test.pkl')
rec_test.shape

(930387, 9)

(168258, 9)

(168258, 9)

In [9]:
rec_item_set = set(rec_train['item_id'].tolist()) | set(rec_val['item_id'].tolist()) | set(rec_test['item_id'].tolist())
rec_item_set = list(rec_item_set)
len(rec_item_set)

62777

In [10]:
with open('../vocab/rec_item_set.pkl', 'wb') as fp:
    pickle.dump(rec_item_set,fp)

In [11]:
len(set(src_item_set) | set(rec_item_set))

len(set(src_item_set) & set(rec_item_set))

62818

62608