In [1]:
import os
import numpy as np
import pandas as pd
from functools import lru_cache
import json
from tqdm import tqdm 
from numba import jit

In [2]:
data_for_recstudio  = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio'
raw_data_dir = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data'


In [4]:
@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(os.path.join(raw_data_dir, f'products_train.csv'))

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(os.path.join(data_for_recstudio, 'all_task_1_train_sessions.csv'))

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(os.path.join(data_for_recstudio, 'all_task_1_valid_sessions.csv'))

@lru_cache(maxsize=3)
def read_test_data(task):
    return pd.read_csv(os.path.join(raw_data_dir, f'sessions_test_{task}.csv'))

@lru_cache(maxsize=3)
def read_test1_data(locale):
    return pd.read_csv(os.path.join(data_for_recstudio, f'session_test_task1_{locale}.csv'))



In [4]:
product_data = read_product_data()
train_sessions = read_train_data()
valid_sessions = read_valid_data()
test1_UK_data = read_test1_data('UK')

In [37]:
# title fill nan 
product_data['title'] = product_data['title'].fillna("")

In [71]:
product_map_id = {}
reindex_product_data = product_data.set_index(product_data['id'] + '_' + product_data['locale'])
for i in tqdm(range(len(reindex_product_data))):
    product_map_id[reindex_product_data.index[i]] = i
product_map_id

100%|██████████| 1551057/1551057 [00:03<00:00, 465666.67it/s]


{'B005ZSSN10_DE': 0,
 'B08PRYN6LD_DE': 1,
 'B09MBZJ48V_DE': 2,
 'B08ZN6F26S_DE': 3,
 'B094DGRV7D_DE': 4,
 'B09JNNBDH5_DE': 5,
 'B08R62WZ1Y_DE': 6,
 'B09WK4YNX8_DE': 7,
 'B07KTKFYYS_DE': 8,
 'B0B9MPKYJK_DE': 9,
 'B0B5RMZ9FQ_DE': 10,
 'B000LXWTUU_DE': 11,
 'B06XKPB3GT_DE': 12,
 'B07Q3471S2_DE': 13,
 'B0BG4V5PBQ_DE': 14,
 'B094V3614V_DE': 15,
 'B07K27BTM8_DE': 16,
 'B078RFH53S_DE': 17,
 'B08K8MHZ4N_DE': 18,
 'B087JPRYFZ_DE': 19,
 'B00V5BR1RS_DE': 20,
 'B07K15DDKQ_DE': 21,
 'B0B3JRWV4S_DE': 22,
 'B084GYK4B9_DE': 23,
 'B08BR2NLTN_DE': 24,
 'B096B95D9Y_DE': 25,
 'B07J65G9DD_DE': 26,
 'B07XPFNBHW_DE': 27,
 'B0868TBNZZ_DE': 28,
 'B000S1KWPE_DE': 29,
 'B0916LVFGG_DE': 30,
 'B087BKQ98X_DE': 31,
 'B01MREMJ90_DE': 32,
 'B09QMCVW6P_DE': 33,
 'B09JCC7FVK_DE': 34,
 'B00IK6I6YS_DE': 35,
 'B00V07XW3E_DE': 36,
 'B07PNR4KM2_DE': 37,
 'B0B8YBSMJ1_DE': 38,
 'B06XTTLZ1B_DE': 39,
 'B0009585PI_DE': 40,
 '3954531038_DE': 41,
 'B09TV4BNB9_DE': 42,
 'B09TLCZ14D_DE': 43,
 'B075TL67K8_DE': 44,
 'B07NWCJZWV_DE': 45

In [40]:
# save as json 
# id_locale as new index
def save_corpus_as_json(corpus_path, map_id_path):
    with open(corpus_path, 'w', encoding='utf-8') as f, open(map_id_path, 'w', encoding='utf-8') as fid:
        for i in tqdm(range(len(product_data))):
            product = product_data.iloc[i]
            if pd.isna(product['title']):
                data = {'id' : f"{product['id']}_{product['locale']}", 'title' : " "}
            else:
                data = {'id' : f"{product['id']}_{product['locale']}", 'title' : product['title']}
            f.write(json.dumps(data) + '\n')
            fid.write(f"{product['id']}_{product['locale']}" + "\t" + str(i) + "\n")

def save_dev_corpus_as_json(corpus_path, map_id_path):
    with open(corpus_path, 'w', encoding='utf-8') as f, open(map_id_path, 'w', encoding='utf-8') as fid:
        for i in tqdm(range(500000, 1000000)):
            product = product_data.iloc[i]
            if pd.isna(product['title']):
                data = {'id' : f"{product['id']}_{product['locale']}", 'title' : " "}
            else:
                data = {'id' : f"{product['id']}_{product['locale']}", 'title' : product['title']}
            f.write(json.dumps(data) + '\n')
            fid.write(f"{product['id']}_{product['locale']}" + "\t" + str(i) + "\n")

In [77]:
# save query as json
def save_query_as_json(train_query_path, map_id_path, train_qrels_path, sess_len=5):
    with open(train_query_path, 'w', encoding='utf-8') as f, open(map_id_path, 'w', encoding='utf-8') as fid, \
        open(train_qrels_path, 'w', encoding='utf-8') as f_qrel:
        for i in tqdm(range(len(train_sessions))):
            sess = train_sessions.iloc[i]
            sess_locale = sess['locale']
            prev_items = sess['prev_items']
            next_item = sess['next_item']

            prev_items = eval(prev_items.replace(" ", ","))
            prev_items = prev_items[-sess_len:]
            prev_items = list(map(lambda x : x + "_"+ sess_locale, prev_items))
            # prev_items = list(map(lambda x : product_map_id[x], prev_items))
            prev_items_titles = reindex_product_data.loc[prev_items]['title'].to_list()

            # train query
            data = {'id' : str(i), 'title_list' : prev_items_titles}
            f.write(json.dumps(data) + '\n')
            fid.write(str(i) + "\t" + str(i) + "\n")

            # train qrel
            f_qrel.write(str(i) + "\t" + next_item + '_' + sess_locale + "\n")

def save_dev_query_as_json(train_query_path, map_id_path, train_qrels_path, sess_len=5):
    with open(train_query_path, 'w', encoding='utf-8') as f, open(map_id_path, 'w', encoding='utf-8') as fid, \
        open(train_qrels_path, 'w', encoding='utf-8') as f_qrel:
        for i in tqdm(range(50000)):
            sess = train_sessions.iloc[i]
            sess_locale = sess['locale']
            prev_items = sess['prev_items']
            next_item = sess['next_item']
            prev_items = eval(prev_items.replace(" ", ","))
            prev_items = prev_items[-sess_len:]
            prev_items = list(map(lambda x : x + "_"+ sess_locale, prev_items))
            prev_items_titles = reindex_product_data.loc[prev_items]['title'].to_list()

            # train query
            data = {'id' : str(i), 'title_list' : prev_items_titles}
            f.write(json.dumps(data) + '\n')
            fid.write(str(i) + "\t" + str(i) + "\n")

            # train qrel
            f_qrel.write(str(i) + "\t" + next_item + '_' + sess_locale + "\n")

In [43]:
if not os.path.exists('./data/corpus/'):
        os.makedirs('./data/corpus/')
save_corpus_as_json('./data/corpus.json', './data/corpus/mapping_id.txt')

100%|██████████| 1551057/1551057 [02:21<00:00, 10968.55it/s]


In [42]:
if not os.path.exists('./data/corpus/'):
        os.makedirs('./data/corpus/')
save_dev_corpus_as_json('./data/dev_corpus.json', './data/corpus/dev_mapping_id.txt')

100%|██████████| 500000/500000 [00:47<00:00, 10564.90it/s]


In [9]:
train_sessions.iloc[:10]

Unnamed: 0,prev_items,next_item,locale
0,['B005ZJTUXE' 'B005ZJTUXE' 'B00P8VIBBG'],B07TVSL9TW,FR
1,['B09M8HSN22' 'B09MTKZNB2' 'B07XWK3G8K' 'B09H7...,B01J5EEEQW,DE
2,['B088ZH3JF1' 'B003YCIEWC' 'B008U4NBE0' 'B073V...,B00TDNLSBU,DE
3,['B01B4S990I' 'B07QJQCHRJ' 'B0723H7YMT'],B09T77G4B5,DE
4,['B079D9HDHP' 'B01E0KS6JC' 'B079D9HDHP'],B079DDND7C,DE
5,['B08456GQ8R' 'B08455SX6C' 'B084562Z83' 'B0845...,B08456DP3R,JP
6,['B07DRPH58X' 'B098TF21JM'],B098TG1D3C,UK
7,['B07Q9ZTSJD' 'B00D8CXYXC'],B00D8CXXTC,UK
8,['B09JFRT3KK' 'B09Q5KG4G7' 'B09QRZQ9DQ' 'B0BG2...,B09FQFM117,UK
9,['B085ZGTQMQ' 'B00KL44ROA' 'B00KL44ROA' 'B06WD...,B009GX10XI,JP


In [79]:
if not os.path.exists('./data/BertTokenizer_data/train_query/'):
        os.makedirs('./data/BertTokenizer_data/train_query/')
save_query_as_json('./data/train_query.json', './data/BertTokenizer_data/train_query/mapping_id.txt', './data/BertTokenizer_data/train_qrel.txt')

 10%|▉         | 345393/3557898 [03:38<33:55, 1577.92it/s] 


KeyboardInterrupt: 

In [46]:
if not os.path.exists('./data/BertTokenizer_data/dev_query/'):
        os.makedirs('./data/BertTokenizer_data/dev_query/')
save_dev_query_as_json('./data/dev_query.json', './data/BertTokenizer_data/dev_query/dev_mapping_id.txt', './data/BertTokenizer_data/dev_qrel.txt')

100%|██████████| 50000/50000 [02:24<00:00, 344.89it/s] 


In [25]:
product_data.iloc[0]['title']

'RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wolfram Profi Dartpfeile Set mit Flights und Schäfte'

In [10]:
valid_sessions = read_valid_data()

In [11]:
valid_sessions.head(5)

Unnamed: 0,prev_items,next_item,locale
0,['B09VSN9GLS' 'B09VSG9DCG' 'B0BJ5L1ZPH' 'B09VS...,B06XG1LZ6Z,UK
1,['B00390YWXE' 'B00390YWXE' 'B09WM9W6WQ'],B01MSUI4FE,JP
2,['B01BM9V6H8' 'B01MG55XDR' 'B07VYSSRL7'],B01M6625ME,UK
3,['B092ZG24S7' 'B09BNHWWZM' 'B08CB1WG5M' '17880...,0241558573,UK
4,['B0B6NY5RM8' 'B09BJGBBBR'],B09BJF6N8K,JP


In [12]:
product_data = read_product_data()

In [13]:
product_data.head(5)

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...
2,B09MBZJ48V,DE,"Sennheiser 508377 PC 5.2 Chat, Stilvolles Mult...",68.89,Sennheiser,Multi-Colour,One size,508377,Kunstleder,,3.5 MM BUCHSE - Kann problemlos an Geräte mit ...
3,B08ZN6F26S,DE,AmyBenton Auto ab 1 2 3 ahre - Baby Aufziehbar...,18.99,Amy & Benton,Animal Car,,2008B,aufziehauto 1 jahr,,【Auto aufziehbar】: Drücken Sie einfach leicht ...
4,B094DGRV7D,DE,PLAYMOBIL - 70522 - Cavaliere mit grauem Pony,7.17,PLAYMOBIL,Nicht Zutreffend.,OneSize,70522,Polypropylen,,Inhalt: 1 Stück


In [5]:
product_data = read_product_data()

In [10]:
product_data.query("id == 'B00IVLANGG'")['title']

500010    wolfcraft Türfutter-Montageset PRO 3676000 / 8...
Name: title, dtype: object

In [15]:
reindex_product_data = product_data.set_index(product_data['id'] + '_' + product_data['locale'])

In [11]:
train_sessions = read_train_data()

In [12]:
train_UK_sessions = train_sessions.query('locale == "UK"')

In [24]:
train_UK_sessions.iloc[20]

prev_items    ['B09BP1G2TR' 'B0BK54QNZP']
next_item                      B09PD2727K
locale                                 UK
Name: 63, dtype: object

In [26]:
reindex_product_data.loc['B09BP1G2TR_UK']['title']

'Rainbow High 578314EUC Jewel Richie-Emerald Green Fashion Doll with Vitiligo Includes 2 Mix & Match Designer Outfits with Accessories-for Kids 6-12 Years Old and Collectors'