In [1]:
import sys
sys.path = ['../RecStudio'] + sys.path
import pandas as pd 
import numpy as np 
import torch
from recstudio.data.advance_dataset import KDDCUPSeqDataset, KDDCUPSessionDataset
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _load_cache(path):
    """Deserialize and return the pickled object stored at ``path``.

    The file is opened in binary mode and is closed again before the
    function returns.

    NOTE: unpickling can execute arbitrary code, so only point this at
    cache files from a trusted source.
    """
    with open(path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [3]:
def load_datasets_from_cache(data_dir):
    """Rebuild ``KDDCUPSessionDataset`` objects from a pickled cache file.

    Despite its name, ``data_dir`` is used as the path of the cache file: it
    is unpickled via ``_load_cache`` and is also passed to the dataset
    constructor. For every cached object a fresh ``KDDCUPSessionDataset`` is
    constructed, and each entry of the cached object's ``__dict__`` is then
    copied onto it.

    Returns:
        list: the rebuilt datasets, in the same order as in the cache.
    """
    cached_objects = _load_cache(data_dir)
    rebuilt = []
    for position in range(len(cached_objects)):
        # Constructor arguments are kept exactly as before; the instance's
        # attributes are overwritten from the cached object right below.
        fresh = KDDCUPSessionDataset(None, data_dir, None, True)
        rebuilt.append(fresh)
        cached = cached_objects[position]
        for attr_name in cached.__dict__:
            setattr(fresh, attr_name, getattr(cached, attr_name))
    return rebuilt

In [4]:
# Product metadata table; later cells look up titles by `id` and filter rows by `locale`.
product_data = pd.read_csv('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv')

# DE: reorder the item embeddings to follow the dataset's item order

In [5]:
# Load the item embedding matrix saved by the DE text-model run (one row per item).
DE_product_vectors = np.load('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_DE/results/item_reps/item.npy')
# Prepend a single all-zero row so that index 0 is a padding vector and every
# original row moves up by one position.
padding_vector = np.zeros((1, DE_product_vectors.shape[-1]))
DE_product_vectors = np.concatenate((padding_vector, DE_product_vectors), axis=0)

In [6]:
# Sanity check: the row count includes the extra zero row prepended at index 0.
DE_product_vectors.shape

(518328, 768)

In [7]:
# Rebuild the cached dataset objects and keep the first one in the list.
# NOTE(review): which split/variant sits at index 0 is not visible here — confirm.
DE_dataset = load_datasets_from_cache('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/cf49a486c59e9dd4de37544db3d11d4f')[0]

In [8]:
# Per-item 'DE_index' feature column, cast with .long(); it is used as row
# indices into DE_product_vectors in the next step.
# NOTE(review): presumably a torch tensor of 1-based embedding rows with 0 for padding — confirm.
DE_index = DE_dataset.item_feat.get_col('DE_index').long()

In [57]:
# Gather rows in dataset item order: row i of the result is DE_product_vectors[DE_index[i]].
reordered_DE_product_vectors = DE_product_vectors[DE_index]

In [58]:
# Query embedding (reordered row 10) for the similarity spot check that follows.
vector = reordered_DE_product_vectors[10]

In [59]:
# Show the reordered matrix next to the original (padded) one to confirm the rows really moved.
reordered_DE_product_vectors, DE_product_vectors

(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.27767107,  0.46169734,  0.1656774 , ...,  0.0504719 ,
          0.06277502, -0.71079695],
        [-0.09146871,  0.25685158, -0.15032132, ..., -0.05625479,
          0.12564886, -0.44113448],
        ...,
        [ 0.25588366,  0.10712019, -0.16894843, ..., -0.09348757,
         -0.05167353, -0.48440662],
        [-0.18512085,  0.63365602,  0.05480206, ...,  0.19784965,
         -0.47778049, -0.89562184],
        [-0.11996938,  0.10017258, -0.44667378, ...,  0.15710057,
         -0.48249194, -0.30656368]]),
 array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.15347071, -0.04191907, -0.02111584, ...,  0.26753294,
         -0.42893904, -0.15342619],
        [ 0.10978135,  0.18713956, -0.26135033, ..., -0.15469003,
          0.10276753, -0.40611464],
        ...,
        [ 0.05648566,  0.17951466, -0.13526079, ..., -

In [60]:
# Spot check: indices of the 10 rows with the largest dot product against `vector`.
# Scores are negated so that the ascending argsort lists the best matches first.
np.argsort(-(reordered_DE_product_vectors @ vector))[:10]

array([    10,  79610,     11,  79611, 220141, 363759, 233091, 212236,
       205996, 256106])

In [64]:
# Map the two best-scoring row indices (10 and 79610) back to their product ids.
DE_dataset.field2tokens['product_id'][10], DE_dataset.field2tokens['product_id'][79610]

('B003YCIEWC', 'B003YCIEWW')

In [39]:
# Title of the first product_data row with this id (the query item itself).
product_data.query("id=='B003YCIEWC'").iloc[0]['title']

'ROMMELSBACHER Kleinbackofen BG 1055/E - 18 Liter Backraum, 7 Heizarten inkl. Umluft, Temperaturen von 80 - 230 °C, Doppelverglasung, Innenbeleuchtung, Zeitschaltuhr, 1050 W, Edelstahl'

In [65]:
# Title of the nearest neighbour, to judge by eye whether the two products are similar.
product_data.query("id=='B003YCIEWW'").iloc[0]['title']

'ROMMELSBACHER Kleinbackofen BG 950 "Speedy" - 10 Liter Backraum, 2 Quarzheizelemente, Temperatur von 80 - 230 °, Ober-/Unterhitze, Zeitschaltuhr, antihaftbeschichteter Backraum, Doppelverglasung'

In [22]:
# Final look at the reordered matrix and its shape before it is written to disk.
reordered_DE_product_vectors, reordered_DE_product_vectors.shape

(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.27767107,  0.46169734,  0.1656774 , ...,  0.0504719 ,
          0.06277502, -0.71079695],
        [-0.09146871,  0.25685158, -0.15032132, ..., -0.05625479,
          0.12564886, -0.44113448],
        ...,
        [ 0.25588366,  0.10712019, -0.16894843, ..., -0.09348757,
         -0.05167353, -0.48440662],
        [-0.18512085,  0.63365602,  0.05480206, ...,  0.19784965,
         -0.47778049, -0.89562184],
        [-0.11996938,  0.10017258, -0.44667378, ...,  0.15710057,
         -0.48249194, -0.30656368]]),
 (518328, 768))

In [23]:
# Persist the reordered embeddings as reordered_item.npy in the same directory as the source item.npy.
np.save('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_DE/results/item_reps/reordered_item.npy', reordered_DE_product_vectors)

# JP: reorder the item embeddings to follow the dataset's item order

In [24]:
# Load the item embedding matrix saved by the JP text-model run (one row per item).
JP_product_vectors = np.load('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_JP/results/item_reps/item.npy')
# Prepend a single all-zero row so that index 0 is a padding vector and every
# original row moves up by one position.
padding_vector = np.zeros((1, JP_product_vectors.shape[-1]))
JP_product_vectors = np.concatenate((padding_vector, JP_product_vectors), axis=0)

In [25]:
# Sanity check: the row count includes the extra zero row prepended at index 0.
JP_product_vectors.shape

(395010, 768)

In [26]:
# Rebuild the cached dataset objects and keep the first one in the list.
# NOTE(review): which split/variant sits at index 0 is not visible here — confirm.
JP_dataset = load_datasets_from_cache('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/0a61def413f0594014cbda0db39a5d35')[0]

In [27]:
# Per-item 'JP_index' feature column, cast with .long(); it is used as row
# indices into JP_product_vectors in the next step.
# NOTE(review): presumably a torch tensor of 1-based embedding rows with 0 for padding — confirm.
JP_index = JP_dataset.item_feat.get_col('JP_index').long()

In [28]:
# Gather rows in dataset item order: row i of the result is JP_product_vectors[JP_index[i]].
reordered_JP_product_vectors = JP_product_vectors[JP_index]

In [29]:
# Final look at the reordered matrix and its shape before it is written to disk.
reordered_JP_product_vectors, reordered_JP_product_vectors.shape

(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.07792613, -0.20774005, -0.11552254, ..., -0.14221342,
         -0.01400352,  0.45575038],
        [-0.15231362, -0.08534466, -0.02059302, ..., -0.3459157 ,
         -0.01568456,  0.2924411 ],
        ...,
        [-0.04140345,  0.36435166, -0.1477115 , ...,  0.11874112,
          0.21343319, -0.23807478],
        [ 0.14280741, -0.09587425, -0.50497603, ...,  0.24368274,
          0.10076301,  0.17239091],
        [-0.34945658,  0.17122674, -0.46473494, ..., -0.43367937,
          0.0645022 , -0.05137298]]),
 (395010, 768))

In [30]:
# Persist the reordered embeddings as reordered_item.npy in the same directory as the source item.npy.
np.save('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_JP/results/item_reps/reordered_item.npy', reordered_JP_product_vectors)

# UK: sanity-check the item embeddings, then reorder them to follow the dataset's item order

In [5]:
# Load the item embedding matrix saved by the UK text-model run (one row per item).
UK_product_vectors = np.load('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_roberta_results_UK/results/item_reps/item.npy')
# Prepend a single all-zero row so that index 0 is a padding vector and every
# original row moves up by one position.
padding_vector = np.zeros((1, UK_product_vectors.shape[-1]))
UK_product_vectors = np.concatenate((padding_vector, UK_product_vectors), axis=0)

In [6]:
# Sanity check: the row count includes the extra zero row prepended at index 0.
UK_product_vectors.shape

(500181, 768)

In [22]:
# Query embedding (row 2000 of the padded, not yet reordered matrix) for a similarity spot check.
one_vector = UK_product_vectors[2000]

In [23]:
# Spot check: indices of the 10 rows with the largest dot product against `one_vector`.
# Scores are negated so that the ascending argsort lists the best matches first.
np.argsort(-(UK_product_vectors @ one_vector))[:10]

array([  2000, 112424, 202301, 472272, 272461, 391722, 303370, 232505,
       147477, 469652])

In [9]:
# Keep only the rows of the product table whose locale is 'UK'.
UK_product_data = product_data.query("locale=='UK'")

In [10]:
# Display the filtered frame (the original row labels from product_data are kept).
UK_product_data

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
913336,B087LZNPHS,UK,"SOCHOW Sherpa Fleece Throw Blanket, Double-Sid...",24.99,SOCHOW,Teal Green,127cm×150cm,,100% Polyester,,COLOR: The sherpa throw blanket is available i...
913337,B08THFN1KX,UK,Hippowarehouse Personalised Photo Printed Mous...,9.95,Hippowarehouse,White,240mm x 190mm x 60mm,50245-Mat-Perso,Rubber,,Competitively priced
913338,0804185328,UK,"500 Easy Recipes for Every Machine, Both Stove...",16.49,Clarkson Potter,White,,,,"Scarbrough, Mark",
913339,B09VBKDBW6,UK,"TYHJOY Mini Bag Sealer, Handheld Vacuum Heat S...",11.99,TYHJOY,Black,,FBA-sealer-black,Acrylonitrile Butadiene Styrene,,【AFTER-SALE】This handheld food heat sealer sho...
913340,B096ZW8B49,UK,Lucosobie Steering Wheel Lock - Car Anti-Theft...,26.99,Lucosobie,Black,,,Alloy Steel,,🔐【 Anti-Friction & Customer First】Each box of ...
...,...,...,...,...,...,...,...,...,...,...,...
1413511,B08D7KW8VK,UK,TOMHOUSEE Anime Cosplay Short Straight Hair Wi...,9.99,TOMHOUSEE,Deep Grey Yuki,,,Synthetic,,
1413512,B073WXLXR9,UK,Crystals NEW brilliant ink twister bingo dabbe...,8.99,CRYSTALS,"Orange,blue,green,pink,red,purple",,,Plastic,,
1413513,1529393833,UK,"Before I Do: the new, funny and unexpected lov...",4.50,Hodder Paperbacks,,,,,"Cousens, Sophie",
1413514,B0B3TJ1NDN,UK,"Black iPhone Charger Cable, iPhone Charger Bra...",4.49,AA-TECH,Black,2M,brd-ip-black-2022,Nylon Braided,,Added Protection: An additional layer of prote...


In [24]:
# Padded embedding row 2000 maps to positional row 1999 because of the zero row at index 0.
# NOTE(review): assumes item.npy rows follow the row order of UK_product_data — confirm.
UK_product_data.iloc[1999]['title']

'Lest We Forget Flag Remembrance Day Flag Poppy Flag 3 X 5 ft Remembrance Sunday Poppy Flag Remembrance Day Decorations for Heroes Soldiers Outdoors Street Square Lest We Forget Banner'

In [25]:
# Second entry of the top-10 list (padded row 112424 -> positional row 112423), for comparison with the query title.
UK_product_data.iloc[112423]['title']

'Lest We Forget Flag Remembrance Day Flag Poppy Flag 3 X 5 ft Remembrance Sunday Poppy Flag Remembrance Day Decorations for Heroes Soldiers Outdoors Street Square Lest We Forget Banner'

In [26]:
# Rebuild the cached dataset objects and keep the first one in the list.
# NOTE(review): which split/variant sits at index 0 is not visible here — confirm.
UK_dataset = load_datasets_from_cache('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/5bd28611fbebac9d3034ecb047ad8235')[0]

In [17]:
# Number of product-id tokens in the dataset; compare with the row count of the padded UK_product_vectors.
len(UK_dataset.field2tokens['product_id'])

500181

In [22]:
# Length of the 'input_ids' entry stored at index 1 of title_feat.
# NOTE(review): presumably the tokenised title of item 1 — confirm.
len(UK_dataset.title_feat['input_ids'][1])

43

In [27]:
# Per-item 'UK_index' feature column, cast with .long(); it is used as row
# indices into UK_product_vectors in the next step.
# NOTE(review): presumably a torch tensor of 1-based embedding rows with 0 for padding — confirm.
UK_index = UK_dataset.item_feat.get_col('UK_index').long()

In [28]:
# Gather rows in dataset item order: row i of the result is UK_product_vectors[UK_index[i]].
reordered_UK_product_vectors = UK_product_vectors[UK_index]

In [29]:
# Final look at the reordered matrix before it is written to disk.
reordered_UK_product_vectors

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.31055942,  0.26857582, -0.02708881, ..., -0.20840503,
         0.22763547,  0.09966043],
       [ 0.28270358,  0.23816437, -0.08710322, ..., -0.31988525,
         0.05173429,  0.10050996],
       ...,
       [ 0.01956559,  0.38649058, -0.10972144, ..., -0.23125383,
         0.18344085, -0.13115814],
       [ 0.3251771 ,  0.32289335,  0.0486753 , ..., -0.50031304,
         0.06716381,  0.1378964 ],
       [-0.03344749,  0.29696861,  0.03735647, ..., -0.255467  ,
         0.19779138,  0.2023138 ]])

In [16]:
# Persist the reordered embeddings as reordered_item.npy in the same directory as the source item.npy.
np.save('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_roberta_results_UK/results/item_reps/reordered_item.npy', reordered_UK_product_vectors)