use loc[i, field] instead of iloc[i, field], iloc is very slow.

In [1]:
import sys
sys.path = ['/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/RecStudio/'] + sys.path
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import torch
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def _load_cache(path):
    with open(path, 'rb') as f:
        download_obj = pickle.load(f)
    return download_obj

In [4]:
def get_scores(merged_candidates_df, query_embeddings, product_embeddings):
    batch_size = 2048
    num_iter = (len(merged_candidates_df) - 1) // batch_size + 1
    score_list = []
    with torch.no_grad():
        for i in tqdm(range(num_iter)):
            st, ed = i * batch_size, (i + 1) * batch_size 
            batch_sess = merged_candidates_df.iloc[st : ed]
            batch_sess_id = torch.tensor(batch_sess['sess_id'].tolist(), device=query_embeddings.device)
            batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), device=product_embeddings.device)
            query_emb = query_embeddings[batch_sess_id]
            product_emb = product_embeddings[batch_product_id]
            batch_score = (query_emb * product_emb).sum(dim=-1) 
            score_list.append(batch_score.cpu())
        score_list = torch.cat(score_list, dim=0).cpu().tolist()
        return score_list 

In [5]:
def normalize_scores(score_df, score_name, normalized_score_name):
    score_df_g = cudf.from_pandas(score_df)
    score_df_g['exp_score'] = np.exp(score_df_g[score_name].to_numpy())
    scores_sum_g = score_df_g[['sess_id', 'exp_score']].groupby('sess_id').sum()
    scores_sum_g.reset_index(inplace=True)
    scores_sum_g = scores_sum_g.sort_values(by=['sess_id'], ascending=True)
    scores_sum_g.reset_index(drop=True, inplace=True)
    scores_sum_g.rename(columns={'exp_score' : 'score_sum'}, inplace=True)

    merged_score_df_g = score_df_g.merge(scores_sum_g, how='left', left_on=['sess_id'], right_on=['sess_id'])
    merged_score_df_g = merged_score_df_g.sort_values(by=['sess_id', 'product'])
    merged_score_df_g.reset_index(drop=True, inplace=True)
    
    merged_score_df = merged_score_df_g.to_pandas(merged_score_df_g)
    score_df[normalized_score_name] = merged_score_df['exp_score'] / merged_score_df['score_sum']
    score_df['exp_score'] = merged_score_df['exp_score']
    score_df['score_sum'] = merged_score_df['score_sum']

    del scores_sum_g
    del merged_score_df_g 

# Merge valid score

In [6]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_no_hist_feature.parquet'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'

In [7]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

In [8]:
DE_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_DE/product_embeddings_2023-05-15-16-46-30.pt'
DE_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_DE/valid_embeddings_2023-05-15-16-46-53.pt'
JP_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_JP/product_embeddings_2023-05-15-16-50-01.pt'
JP_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_JP/valid_embeddings_2023-05-15-16-50-18.pt'
UK_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_UK/product_embeddings_2023-05-15-16-52-41.pt'
UK_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_UK/valid_embeddings_2023-05-15-16-53-03.pt'

In [9]:
DE_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/c76eddf0a07106ffcce7ce8010856a3b'
JP_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/81a71d0a18766af84b3beab69bf53e69'
UK_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/250dbc09c30162452e00486051e47756'

In [10]:
DE_train_dataset, DE_valid_dataset = _load_cache(DE_dataset_cache)
JP_train_dataset, JP_valid_dataset = _load_cache(JP_dataset_cache)
UK_train_dataset, UK_valid_dataset = _load_cache(UK_dataset_cache)
locale_map = {
    'DE' : DE_train_dataset.field2token2idx['product_id'], 
    'JP' : JP_train_dataset.field2token2idx['product_id'], 
    'UK' : UK_train_dataset.field2token2idx['product_id']
    }

In [11]:
merged_candidates = read_merged_candidates_feature()
valid_sessions = read_valid_sessions()
EMBED_DIM = 256
merged_candidates.sort_values(by=['sess_id', 'product'], inplace=True)
merged_candidates.reset_index(drop=True, inplace=True)

In [14]:
# sess embeddings 
valid_DE_query_emb = torch.load(DE_valid_embeddings_path, map_location='cpu')
valid_JP_query_emb = torch.load(JP_valid_embeddings_path, map_location='cpu')
valid_UK_query_emb = torch.load(UK_valid_embeddings_path, map_location='cpu')
valid_query_embeddings = torch.empty(len(valid_sessions), EMBED_DIM)
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'DE'].index).tolist()] = valid_DE_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'JP'].index).tolist()] = valid_JP_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'UK'].index).tolist()] = valid_UK_query_emb

In [15]:
# product_embeddings 
DE_product_emb = torch.load(DE_product_embeddings_path, map_location='cpu')
JP_product_emb = torch.load(JP_product_embeddings_path, map_location='cpu')
UK_product_emb = torch.load(UK_product_embeddings_path, map_location='cpu')
product_embeddings = torch.cat([DE_product_emb, JP_product_emb, UK_product_emb], dim=0)

In [16]:
DE_product_emb.shape

torch.Size([518328, 256])

In [17]:
merged_candidates_seqmlp = merged_candidates[['sess_id', 'sess_locale', 'product']].copy()

In [18]:
DE_product_list, DE_id_list = list(zip(*locale_map['DE'].items()))
JP_product_list, JP_id_list = list(zip(*locale_map['JP'].items()))
UK_product_list, UK_id_list = list(zip(*locale_map['UK'].items()))
product_list = list(DE_product_list) + list(JP_product_list) + list(UK_product_list)
id_list = list(DE_id_list) + list(JP_id_list) + list(UK_id_list)
locale_list = ['DE'] * len(DE_id_list) + ['JP'] * len(JP_id_list) + ['UK'] * len(UK_id_list)
product_id_df = pd.DataFrame({'locale' : locale_list, 'product' : product_list, 'dataset_id' : id_list})

In [19]:
merged_candidates_seqmlp_g = cudf.from_pandas(merged_candidates_seqmlp)
product_id_df_g = cudf.from_pandas(product_id_df)

In [20]:
merged_candidates_seqmlp_score_g = merged_candidates_seqmlp_g.merge(product_id_df_g, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
merged_candidates_seqmlp_score_g['dataset_id'] = merged_candidates_seqmlp_score_g['dataset_id'].fillna(0)
merged_candidates_seqmlp_score_g.drop(columns=['locale'], inplace=True)
merged_candidates_seqmlp_score_g = merged_candidates_seqmlp_score_g.sort_values(by=['sess_id', 'product'])
merged_candidates_seqmlp_score_g.reset_index(drop=True, inplace=True)
merged_candidates_seqmlp_score = merged_candidates_seqmlp_score_g.to_pandas()
assert len(merged_candidates_seqmlp_score) == len(merged_candidates)

In [21]:
del merged_candidates_seqmlp_g
del product_id_df_g
del merged_candidates_seqmlp_score_g

In [22]:
locale_offset = {'DE' : 0, 'JP' : len(DE_product_list), 'UK' : len(DE_product_list) + len(JP_product_list)}
for locale in ['DE', 'JP', 'UK']:
    merged_candidates_seqmlp_score['dataset_id'][merged_candidates_seqmlp_score['sess_locale'] == locale] = \
        merged_candidates_seqmlp_score['dataset_id'][merged_candidates_seqmlp_score['sess_locale'] == locale] + locale_offset[locale]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_seqmlp_score['dataset_id'][merged_candidates_seqmlp_score['sess_locale'] == locale] = \


In [23]:
valid_query_embeddings = valid_query_embeddings.to('cuda:0')
product_embeddings = product_embeddings.to('cuda:0')

In [24]:
merged_candidates_seqmlp_score['seqmlp_scores'] = get_scores(merged_candidates_seqmlp_score, valid_query_embeddings, product_embeddings)

100%|██████████| 41215/41215 [05:15<00:00, 130.65it/s]


In [25]:
normalize_scores(merged_candidates_seqmlp_score, 'seqmlp_scores', 'seqmlp_normalized_scores')

In [26]:
merged_candidates['seqmlp_scores'] = merged_candidates_seqmlp_score['seqmlp_scores']
merged_candidates['seqmlp_normalized_scores'] = merged_candidates_seqmlp_score['seqmlp_normalized_scores']

In [34]:
cast_dtype(merged_candidates)
merged_candidates.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [27]:
merged_candidates_seqmlp_score

Unnamed: 0,sess_id,sess_locale,product,dataset_id,sasrec_scores_3,sasrec_normalized_scores_3,exp_score,score_sum
0,0,UK,B000OPPVCS,950401,10.891474,0.000252,53716.4073,213403423.307637
1,0,UK,B000V599Y2,1086285,10.677187,0.000203,43355.418383,213403423.307637
2,0,UK,B0018HH444,1296855,6.074605,0.000002,434.677766,213403423.307637
3,0,UK,B0079JI4DU,913338,0.000000,0.0,1.0,213403423.307637
4,0,UK,B0079JI4EY,913338,0.000000,0.0,1.0,213403423.307637
...,...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,92057,9.635838,0.000034,15303.51095,449578670.471484
84407335,361580,DE,B0BB7YSRBX,52408,9.159988,0.000021,9508.946805,449578670.471484
84407336,361580,DE,B0BB7ZMGY8,20389,10.119755,0.000055,24828.681879,449578670.471484
84407337,361580,DE,B0BD4CP7N3,491692,-1.612869,0.0,0.199315,449578670.471484


In [33]:
merged_candidates.query("sess_id == 2").query("product=='B07VYSSRL7'")

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_scores_2,sasrec_normalized_scores_2,product_freq,gru4rec_scores,gru4rec_normalized_scores,sess_avg_price,...,normalized_co_graph_counts_2,roberta_scores,roberta_normalized_scores,title_BM25_scores,sasrec_scores_3,sasrec_normalized_scores_3,normalized_all_items_co_graph_count_0,all_items_co_graph_count_0,seqmlp_scores,seqmlp_normalized_scores


In [32]:
# verify sasrec scores
merged_candidates[merged_candidates['sess_id'] == 3013].sort_values(by=['sasrec_scores_3'], ascending=False)[['sess_id', 'sess_locale', 'product', 'seqmlp_scores', 'seqmlp_normalized_scores', 'sasrec_scores_3', 'sasrec_normalized_scores_3']].iloc[:20]

Unnamed: 0,sess_id,sess_locale,product,seqmlp_scores,seqmlp_normalized_scores,sasrec_scores_3,sasrec_normalized_scores_3
706060,3013,JP,B06X6BJ13P,13.300402,0.08856,12.64131,0.183014
706058,3013,JP,B06VSKB9DB,13.193668,0.079594,12.610657,0.177489
706059,3013,JP,B06WV7B1DR,12.940981,0.061822,12.428663,0.147956
706057,3013,JP,B06VSCGTYJ,12.887313,0.058591,12.08965,0.105415
706098,3013,JP,B07S8VWT5B,14.072466,0.191663,11.718424,0.072724
706150,3013,JP,B0956X785M,12.787196,0.053009,11.063332,0.037773
706090,3013,JP,B07JB6Q58L,11.666172,0.017278,10.30368,0.017671
706144,3013,JP,B0927ZDJTL,11.755957,0.018901,10.241162,0.0166
706197,3013,JP,B09Y8G6FJJ,12.823019,0.054943,10.113563,0.014612
706039,3013,JP,B0161YPB8Y,11.818468,0.020121,10.005676,0.013117


In [27]:
valid_sessions

Unnamed: 0,prev_items,next_item,locale
0,['B09VSN9GLS' 'B09VSG9DCG' 'B0BJ5L1ZPH' 'B09VS...,B06XG1LZ6Z,UK
1,['B00390YWXE' 'B00390YWXE' 'B09WM9W6WQ'],B01MSUI4FE,JP
2,['B01BM9V6H8' 'B01MG55XDR' 'B07VYSSRL7'],B01M6625ME,UK
3,['B092ZG24S7' 'B09BNHWWZM' 'B08CB1WG5M' '17880...,0241558573,UK
4,['B0B6NY5RM8' 'B09BJGBBBR'],B09BJF6N8K,JP
...,...,...,...
361576,['B08HH6L4PB' 'B08L8N8HDR'],B00TS5UXGY,UK
361577,['B08X4L1KLZ' 'B09BBX1T4S' 'B09D76FT9D'],B09BCM5NL1,JP
361578,['B0098G6L3M' 'B00ELRLP3O' 'B00PLXGK82' 'B09GS...,B0BC38GHB4,DE
361579,['B07Q2CNLY3' 'B07Q2CNLY3' 'B07BR7DZWN' 'B07Q2...,B08H8SYLMQ,DE


In [34]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_scores_2,sasrec_normalized_scores_2
0,0,UK,B000OPPVCS,0.0,11.972421,1.994129e-04
1,0,UK,B000V599Y2,0.0,13.152878,6.492611e-04
2,0,UK,B0018HH444,0.0,5.606023,3.426590e-07
3,0,UK,B0079JI4DU,0.0,0.000000,1.259497e-09
4,0,UK,B0079JI4EY,0.0,0.000000,1.259497e-09
...,...,...,...,...,...,...
84209438,361580,DE,B0B6QWY8Q7,0.0,0.000000,1.126144e-09
84209439,361580,DE,B0BB7YSRBX,0.0,9.163816,1.074952e-05
84209440,361580,DE,B0BB7ZMGY8,0.0,11.256460,8.713899e-05
84209441,361580,DE,B0BD4CP7N3,0.0,-3.778687,2.573541e-11


# Merge test score

In [6]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test_no_hist_feature.parquet'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'

In [7]:
DE_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_DE/product_embeddings_2023-05-15-16-46-30.pt'
DE_test_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_DE/predict_embeddings_2023-05-15-16-47-56.pt'
JP_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_JP/product_embeddings_2023-05-15-16-50-01.pt'
JP_test_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_JP/predict_embeddings_2023-05-15-16-51-41.pt'
UK_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_UK/product_embeddings_2023-05-15-16-52-41.pt'
UK_test_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/query_embeddings/SeqMLP/kdd_cup_2023_UK/predict_embeddings_2023-05-15-16-56-16.pt'

In [8]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

In [9]:
merged_candidates = read_merged_candidates_feature_test()
test_sessions = read_test_sessions()
EMBED_DIM = 256
merged_candidates.sort_values(by=['sess_id', 'product'], inplace=True)
merged_candidates.reset_index(drop=True, inplace=True)

In [10]:
# sess embeddings 
test_DE_query_emb = torch.load(DE_test_embeddings_path, map_location='cpu')
test_JP_query_emb = torch.load(JP_test_embeddings_path, map_location='cpu')
test_UK_query_emb = torch.load(UK_test_embeddings_path, map_location='cpu')
test_query_embeddings = torch.cat(
    [test_DE_query_emb[test_sessions['locale'] == 'DE'], test_JP_query_emb[test_sessions['locale'] == 'JP'], test_UK_query_emb[test_sessions['locale'] == 'UK']],
    dim=0)

In [11]:
test_query_embeddings.shape

torch.Size([316971, 256])

In [12]:
# product embeddings 
DE_product_emb = torch.load(DE_product_embeddings_path, map_location='cpu')
JP_product_emb = torch.load(JP_product_embeddings_path, map_location='cpu')
UK_product_emb = torch.load(UK_product_embeddings_path, map_location='cpu')
product_embeddings = torch.cat([DE_product_emb, JP_product_emb, UK_product_emb], dim=0)

In [13]:
DE_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/c76eddf0a07106ffcce7ce8010856a3b'
JP_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/81a71d0a18766af84b3beab69bf53e69'
UK_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/250dbc09c30162452e00486051e47756'
DE_train_dataset, DE_valid_dataset = _load_cache(DE_dataset_cache)
JP_train_dataset, JP_valid_dataset = _load_cache(JP_dataset_cache)
UK_train_dataset, UK_valid_dataset = _load_cache(UK_dataset_cache)
locale_map = {
    'DE' : DE_train_dataset.field2token2idx['product_id'], 
    'JP' : JP_train_dataset.field2token2idx['product_id'], 
    'UK' : UK_train_dataset.field2token2idx['product_id']
    }

In [14]:
DE_product_list, DE_id_list = list(zip(*locale_map['DE'].items()))
JP_product_list, JP_id_list = list(zip(*locale_map['JP'].items()))
UK_product_list, UK_id_list = list(zip(*locale_map['UK'].items()))
product_list = list(DE_product_list) + list(JP_product_list) + list(UK_product_list)
id_list = list(DE_id_list) + list(JP_id_list) + list(UK_id_list)
locale_list = ['DE'] * len(DE_id_list) + ['JP'] * len(JP_id_list) + ['UK'] * len(UK_id_list)
product_id_df = pd.DataFrame({'locale' : locale_list, 'product' : product_list, 'dataset_id' : id_list})

In [15]:
merged_candidates_seqmlp = merged_candidates[['sess_id', 'sess_locale', 'product']].copy()

In [16]:
merged_candidates_seqmlp_g = cudf.from_pandas(merged_candidates_seqmlp)
product_id_df_g = cudf.from_pandas(product_id_df)

In [17]:
merged_candidates_seqmlp_score_g = merged_candidates_seqmlp_g.merge(product_id_df_g, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
merged_candidates_seqmlp_score_g['dataset_id'] = merged_candidates_seqmlp_score_g['dataset_id'].fillna(0)
merged_candidates_seqmlp_score_g.drop(columns=['locale'], inplace=True)
merged_candidates_seqmlp_score_g = merged_candidates_seqmlp_score_g.sort_values(by=['sess_id', 'product'])
merged_candidates_seqmlp_score_g.reset_index(drop=True, inplace=True)
merged_candidates_seqmlp_score = merged_candidates_seqmlp_score_g.to_pandas()

In [18]:
del merged_candidates_seqmlp_score_g
del product_id_df_g
del merged_candidates_seqmlp_g

In [19]:
locale_offset = {'DE' : 0, 'JP' : len(DE_product_list), 'UK' : len(DE_product_list) + len(JP_product_list)}
for locale in ['DE', 'JP', 'UK']:
    merged_candidates_seqmlp_score['dataset_id'][merged_candidates_seqmlp_score['sess_locale'] == locale] = \
        merged_candidates_seqmlp_score['dataset_id'][merged_candidates_seqmlp_score['sess_locale'] == locale] + locale_offset[locale]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_seqmlp_score['dataset_id'][merged_candidates_seqmlp_score['sess_locale'] == locale] = \


In [20]:
merged_candidates_seqmlp_score['seqmlp_scores'] = get_scores(merged_candidates_seqmlp_score, test_query_embeddings, product_embeddings)

100%|██████████| 33901/33901 [01:08<00:00, 498.50it/s]


In [21]:
normalize_scores(merged_candidates_seqmlp_score, 'seqmlp_scores', 'seqmlp_normalized_scores')

In [22]:
merged_candidates['seqmlp_scores'] = merged_candidates_seqmlp_score['seqmlp_scores']
merged_candidates['seqmlp_normalized_scores'] = merged_candidates_seqmlp_score['seqmlp_normalized_scores']

In [56]:
cast_dtype(merged_candidates)
merged_candidates.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [25]:
merged_candidates[(merged_candidates['sess_id'] == 230000)].sort_values(by='sasrec_normalized_scores_3', ascending=False)[['sess_locale', 'product', 'sasrec_normalized_scores_3', 'sasrec_scores_3', 'seqmlp_normalized_scores', 'seqmlp_scores']][:20]

Unnamed: 0,sess_locale,product,sasrec_normalized_scores_3,sasrec_scores_3,seqmlp_normalized_scores,seqmlp_scores
50247381,UK,B093LPWTD8,0.084951,10.35239,0.091373,10.93745
50247327,UK,B073JJ9HP2,0.07202,10.187253,0.066935,10.62621
50247391,UK,B094R47H8Q,0.069588,10.152899,0.039884,10.108466
50247286,UK,B009WU0LWG,0.060782,10.017603,0.033926,9.94668
50247376,UK,B093C9B1HK,0.051335,9.848681,0.074143,10.728494
50247387,UK,B093LVB4P7,0.043121,9.674311,0.051726,10.368446
50247272,UK,B005EJFL42,0.02952,9.295372,0.026107,9.684687
50247273,UK,B005EJFLEM,0.02896,9.276229,0.04013,10.114613
50247375,UK,B093C88F2R,0.027596,9.227978,0.041181,10.140461
50247377,UK,B093CBBKG8,0.026901,9.20247,0.046459,10.261065


In [34]:
test_sessions.loc[200000]['prev_items']

"['B09NBQKRPC' 'B09NBQKRPC' 'B0BHQQQK2D']"