use loc[i, field] instead of iloc[i, field], iloc is very slow.

In [1]:
import sys
sys.path = ['/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/RecStudio/'] + sys.path
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import torch
import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def _load_cache(path):
    with open(path, 'rb') as f:
        download_obj = pickle.load(f)
    return download_obj

In [4]:
def get_scores(merged_candidates_df, query_embeddings, product_embeddings):
    batch_size = 2048
    num_iter = (len(merged_candidates_df) - 1) // batch_size + 1
    score_list = []
    with torch.no_grad():
        for i in tqdm(range(num_iter)):
            st, ed = i * batch_size, (i + 1) * batch_size 
            batch_sess = merged_candidates_df.iloc[st : ed]
            batch_sess_id = torch.tensor(batch_sess['sess_id'].tolist(), dtype=torch.long, device=query_embeddings.device)
            batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
            query_emb = query_embeddings[batch_sess_id].to('cuda:0')
            product_emb = product_embeddings[batch_product_id].to('cuda:0')
            batch_score = (query_emb * product_emb).sum(dim=-1) 
            score_list.append(batch_score.cpu())
        score_list = torch.cat(score_list, dim=0).cpu().tolist()
        return score_list 

In [5]:
def normalize_scores(score_df, score_name, normalized_score_name):
    # score_df_g = cudf.from_pandas(score_df)
    score_df['exp_score'] = np.exp(score_df[score_name].to_numpy())
    scores_sum = score_df[['sess_id', 'exp_score']].groupby('sess_id').sum()
    scores_sum.reset_index(inplace=True)
    scores_sum = scores_sum.sort_values(by=['sess_id'], ascending=True)
    scores_sum.reset_index(drop=True, inplace=True)
    scores_sum.rename(columns={'exp_score' : 'score_sum'}, inplace=True)

    merged_score_df = score_df.merge(scores_sum, how='left', left_on=['sess_id'], right_on=['sess_id'])
    merged_score_df = merged_score_df.sort_values(by=['sess_id', 'product'])
    merged_score_df.reset_index(drop=True, inplace=True)
    
    # merged_score_df = merged_score_df_g.to_pandas(merged_score_df_g)
    score_df[normalized_score_name] = merged_score_df['exp_score'] / merged_score_df['score_sum']
    score_df['exp_score'] = merged_score_df['exp_score']
    score_df['score_sum'] = merged_score_df['score_sum']

    # del scores_sum_g
    # del merged_score_df_g 

# Merge valid score

In [6]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_feature.parquet'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'

In [7]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

In [8]:
FIELD_NAME = 'bert_scores'

In [9]:
DE_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_DE/results/item_reps/reordered_item.npy'
DE_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_DE/valid_results/valid_query_reps/query.npy'
JP_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_JP/results/item_reps/reordered_item.npy'
JP_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_results_JP/valid_results/valid_query_reps/query.npy'
UK_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_roberta_results_UK/results/item_reps/reordered_item.npy'
UK_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_roberta_results_UK/valid_results/valid_query_reps/query.npy'

In [12]:
DE_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/cf49a486c59e9dd4de37544db3d11d4f'
JP_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/0a61def413f0594014cbda0db39a5d35'
UK_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/5bd28611fbebac9d3034ecb047ad8235'

In [13]:
DE_train_dataset, DE_valid_dataset = _load_cache(DE_dataset_cache)
JP_train_dataset, JP_valid_dataset = _load_cache(JP_dataset_cache)
UK_train_dataset, UK_valid_dataset = _load_cache(UK_dataset_cache)
locale_map = {
    'DE' : DE_train_dataset.field2token2idx['product_id'], 
    'JP' : JP_train_dataset.field2token2idx['product_id'], 
    'UK' : UK_train_dataset.field2token2idx['product_id']
    }

In [14]:
merged_candidates = read_merged_candidates_feature()
valid_sessions = read_valid_sessions()
EMBED_DIM = 768
merged_candidates.sort_values(by=['sess_id', 'product'], inplace=True)
merged_candidates.reset_index(drop=True, inplace=True)

In [45]:
# sess embeddings 
valid_DE_query_emb = torch.from_numpy(np.load(DE_valid_embeddings_path)) 
valid_JP_query_emb = torch.from_numpy(np.load(JP_valid_embeddings_path))
valid_UK_query_emb = torch.from_numpy(np.load(UK_valid_embeddings_path))
valid_query_embeddings = torch.empty(len(valid_sessions), EMBED_DIM)
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'DE'].index).tolist()] = valid_DE_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'JP'].index).tolist()] = valid_JP_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'UK'].index).tolist()] = valid_UK_query_emb

In [17]:
valid_query_embeddings, valid_query_embeddings.shape

(tensor([[-0.0282,  0.0774, -0.1366,  ...,  0.1535, -0.0135,  0.0947],
         [-0.1326,  0.1515, -0.3750,  ...,  0.0164, -0.1484, -0.1188],
         [ 0.0509,  0.1774, -0.1938,  ..., -0.2617,  0.0409,  0.1053],
         ...,
         [-0.0287,  0.3558,  0.3159,  ..., -0.1061,  0.0198, -0.0101],
         [ 0.0265,  0.5761,  0.1858,  ..., -0.0052, -0.0693, -0.0411],
         [-0.2727,  0.4056, -0.1167,  ...,  0.2551, -0.1521,  0.1274]]),
 torch.Size([361581, 768]))

In [46]:
# product_embeddings, the embeddings include padding embedding
DE_product_emb = torch.from_numpy(np.load(DE_product_embeddings_path)).type(torch.float)
JP_product_emb = torch.from_numpy(np.load(JP_product_embeddings_path)).type(torch.float)
UK_product_emb = torch.from_numpy(np.load(UK_product_embeddings_path)).type(torch.float)
product_embeddings = torch.cat([DE_product_emb, JP_product_emb, UK_product_emb], dim=0)

In [21]:
DE_product_emb

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2777,  0.4617,  0.1657,  ...,  0.0505,  0.0628, -0.7108],
        [-0.0915,  0.2569, -0.1503,  ..., -0.0563,  0.1256, -0.4411],
        ...,
        [ 0.2559,  0.1071, -0.1689,  ..., -0.0935, -0.0517, -0.4844],
        [-0.1851,  0.6337,  0.0548,  ...,  0.1978, -0.4778, -0.8956],
        [-0.1200,  0.1002, -0.4467,  ...,  0.1571, -0.4825, -0.3066]])

In [22]:
merged_candidates_ = merged_candidates[['sess_id', 'sess_locale', 'product']].copy()

In [23]:
DE_product_list, DE_id_list = list(zip(*locale_map['DE'].items()))
JP_product_list, JP_id_list = list(zip(*locale_map['JP'].items()))
UK_product_list, UK_id_list = list(zip(*locale_map['UK'].items()))
product_list = list(DE_product_list) + list(JP_product_list) + list(UK_product_list)
id_list = list(DE_id_list) + list(JP_id_list) + list(UK_id_list)
locale_list = ['DE'] * len(DE_id_list) + ['JP'] * len(JP_id_list) + ['UK'] * len(UK_id_list)
product_id_df = pd.DataFrame({'locale' : locale_list, 'product' : product_list, 'dataset_id' : id_list})

In [17]:
# merged_candidates_g = cudf.from_pandas(merged_candidates_)
# product_id_df_g = cudf.from_pandas(product_id_df)

In [18]:
# merged_candidates_score_g = merged_candidates_g.merge(product_id_df_g, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
# merged_candidates_score_g['dataset_id'] = merged_candidates_score_g['dataset_id'].fillna(0)
# merged_candidates_score_g.drop(columns=['locale'], inplace=True)
# merged_candidates_score_g = merged_candidates_score_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_score_g.reset_index(drop=True, inplace=True)
# merged_candidates_score = merged_candidates_score_g.to_pandas()

In [24]:
merged_candidates_score = merged_candidates_.merge(product_id_df, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
merged_candidates_score['dataset_id'] = merged_candidates_score['dataset_id'].fillna(0)
merged_candidates_score.drop(columns=['locale'], inplace=True)
merged_candidates_score = merged_candidates_score.sort_values(by=['sess_id', 'product'])
merged_candidates_score.reset_index(drop=True, inplace=True)
assert len(merged_candidates_score) == len(merged_candidates)

In [19]:
# del merged_candidates_g
# del product_id_df_g
# del merged_candidates_score_g

In [25]:
locale_offset = {'DE' : 0, 'JP' : len(DE_product_list), 'UK' : len(DE_product_list) + len(JP_product_list)}
for locale in ['DE', 'JP', 'UK']:
    merged_candidates_score['dataset_id'][merged_candidates_score['sess_locale'] == locale] = \
        merged_candidates_score['dataset_id'][merged_candidates_score['sess_locale'] == locale] + locale_offset[locale]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_score['dataset_id'][merged_candidates_score['sess_locale'] == locale] = \


In [47]:
# using cosine scores 
normalized_valid_query_embeddings = torch.nn.functional.normalize(valid_query_embeddings, p=2, dim=-1)
normalized_product_embeddings = torch.nn.functional.normalize(product_embeddings, p=2, dim=-1)
normalized_product_embeddings = normalized_product_embeddings.type(torch.float)

In [48]:
# cosine scores
merged_candidates_score['cos_'+FIELD_NAME] = get_scores(merged_candidates_score, normalized_valid_query_embeddings, normalized_product_embeddings)

  batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
100%|██████████| 37877/37877 [03:14<00:00, 194.90it/s]


In [26]:
merged_candidates_score[FIELD_NAME] = get_scores(merged_candidates_score, valid_query_embeddings, product_embeddings)

  batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
100%|██████████| 37877/37877 [03:07<00:00, 202.28it/s]


In [28]:
normalize_scores(merged_candidates_score, FIELD_NAME, 'normalized_'+FIELD_NAME)

In [49]:
merged_candidates['cos_'+FIELD_NAME] = merged_candidates_score['cos_'+FIELD_NAME]

In [29]:
merged_candidates[FIELD_NAME] = merged_candidates_score[FIELD_NAME]
merged_candidates['normalized_'+FIELD_NAME] = merged_candidates_score['normalized_'+FIELD_NAME]

In [55]:
cast_dtype(merged_candidates, ['cos_'+FIELD_NAME])

In [56]:
cast_dtype(merged_candidates, [FIELD_NAME, 'normalized_'+FIELD_NAME])
merged_candidates.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [57]:
# verify gru4rec scores
merged_candidates[merged_candidates['sess_id'] == 150009].sort_values(by=['sasrec_scores_2'], ascending=False)[['sess_locale', 'product', 'normalized_sasrec_scores_2', 'sasrec_scores_2', 'normalized_bert_scores', 'cos_bert_scores', 'bert_scores']].iloc[:25]

Unnamed: 0,sess_locale,product,normalized_sasrec_scores_2,sasrec_scores_2,normalized_bert_scores,cos_bert_scores,bert_scores
32168671,JP,B07QJWZWXL,0.13292,15.21209,0.007895,0.947035,185.95015
32168750,JP,B09HPP2P62,0.111624,15.037471,0.005894,0.954957,185.657837
32168709,JP,B08LGXL3KL,0.075404,14.645205,0.000298,0.937566,182.672867
32168670,JP,B07QCL7FHC,0.07391,14.625184,0.026392,0.959379,187.157013
32168828,JP,B0B71HWNBQ,0.072561,14.606762,0.003284,0.952611,185.073151
32168714,JP,B08P57HNCJ,0.060838,14.430552,0.001012,0.919057,183.896027
32168698,JP,B08937YX6R,0.057737,14.378245,0.008227,0.948958,185.991364
32168640,JP,B015ZFKLNK,0.029743,13.71495,0.018403,0.949049,186.796448
32168783,JP,B09WCKD74Q,0.028454,13.670612,0.003087,0.945784,185.0112
32168681,JP,B07Y421Y1S,0.027272,13.628181,0.000413,0.93642,182.999954


In [41]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,target,sess_avg_price,product_price,product_freq,sasrec_scores_2,normalized_sasrec_scores_2,sasrec_scores_3,...,roberta_scores,normalized_roberta_scores,co_graph_counts_0,normalized_co_graph_counts_0,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,bert_scores,normalized_bert_scores
0,0,UK,B000V599Y2,0.0,7.388571,5.200000,37.0,13.152878,7.433639e-04,10.677187,...,259.157867,1.341519e-06,3.0,0.004992,0.000000,0.000000,0.0,0.000000,476.148773,8.255889e-06
1,0,UK,B007VZUA7U,0.0,7.388571,7.000000,36.0,9.393598,1.732076e-05,8.838863,...,257.981598,4.137609e-07,3.0,0.004992,0.000000,0.000000,0.0,0.000000,475.083313,2.844725e-06
2,0,UK,B009EUAEQC,0.0,7.388571,7.490000,4.0,11.754339,1.835794e-04,10.670128,...,255.483337,3.402269e-08,6.0,0.009983,1.033333,0.004797,0.0,0.000000,476.230896,8.962503e-06
3,0,UK,B00AH02IWG,0.0,7.388571,8.500000,3.0,12.194766,2.851667e-04,11.166204,...,255.024780,2.150898e-08,4.0,0.006656,1.250000,0.005803,1.0,0.007937,472.108978,1.453126e-07
4,0,UK,B00I0UKKD4,0.0,7.388571,17.049999,118.0,11.835367,1.990737e-04,11.681271,...,267.615601,6.320386e-03,3.0,0.004992,1.833333,0.008512,1.0,0.007937,482.222168,3.584311e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77570148,361580,DE,B0BB7XV97M,0.0,32.424000,47.990002,56.0,9.117821,6.076918e-05,9.635838,...,263.574158,1.367507e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000,388.997711,1.905752e-04
77570149,361580,DE,B0BB7YSRBX,0.0,32.424000,43.990002,58.0,9.163816,6.362959e-05,9.159988,...,263.523743,1.300273e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000,388.911377,1.748122e-04
77570150,361580,DE,B0BB7ZMGY8,0.0,32.424000,41.990002,452.0,11.256460,5.158017e-04,10.119755,...,263.567017,1.357776e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000,387.696472,5.187348e-05
77570151,361580,DE,B0BD4CP7N3,0.0,32.424000,24.990000,1.0,-3.778687,1.523355e-10,-1.612869,...,265.401611,8.503204e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000,384.966888,3.384560e-06
