use loc[i, field] instead of iloc[i, field], iloc is very slow.

In [1]:
import sys
sys.path = ['/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/RecStudio/'] + sys.path
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import torch
import pickle

In [2]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def _load_cache(path):
    with open(path, 'rb') as f:
        download_obj = pickle.load(f)
    return download_obj

In [4]:
def get_scores(merged_candidates_df, query_embeddings, product_embeddings):
    batch_size = 10000
    num_iter = (len(merged_candidates_df) - 1) // batch_size + 1
    score_list = []
    with torch.no_grad():
        for i in tqdm(range(num_iter)):
            st, ed = i * batch_size, (i + 1) * batch_size 
            batch_sess = merged_candidates_df.iloc[st : ed]
            batch_sess_id = torch.tensor(batch_sess['sess_id'].tolist(), dtype=torch.long, device=query_embeddings.device)
            batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
            query_emb = query_embeddings[batch_sess_id]
            product_emb = product_embeddings[batch_product_id]
            batch_score = (query_emb * product_emb).sum(dim=-1) 
            score_list.append(batch_score.cpu())
        score_list = torch.cat(score_list, dim=0).cpu().tolist()
        return score_list 

In [5]:
def normalize_scores(score_df, score_name, normalized_score_name):
    # score_df_g = cudf.from_pandas(score_df)
    score_df['exp_score'] = np.exp(score_df[score_name].to_numpy())
    scores_sum = score_df[['sess_id', 'exp_score']].groupby('sess_id').sum()
    scores_sum.reset_index(inplace=True)
    scores_sum = scores_sum.sort_values(by=['sess_id'], ascending=True)
    scores_sum.reset_index(drop=True, inplace=True)
    scores_sum.rename(columns={'exp_score' : 'score_sum'}, inplace=True)

    merged_score_df = score_df.merge(scores_sum, how='left', left_on=['sess_id'], right_on=['sess_id'])
    merged_score_df = merged_score_df.sort_values(by=['sess_id', 'product'])
    merged_score_df.reset_index(drop=True, inplace=True)
    
    # merged_score_df = merged_score_df_g.to_pandas(merged_score_df_g)
    score_df[normalized_score_name] = merged_score_df['exp_score'] / merged_score_df['score_sum']
    score_df['exp_score'] = merged_score_df['exp_score']
    score_df['score_sum'] = merged_score_df['score_sum']

    # del scores_sum_g
    # del merged_score_df_g 

# Merge valid score

In [6]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates_phase2/merged_candidates_150_feature.parquet'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1_phase2.csv'

In [7]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

In [10]:
FIELD_NAME = 'title_bert_scores'

In [9]:
DE_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_title_results_DE/results/item_reps/reordered_item.npy'
DE_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_title_results_DE/valid_results/valid_query_reps/query.npy'
JP_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_title_results_JP/results/item_reps/reordered_item.npy'
JP_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_bert_title_results_JP/valid_results/valid_query_reps/query.npy'
UK_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_roberta_title_results_UK/results/item_reps/reordered_item.npy'
UK_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_roberta_title_results_UK/valid_results/valid_query_reps/query.npy'

In [10]:
DE_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/87c62409540df6ccca9d90ab244af0e5'
JP_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/d296613e4d5aa97bebf6c4b114f02d89'
UK_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/d3540d1aadb28b19da92e77c7cf0f7e2'

In [11]:
DE_train_dataset, DE_valid_dataset = _load_cache(DE_dataset_cache)
JP_train_dataset, JP_valid_dataset = _load_cache(JP_dataset_cache)
UK_train_dataset, UK_valid_dataset = _load_cache(UK_dataset_cache)
locale_map = {
    'DE' : DE_train_dataset.field2token2idx['product_id'], 
    'JP' : JP_train_dataset.field2token2idx['product_id'], 
    'UK' : UK_train_dataset.field2token2idx['product_id']
    }

In [8]:
merged_candidates = read_merged_candidates_feature()
valid_sessions = read_valid_sessions()
EMBED_DIM = 768
merged_candidates.sort_values(by=['sess_id', 'product'], inplace=True)
merged_candidates.reset_index(drop=True, inplace=True)

In [13]:
# sess embeddings 
valid_DE_query_emb = torch.from_numpy(np.load(DE_valid_embeddings_path)) 
valid_JP_query_emb = torch.from_numpy(np.load(JP_valid_embeddings_path))
valid_UK_query_emb = torch.from_numpy(np.load(UK_valid_embeddings_path))
valid_query_embeddings = torch.empty(len(valid_sessions), EMBED_DIM)
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'DE'].index).tolist()] = valid_DE_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'JP'].index).tolist()] = valid_JP_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'UK'].index).tolist()] = valid_UK_query_emb

In [14]:
valid_query_embeddings, valid_query_embeddings.shape

(tensor([[-0.3068, -0.1536, -0.1808,  ...,  0.0290,  0.2173, -0.3839],
         [ 0.1686,  0.2973,  0.0846,  ..., -0.0785,  0.1833, -0.0888],
         [-0.0688,  0.2372, -0.1395,  ...,  0.0942,  0.2405, -0.2518],
         ...,
         [ 0.0506,  0.2156, -0.2450,  ..., -0.2550,  0.5810,  0.1119],
         [ 0.2178,  0.2354, -0.1033,  ...,  0.0651,  0.3374,  0.0170],
         [ 0.2207,  0.3106, -0.1534,  ..., -0.0553,  0.4170,  0.1376]]),
 torch.Size([261816, 768]))

In [15]:
# product_embeddings, the embeddings include padding embedding
DE_product_emb = torch.from_numpy(np.load(DE_product_embeddings_path)).type(torch.float)
JP_product_emb = torch.from_numpy(np.load(JP_product_embeddings_path)).type(torch.float)
UK_product_emb = torch.from_numpy(np.load(UK_product_embeddings_path)).type(torch.float)
product_embeddings = torch.cat([DE_product_emb, JP_product_emb, UK_product_emb], dim=0)

In [16]:
DE_product_emb

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1786,  0.1092,  0.1171,  ...,  0.1083, -0.2798,  0.0563],
        [ 0.1745, -0.1096,  0.2095,  ..., -0.2003,  0.0631, -0.1278],
        ...,
        [ 0.3762,  0.6142, -0.1706,  ...,  0.3378,  0.2007, -0.5260],
        [-0.2694, -0.1877,  0.5673,  ...,  0.1234, -0.2152, -0.2062],
        [-0.1591,  0.6152,  0.1678,  ...,  0.2960, -0.5903, -0.2701]])

In [17]:
merged_candidates_ = merged_candidates[['sess_id', 'sess_locale', 'product']].copy()

In [18]:
DE_product_list, DE_id_list = list(zip(*locale_map['DE'].items()))
JP_product_list, JP_id_list = list(zip(*locale_map['JP'].items()))
UK_product_list, UK_id_list = list(zip(*locale_map['UK'].items()))
product_list = list(DE_product_list) + list(JP_product_list) + list(UK_product_list)
id_list = list(DE_id_list) + list(JP_id_list) + list(UK_id_list)
locale_list = ['DE'] * len(DE_id_list) + ['JP'] * len(JP_id_list) + ['UK'] * len(UK_id_list)
product_id_df = pd.DataFrame({'locale' : locale_list, 'product' : product_list, 'dataset_id' : id_list})

In [19]:
# merged_candidates_g = cudf.from_pandas(merged_candidates_)
# product_id_df_g = cudf.from_pandas(product_id_df)

In [20]:
# merged_candidates_score_g = merged_candidates_g.merge(product_id_df_g, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
# merged_candidates_score_g['dataset_id'] = merged_candidates_score_g['dataset_id'].fillna(0)
# merged_candidates_score_g.drop(columns=['locale'], inplace=True)
# merged_candidates_score_g = merged_candidates_score_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_score_g.reset_index(drop=True, inplace=True)
# merged_candidates_score = merged_candidates_score_g.to_pandas()

In [21]:
merged_candidates_score = merged_candidates_.merge(product_id_df, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
merged_candidates_score['dataset_id'] = merged_candidates_score['dataset_id'].fillna(0)
merged_candidates_score.drop(columns=['locale'], inplace=True)
merged_candidates_score = merged_candidates_score.sort_values(by=['sess_id', 'product'])
merged_candidates_score.reset_index(drop=True, inplace=True)
assert len(merged_candidates_score) == len(merged_candidates)

In [22]:
# del merged_candidates_g
# del product_id_df_g
# del merged_candidates_score_g

In [23]:
locale_offset = {'DE' : 0, 'JP' : len(DE_product_list), 'UK' : len(DE_product_list) + len(JP_product_list)}
for locale in ['DE', 'JP', 'UK']:
    merged_candidates_score['dataset_id'][merged_candidates_score['sess_locale'] == locale] = \
        merged_candidates_score['dataset_id'][merged_candidates_score['sess_locale'] == locale] + locale_offset[locale]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_score['dataset_id'][merged_candidates_score['sess_locale'] == locale] = \


In [24]:
valid_query_embeddings = valid_query_embeddings.to('cuda:7')
product_embeddings = product_embeddings.to('cuda:7')

In [25]:
# using cosine scores 
normalized_valid_query_embeddings = torch.nn.functional.normalize(valid_query_embeddings, p=2, dim=-1)
normalized_product_embeddings = torch.nn.functional.normalize(product_embeddings, p=2, dim=-1)
normalized_product_embeddings = normalized_product_embeddings.type(torch.float)

In [26]:
# cosine scores
merged_candidates_score['cos_'+FIELD_NAME] = get_scores(merged_candidates_score, normalized_valid_query_embeddings, normalized_product_embeddings)

  batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7885/7885 [02:01<00:00, 65.11it/s]


In [27]:
merged_candidates_score[FIELD_NAME] = get_scores(merged_candidates_score, valid_query_embeddings, product_embeddings)

  batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7885/7885 [02:14<00:00, 58.45it/s]


In [28]:
normalize_scores(merged_candidates_score, FIELD_NAME, 'normalized_'+FIELD_NAME)

In [29]:
merged_candidates['cos_'+FIELD_NAME] = merged_candidates_score['cos_'+FIELD_NAME]

In [30]:
merged_candidates[FIELD_NAME] = merged_candidates_score[FIELD_NAME]
merged_candidates['normalized_'+FIELD_NAME] = merged_candidates_score['normalized_'+FIELD_NAME]

In [31]:
cast_dtype(merged_candidates, ['cos_'+FIELD_NAME])

In [32]:
cast_dtype(merged_candidates, [FIELD_NAME, 'normalized_'+FIELD_NAME])
merged_candidates.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [15]:
# verify gru4rec scores
merged_candidates[merged_candidates['sess_id'] == 250110].sort_values(by=['sasrec_scores_2'], ascending=False)[['sess_locale', 'product', 'normalized_sasrec_scores_2', 'sasrec_scores_2', 'normalized_'+FIELD_NAME, FIELD_NAME, 'cos_'+FIELD_NAME]].iloc[:25]

Unnamed: 0,sess_locale,product,normalized_sasrec_scores_2,sasrec_scores_2,normalized_title_bert_scores,title_bert_scores,cos_title_bert_scores
75328033,UK,B07V2CGB91,0.494195,18.951687,0.390746,463.745636,0.987632
75328030,UK,B07T4NKNR6,0.058709,16.82135,0.003202,458.941315,0.979913
75327988,UK,B078WTHG9R,0.035952,16.330944,0.063096,461.922241,0.982737
75328038,UK,B07VJJWMFD,0.026003,16.006985,7e-05,455.112732,0.96978
75328077,UK,B08HN8TT4G,0.02392,15.923451,0.00012,455.654083,0.968337
75328180,UK,B09Y829WWJ,0.022351,15.855639,0.001254,458.004059,0.975785
75328089,UK,B08RRWFCKQ,0.020468,15.767605,7.2e-05,455.147461,0.970064
75328052,UK,B07ZCPDM3B,0.019874,15.738167,0.002296,458.608643,0.979744
75328101,UK,B094Y16J1M,0.016922,15.577348,0.000205,456.190613,0.971835
75328130,UK,B09G62Q4LG,0.016818,15.571203,0.001183,457.945801,0.975734


In [34]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,target,sess_avg_price,product_price,sasrec_scores_3,normalized_sasrec_scores_3,sasrec_scores_2,normalized_sasrec_scores_2,...,w2v_l2_score,w2v_l3_score,normalized_w2v_l1_score,normalized_w2v_l2_score,normalized_w2v_l3_score,next_freq,next_freq_,cos_title_bert_scores,title_bert_scores,normalized_title_bert_scores
0,0,DE,355165591X,0.0,43.256542,8.990000,2.230508,7.658405e-09,0.512931,1.377575e-09,...,23.846624,22.635153,7.111217e-16,1.469563e-07,8.380214e-08,7.0,9.0,0.922899,371.254242,3.539007e-08
1,0,DE,3833237058,0.0,43.256542,22.000000,9.605231,1.221631e-05,9.325538,9.255110e-06,...,24.611195,25.308212,8.036434e-13,3.156726e-07,1.213808e-06,18.0,26.0,0.945082,381.494324,9.910427e-04
2,0,DE,B00CIXSI6U,0.0,43.256542,6.470000,0.714114,1.681035e-09,-0.115904,7.345399e-10,...,20.370945,19.463253,3.116658e-18,4.546946e-09,3.513310e-09,1.0,1.0,0.919212,369.278351,4.906398e-09
3,0,DE,B00NVDOWUW,0.0,43.256542,11.990000,8.750996,5.199363e-06,8.507557,4.084482e-06,...,22.638163,20.257607,1.129502e-15,4.388943e-08,7.774991e-09,40.0,43.0,0.949996,380.893890,5.436601e-04
4,0,DE,B00NVDP3ZU,0.0,43.256542,22.990000,8.056712,2.596729e-06,5.898870,3.007453e-07,...,25.067163,20.753508,2.234024e-12,4.980370e-07,1.276636e-08,50.0,79.0,0.947742,379.729675,1.697131e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78842194,261815,UK,B0BCX524Y6,0.0,9.383333,16.990000,6.813615,1.076201e-03,7.203015,4.597607e-04,...,25.790541,25.963753,1.723611e-07,1.723611e-07,3.863373e-07,5.0,5.0,0.971653,439.007324,1.447198e-04
78842195,261815,UK,B0BCX6QB4L,0.0,9.383333,10.990000,9.030836,9.881445e-03,10.123234,8.526421e-03,...,37.195541,37.079201,1.547277e-02,1.547277e-02,2.596238e-02,18.0,21.0,0.971653,439.007324,1.447198e-04
78842196,261815,UK,B0BFPJYXQL,0.0,9.383333,10.560000,0.796892,2.623396e-06,1.711608,1.895152e-06,...,18.659113,17.611990,1.378160e-10,1.378160e-10,9.116796e-11,0.0,0.0,0.955145,432.761322,2.804940e-07
78842197,261815,UK,B0BH3X67S3,0.0,9.383333,6.830000,4.250781,8.296004e-05,6.447586,2.159998e-04,...,28.308519,26.345455,2.137881e-06,2.137881e-06,5.658977e-07,7.0,12.0,0.956763,434.462494,1.537210e-06
