use loc[i, field] instead of iloc[i, field], iloc is very slow.

In [1]:
import sys
sys.path = ['/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/RecStudio/'] + sys.path
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import torch
import pickle

In [2]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def _load_cache(path):
    with open(path, 'rb') as f:
        download_obj = pickle.load(f)
    return download_obj

In [4]:
def get_scores(merged_candidates_df, query_embeddings, product_embeddings):
    batch_size = 2048
    num_iter = (len(merged_candidates_df) - 1) // batch_size + 1
    score_list = []
    with torch.no_grad():
        for i in tqdm(range(num_iter)):
            st, ed = i * batch_size, (i + 1) * batch_size 
            batch_sess = merged_candidates_df.iloc[st : ed]
            batch_sess_id = torch.tensor(batch_sess['sess_id'].tolist(), dtype=torch.long, device=query_embeddings.device)
            batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
            query_emb = query_embeddings[batch_sess_id]
            product_emb = product_embeddings[batch_product_id]
            batch_score = (query_emb * product_emb).sum(dim=-1) 
            score_list.append(batch_score.cpu())
        score_list = torch.cat(score_list, dim=0).cpu().tolist()
        return score_list 

In [5]:
def normalize_scores(score_df, score_name, normalized_score_name):
    # score_df_g = cudf.from_pandas(score_df)
    score_df['exp_score'] = np.exp(score_df[score_name].to_numpy())
    scores_sum = score_df[['sess_id', 'exp_score']].groupby('sess_id').sum()
    scores_sum.reset_index(inplace=True)
    scores_sum = scores_sum.sort_values(by=['sess_id'], ascending=True)
    scores_sum.reset_index(drop=True, inplace=True)
    scores_sum.rename(columns={'exp_score' : 'score_sum'}, inplace=True)

    merged_score_df = score_df.merge(scores_sum, how='left', left_on=['sess_id'], right_on=['sess_id'])
    merged_score_df = merged_score_df.sort_values(by=['sess_id', 'product'])
    merged_score_df.reset_index(drop=True, inplace=True)
    
    # merged_score_df = merged_score_df_g.to_pandas(merged_score_df_g)
    score_df[normalized_score_name] = merged_score_df['exp_score'] / merged_score_df['score_sum']
    score_df['exp_score'] = merged_score_df['exp_score']
    score_df['score_sum'] = merged_score_df['score_sum']

    # del scores_sum_g
    # del merged_score_df_g 

# Merge valid score

In [6]:
FIELD_NAME = 'gru4rec_feat_scores_2'

In [7]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates_phase2/merged_candidates_150_feature.parquet'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1_phase2.csv'

In [8]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

In [9]:
DE_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates_phase2/query_embeddings/GRU4Rec_Next_Feat/kdd_cup_2023_DE/product_embeddings_2023-06-03-23-30-48.pt'
DE_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates_phase2/query_embeddings/GRU4Rec_Next_Feat/kdd_cup_2023_DE/valid_embeddings_2023-06-03-23-30-56.pt'
JP_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates_phase2/query_embeddings/GRU4Rec_Next_Feat/kdd_cup_2023_JP/product_embeddings_2023-06-03-23-32-03.pt'
JP_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates_phase2/query_embeddings/GRU4Rec_Next_Feat/kdd_cup_2023_JP/valid_embeddings_2023-06-03-23-32-11.pt'
UK_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates_phase2/query_embeddings/GRU4Rec_Next_Feat/kdd_cup_2023_UK/product_embeddings_2023-06-03-23-33-21.pt'
UK_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates_phase2/query_embeddings/GRU4Rec_Next_Feat/kdd_cup_2023_UK/valid_embeddings_2023-06-03-23-33-30.pt'

In [10]:
DE_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/b5aeac4e5b9ff0518bbcb59a28086594'
JP_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/2536617955df215e0047f5b220d1c012'
UK_dataset_cache = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/.recstudio/cache/8d133ea55ad67bd3efd625dfeff0fb1d'

In [11]:
DE_train_dataset, DE_valid_dataset = _load_cache(DE_dataset_cache)
JP_train_dataset, JP_valid_dataset = _load_cache(JP_dataset_cache)
UK_train_dataset, UK_valid_dataset = _load_cache(UK_dataset_cache)
locale_map = {
    'DE' : DE_train_dataset.field2token2idx['product_id'], 
    'JP' : JP_train_dataset.field2token2idx['product_id'], 
    'UK' : UK_train_dataset.field2token2idx['product_id']
    }

In [12]:
merged_candidates = read_merged_candidates_feature()
valid_sessions = read_valid_sessions()
EMBED_DIM = 128
merged_candidates.sort_values(by=['sess_id', 'product'], inplace=True)
merged_candidates.reset_index(drop=True, inplace=True)

In [13]:
# sess embeddings 
valid_DE_query_emb = torch.load(DE_valid_embeddings_path, map_location='cpu')
valid_JP_query_emb = torch.load(JP_valid_embeddings_path, map_location='cpu')
valid_UK_query_emb = torch.load(UK_valid_embeddings_path, map_location='cpu')
valid_query_embeddings = torch.empty(len(valid_sessions), EMBED_DIM)
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'DE'].index).tolist()] = valid_DE_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'JP'].index).tolist()] = valid_JP_query_emb
valid_query_embeddings[(valid_sessions[valid_sessions['locale'] == 'UK'].index).tolist()] = valid_UK_query_emb

In [14]:
# product_embeddings 
DE_product_emb = torch.load(DE_product_embeddings_path, map_location='cpu')
JP_product_emb = torch.load(JP_product_embeddings_path, map_location='cpu')
UK_product_emb = torch.load(UK_product_embeddings_path, map_location='cpu')
product_embeddings = torch.cat([DE_product_emb, JP_product_emb, UK_product_emb], dim=0)

In [15]:
merged_candidates_sasrec = merged_candidates[['sess_id', 'sess_locale', 'product']].copy()

In [16]:
DE_product_list, DE_id_list = list(zip(*locale_map['DE'].items()))
JP_product_list, JP_id_list = list(zip(*locale_map['JP'].items()))
UK_product_list, UK_id_list = list(zip(*locale_map['UK'].items()))
product_list = list(DE_product_list) + list(JP_product_list) + list(UK_product_list)
id_list = list(DE_id_list) + list(JP_id_list) + list(UK_id_list)
locale_list = ['DE'] * len(DE_id_list) + ['JP'] * len(JP_id_list) + ['UK'] * len(UK_id_list)
product_id_df = pd.DataFrame({'locale' : locale_list, 'product' : product_list, 'dataset_id' : id_list})

In [17]:
# merged_candidates_sasrec_g = cudf.from_pandas(merged_candidates_sasrec)
# product_id_df_g = cudf.from_pandas(product_id_df)

In [18]:
# merged_candidates_sasrec_score_g = merged_candidates_sasrec_g.merge(product_id_df_g, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
# merged_candidates_sasrec_score_g['dataset_id'] = merged_candidates_sasrec_score_g['dataset_id'].fillna(0)
# merged_candidates_sasrec_score_g.drop(columns=['locale'], inplace=True)
# merged_candidates_sasrec_score_g = merged_candidates_sasrec_score_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_sasrec_score_g.reset_index(drop=True, inplace=True)
# merged_candidates_sasrec_score = merged_candidates_sasrec_score_g.to_pandas()
# assert len(merged_candidates_sasrec_score) == len(merged_candidates)

In [19]:
merged_candidates_sasrec_score = merged_candidates_sasrec.merge(product_id_df, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'product'])
merged_candidates_sasrec_score['dataset_id'] = merged_candidates_sasrec_score['dataset_id'].fillna(0)
merged_candidates_sasrec_score.drop(columns=['locale'], inplace=True)
merged_candidates_sasrec_score = merged_candidates_sasrec_score.sort_values(by=['sess_id', 'product'])
merged_candidates_sasrec_score.reset_index(drop=True, inplace=True)
assert len(merged_candidates_sasrec_score) == len(merged_candidates)

In [20]:
# del merged_candidates_sasrec_g
# del product_id_df_g
# del merged_candidates_sasrec_score_g

In [21]:
locale_offset = {'DE' : 0, 'JP' : len(DE_product_list), 'UK' : len(DE_product_list) + len(JP_product_list)}
for locale in ['DE', 'JP', 'UK']:
    merged_candidates_sasrec_score['dataset_id'][merged_candidates_sasrec_score['sess_locale'] == locale] = \
        merged_candidates_sasrec_score['dataset_id'][merged_candidates_sasrec_score['sess_locale'] == locale] + locale_offset[locale]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_sasrec_score['dataset_id'][merged_candidates_sasrec_score['sess_locale'] == locale] = \


In [22]:
valid_query_embeddings = valid_query_embeddings.to('cuda:0')
product_embeddings = product_embeddings.to('cuda:0')

In [23]:
merged_candidates_sasrec_score[FIELD_NAME] = get_scores(merged_candidates_sasrec_score, valid_query_embeddings, product_embeddings)

  batch_product_id = torch.tensor(batch_sess['dataset_id'].tolist(), dtype=torch.long, device=product_embeddings.device)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38498/38498 [01:55<00:00, 334.07it/s]


In [24]:
normalize_scores(merged_candidates_sasrec_score, FIELD_NAME, 'normalized_'+FIELD_NAME)

In [25]:
merged_candidates[FIELD_NAME] = merged_candidates_sasrec_score[FIELD_NAME]
merged_candidates['normalized_'+FIELD_NAME] = merged_candidates_sasrec_score['normalized_'+FIELD_NAME]

In [26]:
cast_dtype(merged_candidates)
merged_candidates.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [27]:
merged_candidates_sasrec_score

Unnamed: 0,sess_id,sess_locale,product,dataset_id,gru4rec_feat_scores_2,exp_score,normalized_gru4rec_feat_scores_2,score_sum
0,0,DE,355165591X,16294.0,6.238983,5.123373e+02,2.858669e-12,1.792223e+14
1,0,DE,3833237058,32078.0,13.559689,7.742798e+05,4.320219e-09,1.792223e+14
2,0,DE,B00CIXSI6U,284040.0,8.215778,3.698854e+03,2.063835e-11,1.792223e+14
3,0,DE,B00NVDOWUW,1746.0,10.070776,2.364190e+04,1.319138e-10,1.792223e+14
4,0,DE,B00NVDP3ZU,106496.0,16.270103,1.164169e+07,6.495671e-08,1.792223e+14
...,...,...,...,...,...,...,...,...
78842194,261815,UK,B0BCX524Y6,1162096.0,9.691351,1.617708e+04,3.847656e-04,4.204399e+07
78842195,261815,UK,B0BCX6QB4L,1026202.0,12.838280,3.763518e+05,8.951382e-03,4.204399e+07
78842196,261815,UK,B0BFPJYXQL,1055763.0,7.261014,1.423700e+03,3.386215e-05,4.204399e+07
78842197,261815,UK,B0BH3X67S3,966272.0,9.661869,1.570711e+04,3.735876e-04,4.204399e+07


In [28]:
# # verify sasrec scores
merged_candidates[merged_candidates['sess_id'] == 2000].sort_values(by=['sasrec_scores_3'], ascending=False)[['sess_id', 'sess_locale', 'product', 'sasrec_scores_2', 'normalized_'+FIELD_NAME, FIELD_NAME, 'normalized_sasrec_scores_3']].iloc[:20]

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,normalized_gru4rec_feat_scores_2,gru4rec_feat_scores_2,normalized_sasrec_scores_3
596023,2000,DE,B09D3N56DD,16.126688,0.377798,18.351198,0.429727
596028,2000,DE,B09D3P5LYN,14.356667,0.019606,15.392672,0.104548
596064,2000,DE,B09X7BK27V,13.219624,0.060374,16.517393,0.038561
596020,2000,DE,B09D3LP52K,12.823625,0.022163,15.515283,0.035548
595970,2000,DE,B08GYKNCCP,13.141228,0.071795,16.690653,0.03534
596026,2000,DE,B09D3NPHLP,13.011024,0.040851,16.126768,0.026086
596072,2000,DE,B09X7CRKRZ,12.562859,0.027223,15.720893,0.019405
595994,2000,DE,B08TJRVWV1,11.837065,0.004905,14.007093,0.018766
596068,2000,DE,B09X7C7LL1,12.590157,0.023576,15.577086,0.01707
596024,2000,DE,B09D3N8XBH,12.032446,0.012965,14.979074,0.01534
