In [1]:
import sys
sys.path = ['/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/RecStudio/'] + sys.path
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import torch
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def get_scores(merged_candidates_df, product_id_name, query_embeddings, product_embeddings):
    batch_size = 10000
    num_iter = (len(merged_candidates_df) - 1) // batch_size + 1
    score_list = []
    with torch.no_grad():
        for i in tqdm(range(num_iter)):
            st, ed = i * batch_size, (i + 1) * batch_size 
            batch_sess = merged_candidates_df.iloc[st : ed]
            batch_sess_id = torch.tensor(batch_sess['sess_id'].tolist(), dtype=torch.long, device=query_embeddings.device)
            batch_product_id = torch.tensor(batch_sess[product_id_name].tolist(), dtype=torch.long, device=product_embeddings.device)
            query_emb = query_embeddings[batch_sess_id]
            product_emb = product_embeddings[batch_product_id]
            batch_score = (query_emb * product_emb).sum(dim=-1) 
            score_list.append(batch_score.cpu())
        score_list = torch.cat(score_list, dim=0).cpu().tolist()
        return score_list 

In [4]:
def normalize_scores(score_df, score_name, normalized_score_name):
    # score_df_g = cudf.from_pandas(score_df)
    score_df['exp_score'] = np.exp(score_df[score_name].to_numpy())
    scores_sum = score_df[['sess_id', 'exp_score']].groupby('sess_id').sum()
    scores_sum.reset_index(inplace=True)
    scores_sum = scores_sum.sort_values(by=['sess_id'], ascending=True)
    scores_sum.reset_index(drop=True, inplace=True)
    scores_sum.rename(columns={'exp_score' : 'score_sum'}, inplace=True)

    merged_score_df = score_df.merge(scores_sum, how='left', left_on=['sess_id'], right_on=['sess_id'])
    merged_score_df = merged_score_df.sort_values(by=['sess_id', 'product'])
    merged_score_df.reset_index(drop=True, inplace=True)
    
    # merged_score_df = merged_score_df_g.to_pandas(merged_score_df_g)
    score_df[normalized_score_name] = merged_score_df['exp_score'] / merged_score_df['score_sum']
    score_df['exp_score'] = merged_score_df['exp_score']
    score_df['score_sum'] = merged_score_df['score_sum']

    # del scores_sum_g
    # del merged_score_df_g 

# Merge test score

In [None]:
FIELD_NAME = 'roberta_scores'

In [6]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates_phase2/merged_candidates_150_test_feature.parquet'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1_phase2.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [8]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

In [9]:
merged_candidates_test = read_merged_candidates_feature_test()
test_sessions = read_test_sessions()
product_data = read_product_data()

In [10]:
merged_candidates_product = merged_candidates_test[['sess_id', 'sess_locale', 'product']]
merged_candidates_product

Unnamed: 0,sess_id,sess_locale,product
0,0,DE,4088833651
1,0,DE,B000H6W2GW
2,0,DE,B000JG2RAG
3,0,DE,B000RYSOUW
4,0,DE,B000UGZVQM
...,...,...,...
69428426,316970,UK,B0BJCTH4NH
69428427,316970,UK,B0BJTQQWLG
69428428,316970,UK,B0BJV3RL4H
69428429,316970,UK,B0BK7SPC84


In [11]:
product_index = product_data[['id', 'locale']]
product_index['product_index'] = product_index.index + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_index['product_index'] = product_index.index + 1


In [13]:
# merged_candidates_product_g = cudf.from_pandas(merged_candidates_product)
# product_index_g = cudf.from_pandas(product_index)

In [14]:
# merged_candidates_product_index_g = merged_candidates_product_g.merge(product_index_g, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'id'])
# merged_candidates_product_index_g = merged_candidates_product_index_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_product_index_g.reset_index(drop=True, inplace=True)
# assert len(merged_candidates_product_index_g) == len(merged_candidates_product_g)
# merged_candidates_product_index_g.drop(columns=['id', 'locale'], inplace=True)
# merged_candidates_product_index_g['product_index'] = merged_candidates_product_index_g['product_index'].fillna(0)
# merged_candidates_product_index = merged_candidates_product_index_g.to_pandas()

In [None]:
merged_candidates_product_index = merged_candidates_product.merge(product_index, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'id'])
merged_candidates_product_index = merged_candidates_product_index.sort_values(by=['sess_id', 'product'])
merged_candidates_product_index.reset_index(drop=True, inplace=True)
assert len(merged_candidates_product_index) == len(merged_candidates_product)
merged_candidates_product_index.drop(columns=['id', 'locale'], inplace=True)
merged_candidates_product_index['product_index'] = merged_candidates_product_index['product_index'].fillna(0)

In [15]:
# del merged_candidates_product_g
# del product_index_g
# del merged_candidates_product_index_g

In [16]:
roberta_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_xlm_roberta_results/results/item_reps/item.npy'
roberta_test_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/phase2_task1_xlm_roberta_results/test_results/test_query_reps/query.npy'

In [18]:
roberta_product_embeddings = np.load(roberta_product_embeddings_path)
roberta_test_embeddings = np.load(roberta_test_embeddings_path)

In [45]:
roberta_test_embeddings.shape, roberta_product_embeddings.shape

(torch.Size([316971, 768]), torch.Size([1551058, 768]))

In [19]:
roberta_product_embeddings = torch.from_numpy(roberta_product_embeddings)
roberta_test_embeddings = torch.from_numpy(roberta_test_embeddings)
roberta_product_embeddings = torch.cat([torch.tensor([[0.0] * roberta_product_embeddings.shape[1]]), roberta_product_embeddings], dim=0)

In [None]:
roberta_test_embeddings = roberta_test_embeddings.to('cuda:7')
roberta_product_embeddings = roberta_product_embeddings.to('cuda:7')

In [21]:
merged_candidates_product_index[FIELD_NAME] = get_scores(merged_candidates_product_index, 'product_index', roberta_test_embeddings, roberta_product_embeddings)

100%|██████████| 16951/16951 [01:54<00:00, 147.60it/s]


In [24]:
normalize_scores(merged_candidates_product_index, FIELD_NAME, 'normalized_'+FIELD_NAME)

In [27]:
assert len(merged_candidates_test) == len(merged_candidates_product_index)
merged_candidates_test[FIELD_NAME] = merged_candidates_product_index[FIELD_NAME]
merged_candidates_test['normalized_'+FIELD_NAME] = merged_candidates_product_index['normalized_'+FIELD_NAME]

In [None]:
cast_dtype(merged_candidates_test, [FIELD_NAME, 'normalized_'+FIELD_NAME])
merged_candidates_test.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [28]:
merged_candidates_test

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq,sess_avg_price,product_price,gru4rec_scores_2,gru4rec_normalized_scores_2,co_graph_counts_0,normalized_co_graph_counts_0,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,roberta_scores,roberta_normalized_scores
0,0,DE,4088833651,0.000000,2.975813e-09,0.000000,1.580065e-09,828,25.195269,36.761604,0.000000,1.326730e-09,0,0.0,0.0,0.0,0,0.0,0.000000,0.0
1,0,DE,B000H6W2GW,0.000000,2.975813e-09,0.000000,1.580065e-09,875,25.195269,36.761604,0.000000,1.326730e-09,0,0.0,0.0,0.0,0,0.0,0.000000,0.0
2,0,DE,B000JG2RAG,7.665308,6.347557e-06,8.104032,5.226502e-06,24,25.195269,23.190001,11.372551,1.152972e-04,0,0.0,0.0,0.0,0,0.0,267.192719,0.004943
3,0,DE,B000RYSOUW,-2.951060,1.555882e-10,-2.857798,9.068785e-11,5,25.195269,6.900000,-2.205641,1.461790e-10,0,0.0,0.0,0.0,0,0.0,267.322815,0.005629
4,0,DE,B000UGZVQM,3.977920,1.589257e-07,4.688567,1.717488e-07,4,25.195269,21.990000,8.559400,6.919625e-06,0,0.0,0.0,0.0,0,0.0,267.242462,0.005195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,11.327528,1.041200e-04,10.629994,3.818184e-04,74,16.950001,5.800000,11.301320,2.638649e-04,0,0.0,0.0,0.0,0,0.0,270.043762,0.014921
69428427,316970,UK,B0BJTQQWLG,5.604142,3.403292e-07,6.052083,3.923694e-06,6,16.950001,9.880000,8.246040,1.243056e-05,0,0.0,0.0,0.0,0,0.0,269.350769,0.007462
69428428,316970,UK,B0BJV3RL4H,9.146974,1.176336e-05,7.667603,1.973815e-05,7,16.950001,22.097065,9.860847,6.248733e-05,0,0.0,0.0,0.0,0,0.0,269.313751,0.007191
69428429,316970,UK,B0BK7SPC84,-10.383047,3.879279e-14,-6.356799,1.601719e-11,0,16.950001,5.960000,-7.227418,2.368389e-12,0,0.0,0.0,0.0,0,0.0,270.200653,0.017456


In [88]:
merged_candidates_test.query('sess_id==300110').sort_values(by=['roberta_scores'], ascending=False)[['sess_locale', 'product', 'normalized_sasrec_scores_2', 'roberta_scores', 'normalized_roberta_scores']][:30]

Unnamed: 0,sess_locale,product,sasrec_normalized_scores_2,roberta_scores,roberta_normalized_scores
65700003,UK,B076HQTX5S,0.02021244,266.844208,0.138655
65700012,UK,B076HS7LDQ,0.02786299,266.776978,0.12964
65700010,UK,B076HRHP8R,2.011223e-07,266.75351,0.126633
65700005,UK,B076HQXB8X,0.925384,266.709839,0.121221
65700006,UK,B076HQZQ8L,0.006359186,266.664795,0.115882
65699999,UK,B076HPV3FC,0.01194097,266.515411,0.099802
65699998,UK,B076HMSJ74,0.003603664,266.383545,0.087473
65700007,UK,B076HR26F7,2.702519e-11,264.602417,0.014735
65699968,UK,B001O506EK,4.276145e-11,264.305908,0.010954
65699970,UK,B001O5259Y,1.448184e-12,263.800964,0.006611
