In [1]:
import sys
sys.path = ['/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/RecStudio/'] + sys.path
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import torch
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def get_scores(merged_candidates_df, product_id_name, query_embeddings, product_embeddings):
    batch_size = 4096
    num_iter = (len(merged_candidates_df) - 1) // batch_size + 1
    score_list = []
    with torch.no_grad():
        for i in tqdm(range(num_iter)):
            st, ed = i * batch_size, (i + 1) * batch_size 
            batch_sess = merged_candidates_df.iloc[st : ed]
            batch_sess_id = torch.tensor(batch_sess['sess_id'].tolist(), dtype=torch.long, device=query_embeddings.device)
            batch_product_id = torch.tensor(batch_sess[product_id_name].tolist(), dtype=torch.long, device=product_embeddings.device)
            query_emb = query_embeddings[batch_sess_id]
            product_emb = product_embeddings[batch_product_id]
            batch_score = (query_emb * product_emb).sum(dim=-1) 
            score_list.append(batch_score.cpu())
        score_list = torch.cat(score_list, dim=0).cpu().tolist()
        return score_list 

In [4]:
def normalize_scores(score_df, score_name, normalized_score_name):
    # score_df_g = cudf.from_pandas(score_df)
    score_df['exp_score'] = np.exp(score_df[score_name].to_numpy())
    scores_sum = score_df[['sess_id', 'exp_score']].groupby('sess_id').sum()
    scores_sum.reset_index(inplace=True)
    scores_sum = scores_sum.sort_values(by=['sess_id'], ascending=True)
    scores_sum.reset_index(drop=True, inplace=True)
    scores_sum.rename(columns={'exp_score' : 'score_sum'}, inplace=True)

    merged_score_df = score_df.merge(scores_sum, how='left', left_on=['sess_id'], right_on=['sess_id'])
    merged_score_df = merged_score_df.sort_values(by=['sess_id', 'product'])
    merged_score_df.reset_index(drop=True, inplace=True)
    
    # merged_score_df = merged_score_df_g.to_pandas(merged_score_df_g)
    score_df[normalized_score_name] = merged_score_df['exp_score'] / merged_score_df['score_sum']
    score_df['exp_score'] = merged_score_df['exp_score']
    score_df['score_sum'] = merged_score_df['score_sum']

    # del scores_sum_g
    # del merged_score_df_g 

# Merge valid score

In [5]:
FIELD_NAME = 'roberta_scores'

In [6]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_2_feature.parquet'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [7]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

In [8]:
merged_candidates = read_merged_candidates_feature()
valid_sessions = read_valid_sessions()
product_data = read_product_data()

In [12]:
merged_candidates_product = merged_candidates[['sess_id', 'sess_locale', 'product']]
merged_candidates_product

Unnamed: 0,sess_id,sess_locale,product
0,0,UK,B000V599Y2
1,0,UK,B007VZUA7U
2,0,UK,B009EUAEQC
3,0,UK,B00AH02IWG
4,0,UK,B00I0UKKD4
...,...,...,...
77570148,361580,DE,B0BB7XV97M
77570149,361580,DE,B0BB7YSRBX
77570150,361580,DE,B0BB7ZMGY8
77570151,361580,DE,B0BD4CP7N3


In [13]:
product_index = product_data[['id', 'locale']]
product_index['product_index'] = product_index.index + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_index['product_index'] = product_index.index + 1


In [12]:
# merged_candidates_product_g = cudf.from_pandas(merged_candidates_product)
# product_index_g = cudf.from_pandas(product_index)

In [13]:
# merged_candidates_product_index_g = merged_candidates_product_g.merge(product_index_g, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'id'])
# merged_candidates_product_index_g = merged_candidates_product_index_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_product_index_g.reset_index(drop=True, inplace=True)
# assert len(merged_candidates_product_index_g) == len(merged_candidates_product_g)
# merged_candidates_product_index_g.drop(columns=['id', 'locale'], inplace=True)
# merged_candidates_product_index_g['product_index'] = merged_candidates_product_index_g['product_index'].fillna(0)
# merged_candidates_product_index = merged_candidates_product_index_g.to_pandas()

In [14]:
merged_candidates_product_index = merged_candidates_product.merge(product_index, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'id'])
merged_candidates_product_index = merged_candidates_product_index.sort_values(by=['sess_id', 'product'])
merged_candidates_product_index.reset_index(drop=True, inplace=True)
assert len(merged_candidates_product_index) == len(merged_candidates_product)
merged_candidates_product_index.drop(columns=['id', 'locale'], inplace=True)
merged_candidates_product_index['product_index'] = merged_candidates_product_index['product_index'].fillna(0)

In [34]:
# del merged_candidates_product_g
# del product_index_g
# del merged_candidates_product_index_g

In [15]:
roberta_product_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/roberta_results/results_epoch_4/item_reps/item.npy'
roberta_valid_embeddings_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/text_method/roberta_results/valid_results_epoch_4/valid_query_reps/query.npy'

In [16]:
roberta_product_embeddings = np.load(roberta_product_embeddings_path)
roberta_valid_embeddings = np.load(roberta_valid_embeddings_path)

In [17]:
roberta_product_embeddings = torch.from_numpy(roberta_product_embeddings)
roberta_valid_embeddings = torch.from_numpy(roberta_valid_embeddings)
roberta_product_embeddings = torch.cat([torch.tensor([[0.0] * roberta_product_embeddings.shape[1]]), roberta_product_embeddings], dim=0)

In [18]:
roberta_product_embeddings = roberta_product_embeddings.to('cuda:0')
roberta_valid_embeddings = roberta_valid_embeddings.to('cuda:0')

In [19]:
merged_candidates_product_index[FIELD_NAME] = get_scores(merged_candidates_product_index, 'product_index', roberta_valid_embeddings, roberta_product_embeddings)

  batch_product_id = torch.tensor(batch_sess[product_id_name].tolist(), dtype=torch.long, device=product_embeddings.device)
100%|██████████| 18939/18939 [01:55<00:00, 163.54it/s]


In [20]:
roberta_product_embeddings = roberta_product_embeddings.to('cpu')
roberta_valid_embeddings = roberta_valid_embeddings.to('cpu')

In [21]:
normalize_scores(merged_candidates_product_index, FIELD_NAME, 'normalized_'+FIELD_NAME)

In [22]:
assert len(merged_candidates) == len(merged_candidates_product_index)
merged_candidates[FIELD_NAME] = merged_candidates_product_index[FIELD_NAME]
merged_candidates['normalized_'+FIELD_NAME] = merged_candidates_product_index['normalized_'+FIELD_NAME]

In [57]:
cast_dtype(merged_candidates)
merged_candidates.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [23]:
roberta_product_embeddings

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0310,  0.2281, -0.1089,  ..., -0.5854,  0.1648, -0.1245],
        [ 0.0048,  0.3028, -0.1198,  ...,  0.1212,  0.1891,  0.1643],
        ...,
        [ 0.2472, -0.0760,  0.0638,  ..., -0.5656, -0.1147, -0.2556],
        [ 0.2095,  0.0716, -0.0547,  ..., -0.3876, -0.0677, -0.1837],
        [ 0.0653, -0.2431, -0.2078,  ..., -0.2018, -0.0851, -0.1529]])

In [24]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,target,sess_avg_price,product_price,product_freq,sasrec_scores_2,normalized_sasrec_scores_2,sasrec_scores_3,...,normalized_all_items_co_graph_count_1,all_items_co_graph_count_1,normalized_all_items_co_graph_count_2,all_items_co_graph_count_2,sasrec_feat_scores,normalized_sasrec_feat_scores,title_BM25_scores,desc_BM25_scores,roberta_scores,normalized_roberta_scores
0,0,UK,B000V599Y2,0.0,7.388571,5.200000,37.0,13.152878,7.433639e-04,10.677187,...,0.000000,0.000000,0.000000,0,11.217986,3.615180e-05,111.069756,64.592583,259.157867,1.341519e-06
1,0,UK,B007VZUA7U,0.0,7.388571,7.000000,36.0,9.393598,1.732076e-05,8.838863,...,0.000000,0.000000,0.000000,0,8.984396,3.873415e-06,130.196732,200.158524,257.981598,4.137609e-07
2,0,UK,B009EUAEQC,0.0,7.388571,7.490000,4.0,11.754339,1.835794e-04,10.670128,...,0.003793,1.033333,0.000000,0,12.906184,1.955713e-04,99.471718,30.517475,255.483337,3.402269e-08
3,0,UK,B00AH02IWG,0.0,7.388571,8.500000,3.0,12.194766,2.851667e-04,11.166204,...,0.004588,1.250000,0.006494,1,13.625712,4.015986e-04,2.061926,216.653198,255.024780,2.150898e-08
4,0,UK,B00I0UKKD4,0.0,7.388571,17.049999,118.0,11.835367,1.990737e-04,11.681271,...,0.006730,1.833333,0.006494,1,16.518963,7.249614e-03,611.139954,323.099518,267.615601,6.320386e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77570148,361580,DE,B0BB7XV97M,0.0,32.424000,47.990002,56.0,9.117821,6.076918e-05,9.635838,...,0.000000,0.000000,0.000000,0,8.051070,1.486059e-06,118.126396,0.000000,263.574158,1.367507e-03
77570149,361580,DE,B0BB7YSRBX,0.0,32.424000,43.990002,58.0,9.163816,6.362959e-05,9.159988,...,0.000000,0.000000,0.000000,0,8.144089,1.630923e-06,124.881615,0.000000,263.523743,1.300273e-03
77570150,361580,DE,B0BB7ZMGY8,0.0,32.424000,41.990002,452.0,11.256460,5.158017e-04,10.119755,...,0.010612,1.226190,0.000000,0,11.075905,3.059885e-05,124.881615,0.000000,263.567017,1.357776e-03
77570151,361580,DE,B0BD4CP7N3,0.0,32.424000,24.990000,1.0,-3.778687,1.523355e-10,-1.612869,...,0.000000,0.000000,0.000000,0,1.995993,3.486178e-09,192.540955,36.028561,265.401611,8.503204e-03


In [25]:
merged_candidates.query('sess_id==60').sort_values(by=['normalized_sasrec_scores_2'], ascending=False)[['product', 'normalized_sasrec_scores_2', 'roberta_scores', 'normalized_roberta_scores']][:15]

Unnamed: 0,product,normalized_sasrec_scores_2,roberta_scores,normalized_roberta_scores
13017,B0BFPJ2GYZ,0.820311,267.482422,0.025413
13021,B0BFPL2QTY,0.097068,267.883789,0.037963
12990,B0B1J924QR,0.024245,269.148499,0.134469
12836,B083VL9TWR,0.013671,265.113983,0.002379
13000,B0B5P2V47K,0.011131,262.555878,0.000184
13010,B0B9QZ585L,0.010925,265.838074,0.004908
13018,B0BFPJLPWX,0.003906,260.694,2.9e-05
12862,B08HSBN2DF,0.003064,263.648682,0.00055
13015,B0BFPHSNC7,0.002091,262.94754,0.000273
12986,B0B18FPXX4,0.001939,265.157318,0.002485


In [26]:
merged_candidates.query('sess_id==60').sort_values(by=['normalized_roberta_scores'], ascending=False)[['product', 'normalized_sasrec_scores_2', 'roberta_scores', 'normalized_roberta_scores']][:15]

Unnamed: 0,product,normalized_sasrec_scores_2,roberta_scores,normalized_roberta_scores
12990,B0B1J924QR,0.02424478,269.148499,0.134469
12902,B091B11BX4,2.369517e-05,267.949738,0.040551
13021,B0BFPL2QTY,0.09706778,267.883789,0.037963
13024,B0BHMRQQJX,7.253334e-08,267.776062,0.034086
13017,B0BFPJ2GYZ,0.8203114,267.482422,0.025413
12963,B09Q97NNRL,8.027714e-08,267.341003,0.022062
12951,B09L122SFQ,3.820757e-09,267.327057,0.021756
12964,B09Q97PW4F,3.851446e-06,267.32663,0.021747
12830,B07XY32W2Q,6.169603e-08,267.212158,0.019395
12941,B09C2FXWDV,9.323143e-08,267.05719,0.01661
