In [4]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [5]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [6]:
def get_avg_price(session_df, product2price_dict):
    avg_price_list = []
    for i in tqdm(range(session_df.shape[0])):
        sess = session_df.iloc[i]
        locale = sess['locale']
        avg_price = 0.0
        prev_items = eval(sess['prev_items'].replace(' ', ','))
        for item in prev_items:
            avg_price += product2price_dict[item+'_'+locale]
        avg_price = avg_price / len(prev_items)
        avg_price_list.append(avg_price)
    return avg_price_list

In [7]:
def get_product_price(candidate_df : pd.DataFrame, product2price_dict):
    price_list = []
    for candidate in tqdm(candidate_df.itertuples(index=False), total=candidate_df.shape[0]):
        product = candidate.product
        locale = candidate.sess_locale
        if product+'_'+locale not in product2price_dict:
            if locale == 'DE': price_list.append(36.7616060903638)
            elif locale == 'JP': price_list.append(4201.2729840839065)
            elif locale == 'UK': price_list.append(22.097065056579634)
        else:
            price_list.append(product2price_dict[product+'_'+locale])
    return price_list

# Merge test price

In [8]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test_2_feature.parquet'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/processed_products_train.csv'

In [9]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

@lru_cache(maxsize=1)
def read_product_feature():
    return pd.read_csv(product_path)

In [10]:
test_sessions = read_test_sessions()
product_feature = read_product_feature()
merged_candidates_feature_test = read_merged_candidates_feature_test()
product_feature['price'] = np.exp(product_feature['price']) - 1

In [11]:
product2price = {}
for i in tqdm(range(product_feature.shape[0])):
    product = product_feature.iloc[i]
    k = product['id']+'_'+product['locale']
    v = product['price']
    product2price[k] = v

100%|██████████| 1551057/1551057 [02:27<00:00, 10528.02it/s]


In [12]:
avg_price_list = get_avg_price(test_sessions, product2price)
assert len(avg_price_list) == len(test_sessions)

100%|██████████| 316971/316971 [00:23<00:00, 13771.17it/s]


In [13]:
avg_price_list = np.array(avg_price_list)
sess_index = merged_candidates_feature_test['sess_id'].to_numpy()
avg_price_list = avg_price_list[sess_index]

In [14]:
merged_candidates_feature_test['sess_avg_price'] = avg_price_list

In [15]:
# merged_candidates_feature_test_g = cudf.from_pandas(merged_candidates_feature_test)
# product_price_feature_g = cudf.from_pandas(product_feature[['id', 'locale', 'price']])

In [16]:
product_price_feature = product_feature[['id', 'locale', 'price']]

In [17]:
merged_candidates_feature_test_price = merged_candidates_feature_test.merge(product_price_feature, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'id'])
merged_candidates_feature_test_price = merged_candidates_feature_test_price.sort_values(by=['sess_id', 'product'])
merged_candidates_feature_test_price.reset_index(drop=True, inplace=True)
merged_candidates_feature_test_price.rename(columns={'price' : 'product_price'}, inplace=True)
merged_candidates_feature_test_price.drop(columns=['id', 'locale'], inplace=True)

In [29]:
# merged_candidates_feature_test_price = merged_candidates_feature_test_price_g.to_pandas()

In [18]:
# fill nan price 
DE_price_NA = (merged_candidates_feature_test_price['sess_locale'] == 'DE') & merged_candidates_feature_test_price['product_price'].isna()
JP_price_NA = (merged_candidates_feature_test_price['sess_locale'] == 'JP') & merged_candidates_feature_test_price['product_price'].isna()
UK_price_NA = (merged_candidates_feature_test_price['sess_locale'] == 'UK') & merged_candidates_feature_test_price['product_price'].isna()

In [19]:
merged_candidates_feature_test_price.loc[merged_candidates_feature_test_price.index[DE_price_NA], 'product_price'] = 36.7616060903638
merged_candidates_feature_test_price.loc[merged_candidates_feature_test_price.index[JP_price_NA], 'product_price'] = 4201.2729840839065
merged_candidates_feature_test_price.loc[merged_candidates_feature_test_price.index[UK_price_NA], 'product_price'] = 22.097065056579634

In [34]:
cast_dtype(merged_candidates_feature_test_price)
merged_candidates_feature_test_price.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [20]:
merged_candidates_feature_test_price[merged_candidates_feature_test_price['sess_id'] == 3]

Unnamed: 0,sess_id,sess_locale,product,sess_avg_price,product_price
540,3,DE,1687728321,13.7225,14.80
541,3,DE,1720106894,13.7225,14.80
542,3,DE,3257230478,13.7225,9.00
543,3,DE,3473327328,13.7225,14.99
544,3,DE,3735850669,13.7225,16.00
...,...,...,...,...,...
728,3,DE,B0BHN57LC9,13.7225,9.95
729,3,DE,B0BJBYY1TX,13.7225,6.99
730,3,DE,B0BJNDCGQX,13.7225,7.99
731,3,DE,B0BJYJQS7T,13.7225,11.90


In [None]:
# del merged_candidates_feature_test_g
# del product_price_feature_g
# del merged_candidates_feature_test_price_g

In [21]:
merged_candidates_feature_test_price

Unnamed: 0,sess_id,sess_locale,product,sess_avg_price,product_price
0,0,DE,B000JG2RAG,25.195268,23.190000
1,0,DE,B000RYSOUW,25.195268,6.900000
2,0,DE,B000UGZVQM,25.195268,21.990000
3,0,DE,B000Z6JN7K,25.195268,13.170000
4,0,DE,B003CYK6FU,25.195268,11.990000
...,...,...,...,...,...
66438766,316970,UK,B0BJJMGPJ7,16.950000,7.990000
66438767,316970,UK,B0BJTQQWLG,16.950000,9.880000
66438768,316970,UK,B0BJV3RL4H,16.950000,22.097065
66438769,316970,UK,B0BK7SPC84,16.950000,5.960000


In [22]:
merged_candidates_feature_test

Unnamed: 0,sess_id,sess_locale,product,sess_avg_price
0,0,DE,B099NR3X6D,25.195268
1,0,DE,B0BHVPHVCB,25.195268
2,0,DE,B09WGLXRM5,25.195268
3,0,DE,B0733FZHH4,25.195268
4,0,DE,B0044XII3A,25.195268
...,...,...,...,...
66438766,316970,UK,B09DL4XV4M,16.950000
66438767,316970,UK,B09MLYQDMS,16.950000
66438768,316970,UK,B0BK7SPC84,16.950000
66438769,316970,UK,B0BHHSZQWR,16.950000
