In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [2]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [3]:
def get_avg_price(session_df, product2price_dict):
    avg_price_list = []
    for i in tqdm(range(session_df.shape[0])):
        sess = session_df.iloc[i]
        locale = sess['locale']
        avg_price = 0.0
        prev_items = eval(sess['prev_items'].replace(' ', ','))
        for item in prev_items:
            avg_price += product2price_dict[item+'_'+locale]
        avg_price = avg_price / len(prev_items)
        avg_price_list.append(avg_price)
    return avg_price_list

In [4]:
def get_product_price(candidate_df : pd.DataFrame, product2price_dict):
    price_list = []
    for candidate in tqdm(candidate_df.itertuples(index=False), total=candidate_df.shape[0]):
        product = candidate.product
        locale = candidate.sess_locale
        if product+'_'+locale not in product2price_dict:
            if locale == 'DE': price_list.append(36.7616060903638)
            elif locale == 'JP': price_list.append(4201.2729840839065)
            elif locale == 'UK': price_list.append(22.097065056579634)
        else:
            price_list.append(product2price_dict[product+'_'+locale])
    return price_list

# Merge valid price 

In [10]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates_phase2/merged_candidates_150_feature.parquet'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'
product_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/processed_products_train.csv'

In [11]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_product_feature():
    return pd.read_csv(product_path)

In [12]:
valid_sessions = read_valid_sessions()
product_feature = read_product_feature()
merged_candidates_feature = read_merged_candidates_feature()
product_feature['price'] = np.exp(product_feature['price']) - 1

In [13]:
product2price = {}
for i in tqdm(range(product_feature.shape[0])):
    product = product_feature.iloc[i]
    k = product['id']+'_'+product['locale']
    v = product['price']
    product2price[k] = v

100%|██████████| 1551057/1551057 [03:41<00:00, 7012.30it/s]


In [14]:
avg_price_list = get_avg_price(valid_sessions, product2price)
assert len(avg_price_list) == len(valid_sessions)

100%|██████████| 261816/261816 [00:28<00:00, 9060.97it/s] 


In [15]:
avg_price_list = np.array(avg_price_list)
sess_index = merged_candidates_feature['sess_id'].to_numpy()
avg_price_list = avg_price_list[sess_index]

In [16]:
merged_candidates_feature['sess_avg_price'] = avg_price_list

In [17]:
product_price_list = get_product_price(merged_candidates_feature, product2price)
assert len(product_price_list) == len(merged_candidates_feature)

100%|██████████| 78842199/78842199 [04:24<00:00, 298195.06it/s]


In [18]:
merged_candidates_feature['product_price'] = product_price_list

In [19]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [20]:
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,target,sess_avg_price,product_price
0,0,DE,B09BNW4F85,0.0,43.256542,35.990002
1,0,DE,B01J41G4SC,0.0,43.256542,49.990002
2,0,DE,B07FP2KPWV,0.0,43.256542,39.000000
3,0,DE,B09QFM2945,0.0,43.256542,28.990000
4,0,DE,B09QG4M23V,0.0,43.256542,31.990000
...,...,...,...,...,...,...
78842194,261815,UK,B07CF56HFY,0.0,9.383333,9.990000
78842195,261815,UK,B002SPGQV2,0.0,9.383333,5.410000
78842196,261815,UK,B07YQFYH54,0.0,9.383333,9.790000
78842197,261815,UK,B09N3QGQ2M,0.0,9.383333,22.097065
