In [22]:
import numpy as np
import pandas as pd 
from tqdm import tqdm 
from functools import lru_cache
import os 

In [23]:
# Data locations used by every cell below.
# The defaults are the original workspace paths; set KDDCUP23_RECSTUDIO_DATA_DIR /
# KDDCUP23_RAW_DATA_DIR in the environment to run the notebook on another machine
# without editing this cell.
recstudio_data_dir = os.environ.get(
    'KDDCUP23_RECSTUDIO_DATA_DIR',
    '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio',
)
raw_data_dir = os.environ.get(
    'KDDCUP23_RAW_DATA_DIR',
    '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data',
)

In [24]:
@lru_cache(maxsize=1)
def read_train_sessions():
    """Load ``all_task_1_train_sessions.csv`` from ``recstudio_data_dir``.

    The result is memoised by ``lru_cache``: repeated calls return the same
    DataFrame object instead of re-reading the file.
    """
    csv_path = os.path.join(recstudio_data_dir, 'all_task_1_train_sessions.csv')
    train_sessions = pd.read_csv(csv_path)
    return train_sessions

@lru_cache(maxsize=1)
def read_valid_sessions():
    """Load ``all_task_1_valid_sessions.csv`` from ``recstudio_data_dir``.

    The result is memoised by ``lru_cache``: repeated calls return the same
    DataFrame object instead of re-reading the file.
    """
    csv_path = os.path.join(recstudio_data_dir, 'all_task_1_valid_sessions.csv')
    valid_sessions = pd.read_csv(csv_path)
    return valid_sessions

@lru_cache(maxsize=1)
def read_product_data():
    """Load ``products_train.csv`` from ``raw_data_dir``.

    The result is memoised by ``lru_cache``: repeated calls return the same
    DataFrame object instead of re-reading the file.
    """
    csv_path = os.path.join(raw_data_dir, 'products_train.csv')
    product_data = pd.read_csv(csv_path)
    return product_data

In [25]:
def _parse_prev_items(raw_prev_items):
    """Return the product ids held in a stringified array like ``"['A' 'B' 'C']"``.

    Tokens may be separated by any amount of whitespace (including the line
    breaks that appear in long arrays); an empty array yields an empty list.
    """
    tokens = raw_prev_items.strip().strip('[]').split()
    return [token.strip("'") for token in tokens]


def transform_to_inter_feat(sess_df, save_file):
    """Flatten session rows into an interaction CSV under ``recstudio_data_dir``.

    Every session is expanded into one output line per product, written as
    ``sess_id,product_id,timestamp,locale``:

    * ``sess_id`` is the 0-based position of the session inside ``sess_df``
      (not its index label), so ids restart at 0 on every call.
    * ``timestamp`` is the 0-based position of the product inside the session.

    Args:
        sess_df: DataFrame with a ``locale`` column and a ``prev_items`` column
            holding the string form of an array of quoted product ids. When a
            ``next_item`` column is present, its value is appended as the final
            interaction of each session.
        save_file: Name of the output file, joined onto ``recstudio_data_dir``.
    """
    num_sess = len(sess_df)
    # Column presence cannot change while iterating, so check it once.
    has_next_item = 'next_item' in sess_df

    # Pull the columns out once; ``sess_df.iloc[i]`` would build a Series per row.
    locales = sess_df['locale'].tolist()
    prev_items = sess_df['prev_items'].tolist()
    next_items = sess_df['next_item'].tolist() if has_next_item else None

    with open(os.path.join(recstudio_data_dir, save_file), 'w') as f:
        f.write('sess_id,product_id,timestamp,locale\n')

        for sess_id in tqdm(range(num_sess)):
            sess_locale = locales[sess_id]
            product_list = _parse_prev_items(prev_items[sess_id])
            if has_next_item:
                product_list.append(next_items[sess_id])

            f.writelines(
                f'{sess_id},{product_id},{j},{sess_locale}\n'
                for j, product_id in enumerate(product_list)
            )

In [26]:
def split_single_locale(all_sessions, locale_name):
    """Return the rows of ``all_sessions`` whose ``locale`` equals ``locale_name``.

    Args:
        all_sessions: DataFrame containing a ``locale`` column.
        locale_name: Locale value to keep, e.g. ``'DE'``.

    Returns:
        The filtered DataFrame; the original index labels are preserved.
    """
    is_requested_locale = all_sessions['locale'] == locale_name
    return all_sessions[is_requested_locale]

In [27]:
# Read the task-1 training sessions (memoised inside read_train_sessions)
# and preview the first two rows.
all_task_1_train_sessions = read_train_sessions()
all_task_1_train_sessions.head(2)

Unnamed: 0,prev_items,next_item,locale
0,['B005ZJTUXE' 'B005ZJTUXE' 'B00P8VIBBG'],B07TVSL9TW,FR
1,['B09M8HSN22' 'B09MTKZNB2' 'B07XWK3G8K' 'B09H7...,B01J5EEEQW,DE


In [13]:
# DE-locale training sessions -> per-interaction CSV.
DE_train_sessions = split_single_locale(
    all_sessions=all_task_1_train_sessions,
    locale_name='DE',
)
transform_to_inter_feat(
    sess_df=DE_train_sessions,
    save_file='DE_train_inter_feat.csv',
)

100%|██████████| 1103803/1103803 [00:43<00:00, 25321.51it/s]


In [14]:
# JP-locale training sessions -> per-interaction CSV (row count printed first).
JP_train_sessions = split_single_locale(
    all_sessions=all_task_1_train_sessions,
    locale_name='JP',
)
print(len(JP_train_sessions))
transform_to_inter_feat(
    sess_df=JP_train_sessions,
    save_file='JP_train_inter_feat.csv',
)

976851


100%|██████████| 976851/976851 [00:38<00:00, 25135.97it/s]


In [15]:
# UK-locale training sessions -> per-interaction CSV (row count printed first).
UK_train_sessions = split_single_locale(
    all_sessions=all_task_1_train_sessions,
    locale_name='UK',
)
print(len(UK_train_sessions))
transform_to_inter_feat(
    sess_df=UK_train_sessions,
    save_file='UK_train_inter_feat.csv',
)

1177452


100%|██████████| 1177452/1177452 [00:46<00:00, 25521.07it/s]


In [17]:
# Read the sessions stored in all_task_1_valid_sessions.csv (memoised inside
# read_valid_sessions) and show how many rows were loaded.
all_task_1_valid_sessions = read_valid_sessions()
len(all_task_1_valid_sessions)

361581

In [18]:
# Export one interaction file per locale from the valid sessions, in DE -> JP -> UK order.
valid_exports = (
    ('DE', 'DE_valid_inter_feat.csv'),
    ('JP', 'JP_valid_inter_feat.csv'),
    ('UK', 'UK_valid_inter_feat.csv'),
)
valid_sessions_by_locale = {}
for locale_name, inter_file in valid_exports:
    valid_sessions_by_locale[locale_name] = split_single_locale(
        all_task_1_valid_sessions, locale_name
    )
    transform_to_inter_feat(valid_sessions_by_locale[locale_name], inter_file)

# Keep the per-locale frames available under their original names.
DE_valid_sessions = valid_sessions_by_locale['DE']
JP_valid_sessions = valid_sessions_by_locale['JP']
UK_valid_sessions = valid_sessions_by_locale['UK']

100%|██████████| 122181/122181 [00:04<00:00, 25352.06it/s]
100%|██████████| 108735/108735 [00:04<00:00, 25332.73it/s]
100%|██████████| 130665/130665 [00:05<00:00, 25941.79it/s]


In [28]:
# Read products_train.csv (memoised inside read_product_data) and show the
# number of product rows.
all_product_data = read_product_data()
len(all_product_data)

1551057

In [29]:
# DE-locale slice of the product table, saved without the DataFrame index.
DE_product_data = split_single_locale(all_product_data, 'DE')
DE_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'DE_product_train.csv'),
    index=False,
)

In [30]:
# JP-locale slice of the product table, saved without the DataFrame index.
JP_product_data = split_single_locale(all_product_data, 'JP')
JP_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'JP_product_train.csv'),
    index=False,
)

In [31]:
# UK-locale slice of the product table, saved without the DataFrame index.
UK_product_data = split_single_locale(all_product_data, 'UK')
UK_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'UK_product_train.csv'),
    index=False,
)

In [32]:
# Preview the first two DE product rows to check the available columns.
DE_product_data.head(2)

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...


In [33]:
# Reduce the DE product table to its id/locale columns and save it without the index.
id_DE_product_data = DE_product_data.loc[:, ['id', 'locale']]
id_DE_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'id_DE_product_train.csv'),
    index=False,
)

In [34]:
# Reduce the JP product table to its id/locale columns and save it without the index.
id_JP_product_data = JP_product_data.loc[:, ['id', 'locale']]
id_JP_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'id_JP_product_train.csv'),
    index=False,
)

In [35]:
# Reduce the UK product table to its id/locale columns and save it without the index.
id_UK_product_data = UK_product_data.loc[:, ['id', 'locale']]
id_UK_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'id_UK_product_train.csv'),
    index=False,
)