In [1]:
import numpy as np
import pandas as pd 
from tqdm import tqdm 
from functools import lru_cache
import os 

In [2]:
# Workspace root holding both the raw competition files and the
# RecStudio-formatted data. Defaults to the original training machine's
# location; set the KDDCUP23_WORKSPACE environment variable to run elsewhere.
WORKSPACE_ROOT = os.environ.get(
    'KDDCUP23_WORKSPACE',
    '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23',
)

# RecStudio-formatted data: session/interaction CSVs read and written below.
recstudio_data_dir = os.path.join(WORKSPACE_ROOT, 'data_for_recstudio')
# Raw competition files (products_train.csv is read from here).
raw_data_dir = os.path.join(WORKSPACE_ROOT, 'raw_data')

In [3]:
@lru_cache(maxsize=1)
def read_train_sessions():
    """Load the task-1 training sessions CSV from ``recstudio_data_dir``.

    The result is memoized, so the file is parsed once and every later call
    returns the same DataFrame object.
    """
    csv_path = os.path.join(recstudio_data_dir, 'task1_data/task13_4_task1_train_sessions.csv')
    train_sessions = pd.read_csv(csv_path)
    return train_sessions

@lru_cache(maxsize=1)
def read_valid_sessions():
    """Load the task-1 validation sessions CSV from ``recstudio_data_dir``.

    The result is memoized, so the file is parsed once and every later call
    returns the same DataFrame object.
    """
    csv_path = os.path.join(recstudio_data_dir, 'task1_data/task13_4_task1_valid_sessions.csv')
    valid_sessions = pd.read_csv(csv_path)
    return valid_sessions

@lru_cache(maxsize=1)
def read_product_data():
    """Load the product catalogue (``products_train.csv``) from ``raw_data_dir``.

    The result is memoized, so the file is parsed once and every later call
    returns the same DataFrame object.
    """
    csv_path = os.path.join(raw_data_dir, 'products_train.csv')
    products = pd.read_csv(csv_path)
    return products

In [7]:
def transform_to_inter_feat(sess_df, save_file):
    """Flatten a sessions frame into a one-interaction-per-line CSV.

    Each session row is expanded into one output line per product, written
    under the header ``sess_id,product_id,timestamp,locale``:

    * ``sess_id`` is the row's 0-based position within ``sess_df`` (not its
      pandas index label), so ids restart at 0 for every file written.
    * ``timestamp`` is the product's 0-based position inside the session.
    * When ``sess_df`` has a ``next_item`` column, that item is appended as
      the session's last interaction.

    Args:
        sess_df: DataFrame with ``locale`` and ``prev_items`` columns and an
            optional ``next_item`` column. ``prev_items`` is the string form
            of an array of quoted ids, e.g. ``"['B0A' 'B0B']"``.
        save_file: Output path, relative to ``recstudio_data_dir``. A missing
            parent directory is created.
    """
    out_path = os.path.join(recstudio_data_dir, save_file)
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Decide once whether targets exist instead of re-testing on every row.
    has_next_item = 'next_item' in sess_df

    # Pull the needed columns out as plain lists: ``sess_df.iloc[i]`` builds a
    # new Series for every session, which is needless work in this loop.
    locales = sess_df['locale'].tolist()
    prev_items_col = sess_df['prev_items'].tolist()
    next_items = sess_df['next_item'].tolist() if has_next_item else None

    with open(out_path, 'w') as f:
        f.write('sess_id,product_id,timestamp,locale\n')

        for sess_id in tqdm(range(len(sess_df))):
            # split() without a separator splits on any run of whitespace
            # (spaces or the line breaks of a wrapped array string) and yields
            # no tokens for an empty "[]", so no blank product ids are written.
            raw_tokens = prev_items_col[sess_id].strip('[]').split()
            product_list = [token.strip("'") for token in raw_tokens]
            if has_next_item:
                product_list.append(next_items[sess_id])

            sess_locale = locales[sess_id]
            f.writelines(
                f'{sess_id},{product_id},{j},{sess_locale}\n'
                for j, product_id in enumerate(product_list)
            )

In [5]:
def split_single_locale(all_sessions, locale_name):
    """Return the rows of ``all_sessions`` whose ``locale`` equals ``locale_name``.

    The selected rows keep their original index labels.
    """
    is_target_locale = all_sessions['locale'].eq(locale_name)
    return all_sessions[is_target_locale]

In [7]:
# Load all task-1 training sessions and preview the first two rows.
all_task_1_train_sessions = read_train_sessions()
all_task_1_train_sessions.iloc[:2]

Unnamed: 0,prev_items,next_item,locale
0,['B005ZJTUXE' 'B005ZJTUXE' 'B00P8VIBBG'],B07TVSL9TW,FR
1,['B09M8HSN22' 'B09MTKZNB2' 'B07XWK3G8K' 'B09H7...,B01J5EEEQW,DE


In [8]:
# Distinct locale codes present in the training sessions.
all_task_1_train_sessions.loc[:, 'locale'].unique()

array(['FR', 'DE', 'JP', 'UK', 'ES', 'IT'], dtype=object)

In [12]:
# Save the UK / DE / JP training sessions into their per-locale folders.
# Paths are anchored at recstudio_data_dir rather than the kernel's working
# directory, so the cell works wherever the notebook is started from.
UK_train_sessions = split_single_locale(all_task_1_train_sessions, 'UK')
UK_train_sessions.to_csv(os.path.join(recstudio_data_dir, 'UK_data', 'UK_train_sessions.csv'), index=False)

DE_train_sessions = split_single_locale(all_task_1_train_sessions, 'DE')
DE_train_sessions.to_csv(os.path.join(recstudio_data_dir, 'DE_data', 'DE_train_sessions.csv'), index=False)

JP_train_sessions = split_single_locale(all_task_1_train_sessions, 'JP')
JP_train_sessions.to_csv(os.path.join(recstudio_data_dir, 'JP_data', 'JP_train_sessions.csv'), index=False)

In [13]:
# DE training sessions -> interaction file (one line per session item).
DE_train_sessions = split_single_locale(all_sessions=all_task_1_train_sessions, locale_name='DE')
transform_to_inter_feat(sess_df=DE_train_sessions, save_file='DE_train_inter_feat.csv')

100%|██████████| 1103803/1103803 [00:43<00:00, 25321.51it/s]


In [14]:
# JP training sessions -> interaction file; print the session count first.
JP_train_sessions = split_single_locale(all_sessions=all_task_1_train_sessions, locale_name='JP')
print(JP_train_sessions.shape[0])
transform_to_inter_feat(sess_df=JP_train_sessions, save_file='JP_train_inter_feat.csv')

976851


100%|██████████| 976851/976851 [00:38<00:00, 25135.97it/s]


In [15]:
# UK training sessions -> interaction file; print the session count first.
UK_train_sessions = split_single_locale(all_sessions=all_task_1_train_sessions, locale_name='UK')
print(UK_train_sessions.shape[0])
transform_to_inter_feat(sess_df=UK_train_sessions, save_file='UK_train_inter_feat.csv')

1177452


100%|██████████| 1177452/1177452 [00:46<00:00, 25521.07it/s]


In [15]:
# IT training sessions -> interaction file under IT_data/; print the session count first.
IT_train_sessions = split_single_locale(all_sessions=all_task_1_train_sessions, locale_name='IT')
print(IT_train_sessions.shape[0])
transform_to_inter_feat(sess_df=IT_train_sessions, save_file='IT_data/IT_train_inter_feat.csv')

114115


100%|██████████| 114115/114115 [00:04<00:00, 25700.24it/s]


In [16]:
# ES training sessions -> interaction file under ES_data/; print the session count first.
ES_train_sessions = split_single_locale(all_sessions=all_task_1_train_sessions, locale_name='ES')
print(ES_train_sessions.shape[0])
transform_to_inter_feat(sess_df=ES_train_sessions, save_file='ES_data/ES_train_inter_feat.csv')

80093


100%|██████████| 80093/80093 [00:03<00:00, 25271.48it/s]


In [11]:
# FR training sessions -> interaction file; print the session count first.
# NOTE(review): this file is written to the data root, whereas the FR
# validation interactions are written under FR_data/ - confirm which layout
# downstream code expects.
FR_train_sessions = split_single_locale(all_sessions=all_task_1_train_sessions, locale_name='FR')
print(FR_train_sessions.shape[0])
transform_to_inter_feat(sess_df=FR_train_sessions, save_file='FR_train_inter_feat.csv')

105584


100%|██████████| 105584/105584 [00:04<00:00, 25670.94it/s]


In [14]:
# Load all task-1 validation sessions and show how many there are.
all_task_1_valid_sessions = read_valid_sessions()
all_task_1_valid_sessions.shape[0]

361581

In [15]:
# Save the UK / DE / JP validation sessions into their per-locale folders.
# Paths are anchored at recstudio_data_dir rather than the kernel's working
# directory, so the cell works wherever the notebook is started from.
UK_valid_sessions = split_single_locale(all_task_1_valid_sessions, 'UK')
UK_valid_sessions.to_csv(os.path.join(recstudio_data_dir, 'UK_data', 'UK_valid_sessions.csv'), index=False)

DE_valid_sessions = split_single_locale(all_task_1_valid_sessions, 'DE')
DE_valid_sessions.to_csv(os.path.join(recstudio_data_dir, 'DE_data', 'DE_valid_sessions.csv'), index=False)

JP_valid_sessions = split_single_locale(all_task_1_valid_sessions, 'JP')
JP_valid_sessions.to_csv(os.path.join(recstudio_data_dir, 'JP_data', 'JP_valid_sessions.csv'), index=False)

In [18]:
# Validation sessions -> interaction files for the three task-1 locales.
DE_valid_sessions = split_single_locale(all_sessions=all_task_1_valid_sessions, locale_name='DE')
transform_to_inter_feat(sess_df=DE_valid_sessions, save_file='DE_valid_inter_feat.csv')

JP_valid_sessions = split_single_locale(all_sessions=all_task_1_valid_sessions, locale_name='JP')
transform_to_inter_feat(sess_df=JP_valid_sessions, save_file='JP_valid_inter_feat.csv')

UK_valid_sessions = split_single_locale(all_sessions=all_task_1_valid_sessions, locale_name='UK')
transform_to_inter_feat(sess_df=UK_valid_sessions, save_file='UK_valid_inter_feat.csv')

100%|██████████| 122181/122181 [00:04<00:00, 25352.06it/s]
100%|██████████| 108735/108735 [00:04<00:00, 25332.73it/s]
100%|██████████| 130665/130665 [00:05<00:00, 25941.79it/s]


In [17]:
# Load the validation sessions covering every locale. The path is anchored at
# recstudio_data_dir so the read does not depend on the working directory.
task13_4_all_valid_sessions = pd.read_csv(os.path.join(recstudio_data_dir, 'task13_4_all_valid_sessions.csv'))
len(task13_4_all_valid_sessions)

395322

In [21]:
# Keep only the validation sessions from the FR / IT / ES locales.
task13_4_task2_valid_sessions = task13_4_all_valid_sessions.loc[
    task13_4_all_valid_sessions['locale'].isin(['FR', 'IT', 'ES'])
]
task13_4_task2_valid_sessions.shape[0]

33741

In [23]:
# Validation sessions -> interaction files for IT / ES / FR, each under its own folder.
IT_valid_sessions = split_single_locale(all_sessions=task13_4_task2_valid_sessions, locale_name='IT')
transform_to_inter_feat(sess_df=IT_valid_sessions, save_file='IT_data/IT_valid_inter_feat.csv')

ES_valid_sessions = split_single_locale(all_sessions=task13_4_task2_valid_sessions, locale_name='ES')
transform_to_inter_feat(sess_df=ES_valid_sessions, save_file='ES_data/ES_valid_inter_feat.csv')

FR_valid_sessions = split_single_locale(all_sessions=task13_4_task2_valid_sessions, locale_name='FR')
transform_to_inter_feat(sess_df=FR_valid_sessions, save_file='FR_data/FR_valid_inter_feat.csv')

100%|██████████| 12810/12810 [00:00<00:00, 24989.90it/s]
100%|██████████| 8954/8954 [00:00<00:00, 24732.72it/s]
100%|██████████| 11977/11977 [00:00<00:00, 25485.01it/s]


In [28]:
# Load the full product catalogue and show its row count.
all_product_data = read_product_data()
all_product_data.shape[0]

1551057

In [29]:
# Products whose locale is DE, saved next to the other RecStudio inputs.
DE_product_data = split_single_locale(all_product_data, 'DE')
DE_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'DE_product_train.csv'),
    index=False,
)

In [30]:
# Products whose locale is JP, saved next to the other RecStudio inputs.
JP_product_data = split_single_locale(all_product_data, 'JP')
JP_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'JP_product_train.csv'),
    index=False,
)

In [31]:
# Products whose locale is UK, saved next to the other RecStudio inputs.
UK_product_data = split_single_locale(all_product_data, 'UK')
UK_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'UK_product_train.csv'),
    index=False,
)

In [32]:
# Preview the first two DE products.
DE_product_data.iloc[:2]

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...


In [33]:
# Slim DE product table holding only the id and locale columns.
id_DE_product_data = DE_product_data.loc[:, ['id', 'locale']]
id_DE_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'id_DE_product_train.csv'),
    index=False,
)

In [34]:
# Slim JP product table holding only the id and locale columns.
id_JP_product_data = JP_product_data.loc[:, ['id', 'locale']]
id_JP_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'id_JP_product_train.csv'),
    index=False,
)

In [35]:
# Slim UK product table holding only the id and locale columns.
id_UK_product_data = UK_product_data.loc[:, ['id', 'locale']]
id_UK_product_data.to_csv(
    os.path.join(recstudio_data_dir, 'id_UK_product_train.csv'),
    index=False,
)

# Debug sessions

In [9]:
# Reload the saved UK training sessions. The path is built from
# recstudio_data_dir instead of repeating the absolute workspace root.
UK_train_sessions = pd.read_csv(os.path.join(recstudio_data_dir, 'UK_data', 'UK_train_sessions.csv'))
UK_train_sessions.head(2)

Unnamed: 0,prev_items,next_item,locale
0,['B07DRPH58X' 'B098TF21JM'],B098TG1D3C,UK
1,['B07Q9ZTSJD' 'B00D8CXYXC'],B00D8CXXTC,UK


In [12]:
# Draw a 10,000-session debug subset of the UK training data. A fixed
# random_state makes every run select the same sessions, so the debug files
# written from this frame are reproducible.
debug_train_sessions = UK_train_sessions.sample(n=10000, random_state=42)
# Drop the sampled rows' original index labels so the frame is numbered 0..9999.
debug_train_sessions = debug_train_sessions.reset_index(drop=True)
debug_train_sessions

Unnamed: 0,prev_items,next_item,locale
0,['B08NWFZXXK' 'B08NTQVNDG' 'B07XN4ZP6F' 'B07XG...,B083DWL275,UK
1,['B00SH2UZWG' 'B00B7JJB76' 'B005LMS21Q' 'B002R...,B005LH2J42,UK
2,['B0B42YN26Y' 'B07PQRBT5N' 'B0B431NHJ1' 'B0B42...,B075WWQY2Y,UK
3,['B00BQFTN5G' 'B07B8PQGR2' 'B01H69OP0I' 'B09QY...,B09VY73TFG,UK
4,['B07RRWQ4TS' 'B07RRWQKWJ'],B07PQT9P2M,UK
...,...,...,...
9995,['B0071QN03K' 'B000TASL5W' 'B08SWKPHF1'],B092TSH17C,UK
9996,['B0B1Z419SF' 'B00SGP6TAQ' 'B016SK4TIE'],B01N3VDMSY,UK
9997,['B0B4JW7L9C' 'B0B5MM44QY'],B00B3T3U2C,UK
9998,['B07T13PN2N' 'B07SYJ48GM'],B084RMLXFH,UK


In [13]:
# Save the debug subset beside the full UK sessions file. The path is anchored
# at recstudio_data_dir so it does not depend on the working directory.
debug_train_sessions.to_csv(os.path.join(recstudio_data_dir, 'UK_data', 'debug_UK_train_sessions.csv'), index=False)

In [14]:
# save_file is resolved relative to recstudio_data_dir by
# transform_to_inter_feat, so only the sub-path inside that directory is given.
transform_to_inter_feat(debug_train_sessions, 'UK_data/debug_UK_train_inter_feat.csv')

100%|██████████| 10000/10000 [00:00<00:00, 16625.48it/s]
