In [2]:
import numpy as np
import pandas as pd 
from functools import lru_cache
import os
from tqdm import tqdm 
# Seed NumPy's legacy global RNG once; split_valid_data draws from it via
# np.random.permutation, so results depend on this seed and on cell execution order.
np.random.seed(42)

In [3]:
# Directory the training CSVs (products_train.csv, sessions_train.csv) are read from.
train_data_dir = '../raw_data/'
# Directory the sessions_test_<task>.csv files are read from (same location as the training data).
test_data_dir = '../raw_data/'
# Directory read by read_all_task1_data; later cells write their outputs under the same
# path, spelled out as a string literal.
recstudio_data_dir = '../data_for_recstudio/'
# NOTE(review): `task` and `PREDS_PER_SESSION` are not referenced in the visible cells —
# confirm whether they are still needed.
task = 'task1'
PREDS_PER_SESSION = 100

In [4]:
@lru_cache(maxsize=1)
def read_product_data():
    """Load ``products_train.csv`` from ``train_data_dir`` as a DataFrame.

    The result is memoised by ``lru_cache``, so the file is parsed once and every
    caller receives the same DataFrame object.
    """
    products_csv = os.path.join(train_data_dir, 'products_train.csv')
    return pd.read_csv(products_csv)

@lru_cache(maxsize=1)
def read_train_data():
    """Load ``sessions_train.csv`` from ``train_data_dir`` as a DataFrame.

    The result is memoised by ``lru_cache``, so the file is parsed once and every
    caller receives the same DataFrame object.
    """
    sessions_csv = os.path.join(train_data_dir, 'sessions_train.csv')
    return pd.read_csv(sessions_csv)

@lru_cache(maxsize=3)
def read_test_data(task):
    """Load the test sessions of one task from ``test_data_dir``.

    Args:
        task: Suffix inserted into the file name ``sessions_test_{task}.csv``
            (the notebook passes 'task1', 'task2' and 'task3').

    Returns:
        The parsed DataFrame. Up to three distinct ``task`` values are memoised, and
        repeated calls with the same value return the same DataFrame object.
    """
    file_name = f'sessions_test_{task}.csv'
    return pd.read_csv(os.path.join(test_data_dir, file_name))

@lru_cache(maxsize=1)
def read_all_task1_data():
    """Load ``products_train.csv`` from ``recstudio_data_dir`` (memoised DataFrame).

    NOTE(review): the function name suggests task-1 session data, but the file it
    reads is the products table — confirm the intended file name.
    """
    csv_path = os.path.join(recstudio_data_dir, 'products_train.csv')
    return pd.read_csv(csv_path)

In [4]:
def split_valid_data(df, ratio):
    """Randomly partition the rows of ``df`` into a training and a validation frame.

    Args:
        df: DataFrame whose rows are to be split.
        ratio: Fraction of rows assigned to validation; the validation size is
            ``int(ratio * len(df))`` (truncated).

    Returns:
        ``(train_df, val_df)``, both with a fresh 0..n-1 index.
    """
    total = len(df)
    n_valid = int(ratio * total)
    # Single draw from NumPy's global RNG: the outcome depends on the current seed/state.
    shuffled = np.random.permutation(total)
    valid_rows, train_rows = shuffled[:n_valid], shuffled[n_valid:]
    train_part = df.iloc[train_rows].reset_index(drop=True)
    valid_part = df.iloc[valid_rows].reset_index(drop=True)
    return train_part, valid_part

In [5]:
def session_2_inter_feat(sessions_df, save_path, test=False):
    """Flatten session rows into a one-interaction-per-line CSV file.

    Each session's ``prev_items`` string (space-separated, quoted ids inside square
    brackets) is split into product ids; unless ``test`` is true, the session's
    ``next_item`` is appended as the final interaction. One line is written per
    product: ``sess_id,product_id,timestamp,locale``.

    Args:
        sessions_df: DataFrame with ``prev_items`` and ``locale`` columns, plus
            ``next_item`` when ``test`` is False.
        save_path: Path of the CSV file to write (overwritten if it exists).
        test: When True, ``next_item`` is not read and not written.

    Notes:
        ``sess_id`` is the row's position in ``sessions_df`` (not its index label),
        and ``timestamp`` is the product's position within the session.
    """
    # Create the target directory up front so a fresh checkout does not fail in open().
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Pull the columns out once; building a Series per row with .iloc[i] is far slower.
    locale_col = sessions_df['locale'].tolist()
    prev_items_col = sessions_df['prev_items'].tolist()
    next_item_col = None if test else sessions_df['next_item'].tolist()

    with open(save_path, 'w') as f:
        f.write('sess_id,product_id,timestamp,locale\n')

        for sess_id in tqdm(range(len(sessions_df))):
            sess_locale = locale_col[sess_id]
            # "['A' 'B'\n 'C']" -> ['A', 'B', 'C']: drop the brackets, split on spaces,
            # then strip the quotes and any line break left by the array repr.
            product_list = [
                token.strip("'\n")
                for token in prev_items_col[sess_id].strip('[]').split(' ')
            ]
            if next_item_col is not None:
                product_list.append(next_item_col[sess_id])

            f.writelines(
                f'{sess_id},{product_id},{j},{sess_locale}\n'
                for j, product_id in enumerate(product_list)
            )

In [6]:
def transform_test_2_train(test_sessions_df):
    """Turn test sessions into training rows by holding out each session's last item.

    For every row, ``prev_items`` is parsed into product ids; the last id becomes
    ``next_item`` and the remaining ids are stored back as the string form of a NumPy
    array. Sessions that parse to one item or fewer are skipped, so the result can
    have fewer rows than the input.

    Args:
        test_sessions_df: DataFrame with ``prev_items`` and ``locale`` columns.

    Returns:
        A new DataFrame with columns ``prev_items``, ``locale`` and ``next_item``.
    """
    train_prev_items, train_locales, train_next_items = [], [], []

    # Pull the columns out once; building a Series per row with .iloc[i] is far slower.
    prev_items_col = test_sessions_df['prev_items'].tolist()
    locale_col = test_sessions_df['locale'].tolist()

    for i in tqdm(range(len(test_sessions_df))):
        # "['A' 'B'\n 'C']" -> array(['A', 'B', 'C'])
        product_arr = np.array([
            token.strip("'\n")
            for token in prev_items_col[i].strip('[]').split(' ')
        ])

        # Nothing would remain as history once the last item is held out.
        if len(product_arr) <= 1:
            continue

        # str() of the array keeps the same textual layout that the parser above reads.
        train_prev_items.append(str(product_arr[:-1]))
        train_locales.append(locale_col[i])
        train_next_items.append(product_arr[-1])

    train_sessions_df = pd.DataFrame({'prev_items' : train_prev_items, 'locale' : train_locales, 'next_item' : train_next_items})
    return train_sessions_df

In [7]:
# Load the training sessions and the three test-session files (each reader is cached).
train_sessions = read_train_data()
task_1_test_sessions = read_test_data('task1')
task_2_test_sessions = read_test_data('task2')
task_3_test_sessions = read_test_data('task3')

# data for task1
This split must not be changed any more, because existing algorithms already depend on it.

In [6]:
# Task-3 test sessions restricted to the UK / JP / DE locales, re-indexed from 0.
task3_4_task1_test_sessions = (
    task_3_test_sessions
    .loc[lambda frame: frame['locale'].isin(['UK', 'JP', 'DE'])]
    .reset_index(drop=True)
)
len(task3_4_task1_test_sessions)

30000

In [8]:
# merge 3 data

# Convert both test-session frames to the training layout (prev_items, locale, next_item)
# by holding out each session's last item. transform_test_2_train performs exactly the
# loop that was previously copy-pasted here twice.
#
# Both names are rebound to their transformed frames, so re-running this cell would
# otherwise strip one more item from every session. The column check makes the cell
# safe to re-execute.
# NOTE(review): assumes the raw test CSVs carry no 'next_item' column — confirm.
if 'next_item' not in task_1_test_sessions.columns:
    task_1_test_sessions = transform_test_2_train(task_1_test_sessions)
print(task_1_test_sessions.head(5))
print(len(task_1_test_sessions))

if 'next_item' not in task3_4_task1_test_sessions.columns:
    task3_4_task1_test_sessions = transform_test_2_train(task3_4_task1_test_sessions)
print(task3_4_task1_test_sessions.head(5))
print(len(task3_4_task1_test_sessions))



100%|██████████| 316971/316971 [00:19<00:00, 15985.35it/s]


                                          prev_items locale   next_item
0  ['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...     DE  B099NQFMG7
1                        ['B00R9R5ND6' 'B00R9RZ9ZS']     DE  B00R9RZ9ZS
2           ['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK']     DE  B07G7Q5N6G
3  ['B08KQBYV43' '3955350843' '3955350843' '39553...     DE  3955350843
4  ['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...     DE  B09J945WQR
316971


100%|██████████| 30000/30000 [00:01<00:00, 16654.98it/s]


                                 prev_items locale   next_item
0  ['B07KWVBK8W' 'B07KWVDNV2' 'B07KWVBK8W']     DE  B01M2CLQA5
1               ['B08K7GPV1G' 'B08P1WJYW5']     DE  B09MFXKQMT
2               ['B07R8RCRYL' 'B08379LSYF']     DE  B00PM9Z2L6
3  ['B084RTW66R' 'B001O7XWFI' 'B088KRKFJ3']     DE  B09LYX3WBC
4                            ['B074M9DZ4M']     DE  B074M9DZ4M
30000


In [9]:
# Stack the training sessions with the two test-derived frames into one pool;
# ignore_index=True gives the result a fresh 0..n-1 index.
all_task_1_sessions = pd.concat([train_sessions, task_1_test_sessions, task3_4_task1_test_sessions], axis=0, ignore_index=True)
# Preview, pooled row count, and original training row count.
all_task_1_sessions.head(5), len(all_task_1_sessions), len(train_sessions)

(                                          prev_items   next_item locale
 0                        ['B09W9FND7K' 'B09JSPLN1M']  B09M7GY217     DE
 1  ['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...  B001B4THSA     DE
 2  ['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...  B0767DTG2Q     DE
 3  ['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...  B0B4R9NN4B     DE
 4           ['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8']  B0BGVBKWGZ     DE,
 3953220,
 3606249)

In [10]:
# Split the pooled sessions: 10% validation, the rest training.
# split_valid_data draws from NumPy's global RNG, so the outcome depends on the seed
# and on any random calls executed before this cell.
all_task_1_train_sessions, all_task_1_valid_sessions = split_valid_data(all_task_1_sessions, 0.1)

In [11]:
# Row counts of the training and validation splits.
len(all_task_1_train_sessions), len(all_task_1_valid_sessions)

(3557898, 395322)

In [13]:
# Persist the complete validation split (every locale) before the locale filter in the
# following cell rebinds this name.
all_task_1_valid_sessions.to_csv('../data_for_recstudio/task13_4_all_valid_sessions.csv', index=False)

In [53]:
# Drop ES / IT / FR from the validation split: only DE, JP and UK sessions are kept.
# The index is not reset; session_2_inter_feat numbers sessions by position anyway.
all_task_1_valid_sessions = all_task_1_valid_sessions.loc[
    lambda frame: frame['locale'].isin(['DE', 'JP', 'UK'])
]
len(all_task_1_valid_sessions)

361581

In [55]:
# Write both splits as interaction tables (one sess_id,product_id,timestamp,locale line
# per product, with next_item appended as each session's last interaction).
session_2_inter_feat(all_task_1_train_sessions, '../data_for_recstudio/all_task_1_train_inter_feat.csv')
session_2_inter_feat(all_task_1_valid_sessions, '../data_for_recstudio/all_task_1_valid_inter_feat.csv')

100%|██████████| 3557898/3557898 [05:45<00:00, 10285.57it/s]
100%|██████████| 361581/361581 [00:35<00:00, 10151.82it/s]


# tune data

In [56]:
# Draw a 30% subsample of the pooled sessions for hyper-parameter tuning.
# NOTE(review): no random_state is passed; pandas then presumably falls back to NumPy's
# global RNG, making the sample depend on the seed and on earlier random calls — confirm.
tune_task_1_sessions = all_task_1_sessions.sample(int(0.3 * len(all_task_1_sessions)), ignore_index=True)

In [57]:
# Number of sessions in the tuning subsample.
len(tune_task_1_sessions)

1185966

In [58]:
# Split the tuning subsample 90/10 with the same helper (another draw from the global RNG).
tune_task_1_train_sessions, tune_task_1_valid_sessions = split_valid_data(tune_task_1_sessions, 0.1)
len(tune_task_1_train_sessions), len(tune_task_1_valid_sessions)

(1067370, 118596)

In [59]:
# Drop ES / IT / FR from the tuning validation split: only DE, JP and UK sessions are kept.
tune_task_1_valid_sessions = tune_task_1_valid_sessions.loc[
    lambda frame: frame['locale'].isin(['DE', 'JP', 'UK'])
]
len(tune_task_1_valid_sessions)

108617

In [60]:
# Write the tuning splits as interaction tables, in the same format as the full task-1 data.
session_2_inter_feat(tune_task_1_train_sessions, '../data_for_recstudio/tune_task_1_train_inter_feat.csv')
session_2_inter_feat(tune_task_1_valid_sessions, '../data_for_recstudio/tune_task_1_valid_inter_feat.csv')

100%|██████████| 1067370/1067370 [01:42<00:00, 10365.51it/s]
100%|██████████| 108617/108617 [00:10<00:00, 10113.86it/s]


In [61]:
# Read the written interaction tables back in to check them.
all_task_1_train_inter_feat = pd.read_csv('../data_for_recstudio/all_task_1_train_inter_feat.csv')
all_task_1_valid_inter_feat = pd.read_csv('../data_for_recstudio/all_task_1_valid_inter_feat.csv')

# Interaction (row) counts of the train and validation tables.
len(all_task_1_train_inter_feat), len(all_task_1_valid_inter_feat)

(18321875, 1882412)

In [62]:
# Read the tuning interaction tables back in and report their interaction (row) counts.
tune_task_1_train_inter_feat = pd.read_csv('../data_for_recstudio/tune_task_1_train_inter_feat.csv')
tune_task_1_valid_inter_feat = pd.read_csv('../data_for_recstudio/tune_task_1_valid_inter_feat.csv')
len(tune_task_1_train_inter_feat), len(tune_task_1_valid_inter_feat)

(5497807, 566614)

# data for task2

In [8]:
# Task-3 test sessions restricted to the IT / FR / ES locales, re-indexed from 0.
task3_4_task2_test_sessions = (
    task_3_test_sessions
    .loc[lambda frame: frame['locale'].isin(['IT', 'FR', 'ES'])]
    .reset_index(drop=True)
)
len(task3_4_task2_test_sessions)

26421

In [9]:
# Hold out each session's last item as next_item so the task-3 test sessions can be used as training rows.
task3_4_task2_test_sessions_transformed = transform_test_2_train(task3_4_task2_test_sessions)

100%|██████████| 26421/26421 [00:02<00:00, 9542.61it/s] 


In [10]:
# Same conversion for the task-2 test sessions; the result is bound to a new name, so the raw frame stays intact.
task_2_test_sessions_transformed = transform_test_2_train(task_2_test_sessions)

100%|██████████| 34688/34688 [00:04<00:00, 8534.68it/s] 


In [11]:
# Training sessions belonging to the IT / ES / FR locales (original index labels kept).
train_task2_sessions = train_sessions.loc[
    lambda frame: frame['locale'].isin(['IT', 'ES', 'FR'])
]
len(train_task2_sessions)

333533

In [12]:
# Pool the IT/ES/FR training sessions with the two transformed test frames;
# ignore_index=True gives the result a fresh 0..n-1 index.
task23_4_task2_sessions = pd.concat([train_task2_sessions, task_2_test_sessions_transformed, task3_4_task2_test_sessions_transformed], axis=0, ignore_index=True)
# Preview, pooled row count, and training-only row count.
task23_4_task2_sessions.head(5), len(task23_4_task2_sessions), len(train_task2_sessions)

(                                          prev_items   next_item locale
 0           ['B08MV5B53K' 'B08MV4RCQR' 'B08MV5B53K']  B012408XPC     ES
 1                        ['B07JGW4QWX' 'B085VCXHXL']  B07JFPYN5P     ES
 2           ['B08BFQ52PR' 'B08LVSTZVF' 'B08BFQ52PR']  B08NJP3KT6     ES
 3  ['B08PPBF9C6' 'B08PPBF9C6' 'B08PPBF9C6' 'B08PP...  B08PP6BLLK     ES
 4           ['B0B6W67XCR' 'B0B712FY2M' 'B0B6ZYJ3S2']  B09SL4MBM2     ES,
 394642,
 333533)

In [13]:
# split all data, use the first random operation of the random seed
# Re-seed immediately before the split so it is always the first draw after seeding,
# even when the random cells above (task-1 split, tuning sample) ran earlier in the
# same kernel. In a fresh kernel where this is the first random call anyway, the
# resulting split is unchanged.
np.random.seed(42)
task23_4_task2_train_sessions, task23_4_task2_valid_sessions = split_valid_data(task23_4_task2_sessions, 0.1)
len(task23_4_task2_train_sessions), len(task23_4_task2_valid_sessions)

(355178, 39464)

In [15]:
# Write the task-2 train/validation splits as interaction tables (all three locales together).
session_2_inter_feat(task23_4_task2_train_sessions, '../data_for_recstudio/task2_data/task23_4_task2_train_inter_feat.csv')
session_2_inter_feat(task23_4_task2_valid_sessions, '../data_for_recstudio/task2_data/task23_4_task2_valid_inter_feat.csv')

100%|██████████| 355178/355178 [00:24<00:00, 14383.95it/s]
100%|██████████| 39464/39464 [00:02<00:00, 13506.41it/s]


In [16]:
# Also store the splits in their session-level form (prev_items / next_item / locale).
task23_4_task2_train_sessions.to_csv('../data_for_recstudio/task2_data/task23_4_task2_train_sessions.csv', index=False)
task23_4_task2_valid_sessions.to_csv('../data_for_recstudio/task2_data/task23_4_task2_valid_sessions.csv', index=False)

In [18]:
# One training interaction table per locale. The filtered frames keep their original
# index labels; session_2_inter_feat numbers sessions by position, so ids restart at 0
# in every file.
IT_task23_4_task2_train_sessions = task23_4_task2_train_sessions.loc[
    task23_4_task2_train_sessions['locale'].eq('IT')
]
session_2_inter_feat(IT_task23_4_task2_train_sessions, '../data_for_recstudio/IT_data/IT_train_inter_feat.csv')

FR_task23_4_task2_train_sessions = task23_4_task2_train_sessions.loc[
    task23_4_task2_train_sessions['locale'].eq('FR')
]
session_2_inter_feat(FR_task23_4_task2_train_sessions, '../data_for_recstudio/FR_data/FR_train_inter_feat.csv')

ES_task23_4_task2_train_sessions = task23_4_task2_train_sessions.loc[
    task23_4_task2_train_sessions['locale'].eq('ES')
]
session_2_inter_feat(ES_task23_4_task2_train_sessions, '../data_for_recstudio/ES_data/ES_train_inter_feat.csv')

100%|██████████| 135785/135785 [00:09<00:00, 14530.99it/s]
100%|██████████| 126177/126177 [00:08<00:00, 14728.14it/s]
100%|██████████| 93216/93216 [00:07<00:00, 12535.53it/s]


In [19]:
# One validation interaction table per locale, mirroring the per-locale training files.
IT_task23_4_task2_valid_sessions = task23_4_task2_valid_sessions.loc[
    task23_4_task2_valid_sessions['locale'].eq('IT')
]
session_2_inter_feat(IT_task23_4_task2_valid_sessions, '../data_for_recstudio/IT_data/IT_valid_inter_feat.csv')

FR_task23_4_task2_valid_sessions = task23_4_task2_valid_sessions.loc[
    task23_4_task2_valid_sessions['locale'].eq('FR')
]
session_2_inter_feat(FR_task23_4_task2_valid_sessions, '../data_for_recstudio/FR_data/FR_valid_inter_feat.csv')

ES_task23_4_task2_valid_sessions = task23_4_task2_valid_sessions.loc[
    task23_4_task2_valid_sessions['locale'].eq('ES')
]
session_2_inter_feat(ES_task23_4_task2_valid_sessions, '../data_for_recstudio/ES_data/ES_valid_inter_feat.csv')

100%|██████████| 15132/15132 [00:01<00:00, 11512.50it/s]
100%|██████████| 13904/13904 [00:00<00:00, 13933.00it/s]
100%|██████████| 10428/10428 [00:00<00:00, 13712.46it/s]


In [5]:
# Load the product table (cached by read_product_data) and show its row count.
products_data = read_product_data()
len(products_data)

1551057

In [21]:
# Per-locale product tables: filter by locale, re-index from 0, and save next to that
# locale's interaction files.
IT_products_data = products_data.loc[products_data['locale'].eq('IT')].reset_index(drop=True)
IT_products_data.to_csv('../data_for_recstudio/IT_data/IT_product_train.csv', index=False)

FR_products_data = products_data.loc[products_data['locale'].eq('FR')].reset_index(drop=True)
FR_products_data.to_csv('../data_for_recstudio/FR_data/FR_product_train.csv', index=False)

ES_products_data = products_data.loc[products_data['locale'].eq('ES')].reset_index(drop=True)
ES_products_data.to_csv('../data_for_recstudio/ES_data/ES_product_train.csv', index=False)

In [22]:
# Display the Italian product table (the notebook shows a truncated preview).
IT_products_data

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B09BNYKKLN,IT,Pro Breeze Mini Termoventilatore Ceramico da 1...,52.99,Pro Breeze,Nero,,1800W,,,"2 impostazioni di potenza, oscillazione e moda..."
1,B09D7LG64G,IT,Coloranti Alimentari a 14 colori - Colorante A...,9.98,SigWong,"rosso arancio, rosa, giallo verde, giallo limo...",6 ml (Confezione da 14),,,,ALTAMENTE CONCENTRATO - Sia che tu stia prepar...
2,B09Z6Y56X5,IT,"Pampers Progressi & Fit Prime Junior, Formato ...",59.99,Fater Spa,Avana,Taglia 5 (Confezione da 114),,,,"Cari mamma e papà, c’è una bella novità per vo..."
3,B016QM6OIC,IT,"Tommy Hilfiger Classic Bb Cap, Berretto Uomo, ...",32.22,Tommy Hilfiger,FLAG BLACK,Taglia unica,E367895041,Cotone,,Cinturino regolabile sul retro
4,B09BVHH3F6,IT,"JETech Cover in Silicone per iPhone 13 6,1 Pol...",11.99,JETech,Nero,"6,1 pollici",3584-,Silicone,,[Altamente protettivo] I bordi rialzati offron...
...,...,...,...,...,...,...,...,...,...,...,...
50456,B09BW5CDRR,IT,Barbie - Playset Gelateria con Bambola con Mac...,20.48,Barbie,,,HCN46,,,DETTAGLI REALISTICI. Basta inserire la pasta m...
50457,B0050IILBM,IT,"Braun Silk-épil 1 Depilatore Donna, Epilatore ...",22.61,Braun,Pink,,4210201656067,,,Alimentato a corrente per un comodo utilizzo
50458,B07W4C5W9D,IT,BoxLegend Sacchetti Sottovuoto Vestiti 6 Pezzi...,14.99,BoxLegend,6 Pezzi.,6 Pezzi (2L + 2M + 2S),6186666487608_SML,Polietilene Ppa,,6 Sacchetti in 3 Diverse Misure- Questo set di...
50459,B012D0HJXA,IT,Trasportino Pratiko Metal - Accessorio da viag...,18.35,MPS,verde,,,Metallo,,TRASPORTINO 48X31.5X33CM


In [84]:
# Slim per-locale product tables holding only the id and locale columns.
id_IT_products_data = IT_products_data.loc[:, ['id', 'locale']]
id_IT_products_data.to_csv('../data_for_recstudio/IT_data/id_IT_product_train.csv', index=False)

id_FR_products_data = FR_products_data.loc[:, ['id', 'locale']]
id_FR_products_data.to_csv('../data_for_recstudio/FR_data/id_FR_product_train.csv', index=False)

id_ES_products_data = ES_products_data.loc[:, ['id', 'locale']]
id_ES_products_data.to_csv('../data_for_recstudio/ES_data/id_ES_product_train.csv', index=False)