In [43]:
import numpy as np
import pandas as pd 
from functools import lru_cache
import os
from tqdm import tqdm 
np.random.seed(42)

In [44]:
train_data_dir = '../raw_data/'
test_data_dir = '../raw_data/'
recstudio_data_dir = '../data_for_recstudio/'
task = 'task1'
PREDS_PER_SESSION = 100

In [45]:
@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(os.path.join(train_data_dir, 'products_train.csv'))

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(os.path.join(train_data_dir, 'sessions_train.csv'))

@lru_cache(maxsize=3)
def read_test_data(task):
    return pd.read_csv(os.path.join(test_data_dir, f'sessions_test_{task}.csv'))

@lru_cache(maxsize=1)
def read_all_task1_data():
    return pd.read_csv(os.path.join(recstudio_data_dir, 'products_train.csv'))

In [46]:
def split_valid_data(df, ratio):
    num_sessions = len(df)
    num_val_sessions = int(ratio * num_sessions)
    index_permu = np.random.permutation(num_sessions)
    valid_index = index_permu[:num_val_sessions]
    train_index = index_permu[num_val_sessions:]
    val_df = df.iloc[valid_index].reset_index(drop=True)
    train_df = df.iloc[train_index].reset_index(drop=True)
    return train_df, val_df

In [47]:
train_sessions = read_train_data()
task_1_test_sessions = read_test_data('task1')
task_3_test_sessions = read_test_data('task3')

In [48]:
task_3_test_sessions = task_3_test_sessions[task_3_test_sessions['locale'].isin(['UK', 'JP', 'DE'])].reset_index(drop=True)
len(task_3_test_sessions)

30000

In [49]:
# merge 3 data

task1_prev_items, task1_locales, task1_next_items  = [], [], []
for i in tqdm(range(len(task_1_test_sessions))):
    sess = task_1_test_sessions.iloc[i]
    sess_prev_items = sess['prev_items']
    sess_locale = sess['locale']
    product_list = sess_prev_items.strip('[]').split(' ')
    product_list = np.array(list(map(lambda x : x.strip("'\n"), product_list)))
    
    if len(product_list) <= 1:
        continue

    next_item = product_list[-1]
    product_list = product_list[:-1]

    task1_prev_items.append(str(product_list))
    task1_locales.append(sess_locale)
    task1_next_items.append(next_item)

task_1_test_sessions = pd.DataFrame({'prev_items' : task1_prev_items, 'locale' : task1_locales, 'next_item' : task1_next_items})
print(task_1_test_sessions.head(5))
print(len(task_1_test_sessions))

task3_prev_items, task3_locales, task3_next_items  = [], [], []
for i in tqdm(range(len(task_3_test_sessions))):
    sess = task_3_test_sessions.iloc[i]
    sess_prev_items = sess['prev_items']
    sess_locale = sess['locale']
    product_list = sess_prev_items.strip('[]').split(' ')
    product_list = np.array(list(map(lambda x : x.strip("'\n"), product_list)))
    
    if len(product_list) <= 1:
        continue

    next_item = product_list[-1]
    product_list = product_list[:-1]

    task3_prev_items.append(str(product_list))
    task3_locales.append(sess_locale)
    task3_next_items.append(next_item)

task_3_test_sessions = pd.DataFrame({'prev_items' : task3_prev_items, 'locale' : task3_locales, 'next_item' : task3_next_items})
print(task_3_test_sessions.head(5))
print(len(task_3_test_sessions))



100%|██████████| 316971/316971 [00:47<00:00, 6678.38it/s]


                                          prev_items locale   next_item
0  ['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...     DE  B099NQFMG7
1                        ['B00R9R5ND6' 'B00R9RZ9ZS']     DE  B00R9RZ9ZS
2           ['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK']     DE  B07G7Q5N6G
3  ['B08KQBYV43' '3955350843' '3955350843' '39553...     DE  3955350843
4  ['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...     DE  B09J945WQR
316971


100%|██████████| 30000/30000 [00:04<00:00, 6723.23it/s]

                                 prev_items locale   next_item
0  ['B07KWVBK8W' 'B07KWVDNV2' 'B07KWVBK8W']     DE  B01M2CLQA5
1               ['B08K7GPV1G' 'B08P1WJYW5']     DE  B09MFXKQMT
2               ['B07R8RCRYL' 'B08379LSYF']     DE  B00PM9Z2L6
3  ['B084RTW66R' 'B001O7XWFI' 'B088KRKFJ3']     DE  B09LYX3WBC
4                            ['B074M9DZ4M']     DE  B074M9DZ4M
30000





In [50]:
all_task_1_sessions = pd.concat([train_sessions, task_1_test_sessions, task_3_test_sessions], axis=0, ignore_index=True)
all_task_1_sessions.head(5), len(all_task_1_sessions), len(train_sessions)

(                                          prev_items   next_item locale
 0                        ['B09W9FND7K' 'B09JSPLN1M']  B09M7GY217     DE
 1  ['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...  B001B4THSA     DE
 2  ['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...  B0767DTG2Q     DE
 3  ['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...  B0B4R9NN4B     DE
 4           ['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8']  B0BGVBKWGZ     DE,
 3953220,
 3606249)

In [51]:
# split all data 
all_task_1_train_sessions, all_task_1_valid_sessions = split_valid_data(all_task_1_sessions, 0.1)

In [52]:
len(all_task_1_train_sessions), len(all_task_1_valid_sessions)

(3557898, 395322)

In [53]:
# filter ES, IT, FR in valid sessions
all_task_1_valid_sessions = all_task_1_valid_sessions[all_task_1_valid_sessions['locale'].isin(['DE', 'JP', 'UK'])]
len(all_task_1_valid_sessions)

361581

In [54]:
def session_2_inter_feat(sessions_df, save_path):
    num_sessions = len(sessions_df)

    with open(os.path.join(save_path), 'w') as f:
        f.write('sess_id,product_id,timestamp,locale\n')

        for i in tqdm(range(num_sessions)):
            sess = sessions_df.iloc[i]
            sess_locale = sess['locale']
            sess_prev_items = sess['prev_items']
            sess_next_item = sess['next_item']
            
            product_list = sess_prev_items.strip('[]').split(' ')
            product_list = list(map(lambda x : x.strip("'\n"), product_list))
            product_list.append(sess_next_item)

            sess_id = i
            for j, product_id in enumerate(product_list):
                inter_str = f'{sess_id},{product_id},{j},{sess_locale}\n'
                f.write(inter_str)

In [55]:
session_2_inter_feat(all_task_1_train_sessions, '../data_for_recstudio/all_task_1_train_inter_feat.csv')
session_2_inter_feat(all_task_1_valid_sessions, '../data_for_recstudio/all_task_1_valid_inter_feat.csv')

100%|██████████| 3557898/3557898 [05:45<00:00, 10285.57it/s]
100%|██████████| 361581/361581 [00:35<00:00, 10151.82it/s]


In [56]:
tune_task_1_sessions = all_task_1_sessions.sample(int(0.3 * len(all_task_1_sessions)), ignore_index=True)

In [57]:
len(tune_task_1_sessions)

1185966

In [58]:
tune_task_1_train_sessions, tune_task_1_valid_sessions = split_valid_data(tune_task_1_sessions, 0.1)
len(tune_task_1_train_sessions), len(tune_task_1_valid_sessions)

(1067370, 118596)

In [59]:
# filter ES, IT, FR in valid sessions
tune_task_1_valid_sessions = tune_task_1_valid_sessions[tune_task_1_valid_sessions['locale'].isin(['DE', 'JP', 'UK'])]
len(tune_task_1_valid_sessions)

108617

In [60]:
session_2_inter_feat(tune_task_1_train_sessions, '../data_for_recstudio/tune_task_1_train_inter_feat.csv')
session_2_inter_feat(tune_task_1_valid_sessions, '../data_for_recstudio/tune_task_1_valid_inter_feat.csv')

100%|██████████| 1067370/1067370 [01:42<00:00, 10365.51it/s]
100%|██████████| 108617/108617 [00:10<00:00, 10113.86it/s]


In [61]:
all_task_1_train_inter_feat = pd.read_csv('../data_for_recstudio/all_task_1_train_inter_feat.csv')
all_task_1_valid_inter_feat = pd.read_csv('../data_for_recstudio/all_task_1_valid_inter_feat.csv')

len(all_task_1_train_inter_feat), len(all_task_1_valid_inter_feat)

(18321875, 1882412)

In [62]:
tune_task_1_train_inter_feat = pd.read_csv('../data_for_recstudio/tune_task_1_train_inter_feat.csv')
tune_task_1_valid_inter_feat = pd.read_csv('../data_for_recstudio/tune_task_1_valid_inter_feat.csv')
len(tune_task_1_train_inter_feat), len(tune_task_1_valid_inter_feat)

(5497807, 566614)