In [1]:
import numpy as np
import pandas as pd 
from functools import lru_cache
import os 

In [3]:
data_dir = '../raw_data/'
dst_dir = '../data_for_recstudio/'

In [6]:
@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(os.path.join(data_dir, 'sessions_train.csv'))

@lru_cache(maxsize=1)
def read_test_data(task):
    return pd.read_csv(os.path.join(data_dir, f'sessions_test_{task}.csv'))

In [11]:
sessions_train = read_train_data()
sessions_test_task1 = read_test_data('task1')
print(sessions_train.head(5))
print(len(sessions_train))
print(sessions_test_task1.head(5))
print(len(sessions_test_task1))

                                          prev_items   next_item locale
0                        ['B09W9FND7K' 'B09JSPLN1M']  B09M7GY217     DE
1  ['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...  B001B4THSA     DE
2  ['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...  B0767DTG2Q     DE
3  ['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...  B0B4R9NN4B     DE
4           ['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8']  B0BGVBKWGZ     DE
3606249
                                          prev_items locale
0  ['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...     DE
1           ['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS']     DE
2  ['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...     DE
3  ['B08KQBYV43' '3955350843' '3955350843' '39553...     DE
4  ['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...     DE
316971


In [13]:
sessions_test_task2 = read_test_data('task2')
sessions_test_task3 = read_test_data('task3')
print(sessions_test_task2.head(5))
print(len(sessions_test_task2))
print(sessions_test_task3.head(5))
print(len(sessions_test_task3))

                                          prev_items locale
0           ['B08GYKNCCP' 'B08HCPTMJG' 'B08HCHS64Y']     ES
1                        ['B08NYF9MBQ' 'B085NGXGWM']     ES
2                        ['B091FL1QFK' 'B0B1DG29F4']     ES
3  ['B004APAHCW' 'B07JMF49HN' 'B004APAHCW' 'B07JM...     ES
4  ['B09YM11D4T' 'B0B12QWP5G' 'B08YMT6Q4X' 'B09V1...     ES
34688
                                 prev_items locale
0               ['B082DLM3NZ' 'B089X86H73']     ES
1  ['B071WPLND2' 'B08TMJ9SDZ' 'B07XRCLVYG']     ES
2               ['B094V8G54H' 'B094V97YV8']     ES
3               ['B0B3DQXY57' 'B0B6W3GGTM']     ES
4  ['B0765BPD7T' 'B00V4PQY3C' 'B09HWV4MBK']     ES
56421


In [18]:
sessions_test3_task1 = sessions_test_task3[sessions_test_task3['locale'].isin(['UK', 'JP', 'DE'])].reset_index(drop=True)
len(sessions_test3_task1)

30000

In [19]:
sessions_test3_task1.iloc[0]['prev_items']

"['B07KWVBK8W' 'B07KWVDNV2' 'B07KWVBK8W' 'B01M2CLQA5']"

In [22]:
@lru_cache(maxsize=1)
def read_train_inter_data():
    return pd.read_csv(os.path.join(dst_dir, f'inter_feat.csv'))

In [25]:
sessions_train_inter_feat = read_train_inter_data()
num_train_sessions = sessions_train_inter_feat['sess_id'].max()
num_train_sessions

3606248

In [27]:
from tqdm import tqdm 
num_test1 = len(sessions_test_task1)
num_test3 = len(sessions_test3_task1)

with open(os.path.join(dst_dir, 'all_task1_inter_feat.csv'), 'w') as f:
    f.write('sess_id,product_id,timestamp,locale\n')
    
    for i in tqdm(range(num_test1)):
        sess_id = i + num_train_sessions
        sess = sessions_test_task1.iloc[i]
        sess_locale = sess['locale']
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        product_list = list(map(lambda x : x.strip("'\n"), product_list))
        
        for j, product_id in enumerate(product_list):
            inter_str = f'{sess_id},{product_id},{j},{sess_locale}\n'
            f.write(inter_str)

    for i in tqdm(range(num_test3)):
        sess_id = i + num_train_sessions + num_test1
        sess = sessions_test3_task1.iloc[i]
        sess_locale = sess['locale']
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        product_list = list(map(lambda x : x.strip("'\n"), product_list))
        
        for j, product_id in enumerate(product_list):
            inter_str = f'{sess_id},{product_id},{j},{sess_locale}\n'
            f.write(inter_str)

100%|██████████| 316971/316971 [00:10<00:00, 29091.68it/s]
100%|██████████| 30000/30000 [00:01<00:00, 28767.96it/s]


In [29]:
@lru_cache(maxsize=1)
def read_all_task1_inter_data():
    return pd.read_csv(os.path.join(dst_dir, f'all_task1_inter_feat.csv'))

In [31]:
all_task1_inter_feat = read_all_task1_inter_data()
print(len(all_task1_inter_feat))
print(all_task1_inter_feat['sess_id'].max())

1447779
3953218


In [33]:
all_task1_inter_feat_ = pd.concat([sessions_train_inter_feat, all_task1_inter_feat], ignore_index=True)
print(len(sessions_train_inter_feat))
print(len(all_task1_inter_feat_))
print(all_task1_inter_feat_['sess_id'].max())

18912432
20360211
3953218


In [35]:
all_task1_inter_feat_.to_csv(os.path.join(dst_dir, f'all_task1_inter_feat.csv'), sep=',')

In [4]:
all_task1_inter_feat_ = pd.read_csv(os.path.join(dst_dir, f'all_task1_inter_feat.csv'))
all_task1_inter_feat_.head(5)

Unnamed: 0.1,Unnamed: 0,sess_id,product_id,timestamp,locale
0,0,0,B09W9FND7K,0,DE
1,1,0,B09JSPLN1M,1,DE
2,2,0,B09M7GY217,2,DE
3,3,1,B076THCGSG,0,DE
4,4,1,B007MO8IME,1,DE


In [22]:
all_task1_inter_feat_ = all_task1_inter_feat_.drop(columns=['Unnamed: 0'], axis=1)

In [23]:
all_task1_inter_feat_.to_csv(os.path.join(dst_dir, f'all_task1_inter_feat.csv'), sep=',', index=False)

In [24]:
all_task1_inter_feat_ = pd.read_csv(os.path.join(dst_dir, f'all_task1_inter_feat.csv'))
all_task1_inter_feat_.head(5)

Unnamed: 0,sess_id,product_id,timestamp,locale
0,0,B09W9FND7K,0,DE
1,0,B09JSPLN1M,1,DE
2,0,B09M7GY217,2,DE
3,1,B076THCGSG,0,DE
4,1,B007MO8IME,1,DE
