In [7]:
import os
import numpy as np
import pandas as pd
from functools import lru_cache
from tqdm import tqdm 

In [12]:
# Directory the raw training sessions (sessions_train.csv) are read from.
train_data_dir = '../raw_data/'
# Directory the raw test sessions (sessions_test_task1.csv / sessions_test_task3.csv) are read from.
test_data_dir = '../raw_data/'
# Directory the flattened interaction CSVs produced by this notebook are written to.
recstudio_data_dir = '../data_for_recstudio/'
# NOTE(review): `task` and `PREDS_PER_SESSION` are not referenced by any cell
# visible in this notebook — confirm they are still needed.
task = 'task1'
PREDS_PER_SESSION = 100

In [6]:
@lru_cache(maxsize=1)
def read_train_data():
    """Load ``sessions_train.csv`` from ``train_data_dir`` as a DataFrame.

    The result is memoised by ``lru_cache``, so repeated calls return the very
    same DataFrame object instead of re-reading the file.
    """
    train_csv_path = os.path.join(train_data_dir, 'sessions_train.csv')
    train_sessions = pd.read_csv(train_csv_path)
    return train_sessions

@lru_cache(maxsize=1)
def read_test_data():
    """Load ``sessions_test_task1.csv`` from ``test_data_dir`` as a DataFrame.

    The result is memoised by ``lru_cache``, so repeated calls return the very
    same DataFrame object instead of re-reading the file.
    """
    task1_csv_path = os.path.join(test_data_dir, 'sessions_test_task1.csv')
    task1_sessions = pd.read_csv(task1_csv_path)
    return task1_sessions

@lru_cache(maxsize=1)
def read_test_data_task3():
    """Load ``sessions_test_task3.csv`` from ``test_data_dir`` as a DataFrame.

    The result is memoised by ``lru_cache``, so repeated calls return the very
    same DataFrame object instead of re-reading the file.
    """
    task3_csv_path = os.path.join(test_data_dir, 'sessions_test_task3.csv')
    task3_sessions = pd.read_csv(task3_csv_path)
    return task3_sessions

In [14]:
def transform_to_inter_feat(sess_df, save_path=None):
    """Flatten a sessions DataFrame into a one-interaction-per-row CSV.

    Each session becomes one output line per product, in the form
    ``sess_id,product_id,timestamp,locale``:

    * ``sess_id`` is the session's positional row number in ``sess_df``;
    * ``timestamp`` is the product's position inside the session (0-based);
    * if ``sess_df`` has a ``next_item`` column, that item is appended as the
      last interaction of the session.

    Parameters
    ----------
    sess_df : pandas.DataFrame
        Must contain ``locale`` and ``prev_items`` columns; ``next_item`` is
        optional. ``prev_items`` holds strings shaped like ``"['A' 'B' 'C']"``.
    save_path : str, optional
        Destination CSV. Defaults to ``test_inter_feat_task3.csv`` inside
        ``recstudio_data_dir``, the location this function historically
        wrote to, so existing calls keep working unchanged.

    Notes
    -----
    A session whose ``prev_items`` is empty (``"[]"``) and that has no
    ``next_item`` contributes no rows, instead of a row with an empty
    product id.
    """
    if save_path is None:
        save_path = os.path.join(recstudio_data_dir, 'test_inter_feat_task3.csv')

    num_sess = len(sess_df)
    # Decide once whether a target item exists instead of re-checking per row.
    has_next_item = 'next_item' in sess_df
    next_items = sess_df['next_item'] if has_next_item else [None] * num_sess

    # Walk the columns in lockstep rather than calling `.iloc[i]`, which
    # builds a fresh Series for every single row.
    rows = zip(sess_df['locale'], sess_df['prev_items'], next_items)

    with open(save_path, 'w') as f:
        f.write('sess_id,product_id,timestamp,locale\n')

        for sess_id, (sess_locale, sess_prev_items, sess_nxt_item) in enumerate(
                tqdm(rows, total=num_sess)):
            # Split on any whitespace so wrapped lines or repeated spaces in
            # the array-style string cannot yield empty or dirty product ids.
            product_list = [
                item.strip("'") for item in sess_prev_items.strip('[]').split()
            ]
            if has_next_item:
                product_list.append(sess_nxt_item)

            f.writelines(
                f'{sess_id},{product_id},{j},{sess_locale}\n'
                for j, product_id in enumerate(product_list)
            )

In [15]:
# Load the task-3 test sessions (sessions_test_task3.csv); the reader caches the frame.
sess_test_task3 = read_test_data_task3()

In [16]:
# Flatten the task-3 test sessions into test_inter_feat_task3.csv under recstudio_data_dir.
transform_to_inter_feat(sess_test_task3)

100%|██████████| 56421/56421 [00:02<00:00, 26447.53it/s]


In [17]:
sess_test_task3.head(5)

Unnamed: 0,prev_items,locale
0,['B082DLM3NZ' 'B089X86H73'],ES
1,['B071WPLND2' 'B08TMJ9SDZ' 'B07XRCLVYG'],ES
2,['B094V8G54H' 'B094V97YV8'],ES
3,['B0B3DQXY57' 'B0B6W3GGTM'],ES
4,['B0765BPD7T' 'B00V4PQY3C' 'B09HWV4MBK'],ES


In [33]:
# Load the training sessions (sessions_train.csv); the reader caches the frame.
sess_train = read_train_data()

In [34]:
sess_train.head(10)

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE
5,['B0749V8TC7' 'B0749W93VC' 'B0749TX4YP'],B0749TX4YS,DE
6,['B09SMK3R8H' 'B01N4ND0F9'],B08YNZT93Z,DE
7,['B09B2W5S9R' 'B09B2YFY6M' 'B09B2WGPRB'],B097CX2V3L,DE
8,['B01MQOR80Q' 'B095HS8R62' 'B09B31WTVY'],B09B32SSDT,DE
9,['3649625660' 'B07N3SNQW5' 'B099JZ9L9Y' 'B07Q2...,B08R7G53T1,DE


In [8]:
# NOTE(review): re-assigns `recstudio_data_dir` to the same value it already gets in
# the configuration cell; looks redundant — confirm before removing.
recstudio_data_dir = '../data_for_recstudio/'

In [36]:
from tqdm import tqdm 
# Flatten the training sessions into one interaction per row
# (`sess_id,product_id,timestamp,locale`): `sess_id` is the session's row
# number, `timestamp` is the item's position in the session, and the session's
# `next_item` is appended as its final interaction.
num_train_sess = len(sess_train)

with open(os.path.join(recstudio_data_dir, 'inter_feat.csv'), 'w') as f:
    f.write('sess_id,product_id,timestamp,locale\n')

    # Walk the three columns in lockstep rather than calling `.iloc[i]`, which
    # builds a fresh Series for every one of the rows.
    train_rows = zip(sess_train['locale'], sess_train['prev_items'], sess_train['next_item'])
    for sess_id, (sess_locale, sess_prev_items, sess_nxt_item) in enumerate(
            tqdm(train_rows, total=num_train_sess)):
        # `prev_items` looks like "['A' 'B' 'C']"; split on any whitespace so
        # wrapped lines or repeated spaces cannot yield empty or dirty product ids.
        product_list = [item.strip("'") for item in sess_prev_items.strip('[]').split()]
        product_list.append(sess_nxt_item)

        f.writelines(
            f'{sess_id},{product_id},{j},{sess_locale}\n'
            for j, product_id in enumerate(product_list)
        )

100%|██████████| 3606249/3606249 [06:00<00:00, 10007.82it/s]


In [38]:
# Read inter_feat.csv back in and eyeball a few rows as a sanity check.
# NOTE(review): `sample` is called without `random_state`, so the rows shown differ between runs.
train_data_recstduio = pd.read_csv(os.path.join(recstudio_data_dir, 'inter_feat.csv'), sep=',')
train_data_recstduio.sample(10)

Unnamed: 0,sess_id,product_id,timestamp,locale
18075374,3425525,B07J67BC5V,1,FR
6369146,1187854,B00P8XV9JA,1,JP
9396601,1740713,B09Q5SHDR3,0,JP
14078412,2629826,B084ZWXRFL,8,UK
15214021,2851136,B09VSZ6ZW9,4,UK
17796356,3363965,B09LQWM2NZ,2,FR
2835956,529970,B07KQKMCMV,3,DE
18403257,3497075,B071YH437W,3,IT
5865803,1095980,B07XLML2YS,0,DE
8058063,1495625,B092VX5RK7,18,JP


In [4]:
# Load the task-1 test sessions (sessions_test_task1.csv); the reader caches the frame.
sess_test = read_test_data()

In [6]:
sess_test.sample(10)

Unnamed: 0,prev_items,locale
142704,['B09PBFPS1G' 'B09DQ5G235' 'B09DQ6CVHC' 'B09PB...,JP
159948,['B09BZQ9HN2' 'B0743D6F2H' 'B073S8BGJD' 'B0743...,JP
192046,['B082YQV5YJ' 'B07W22SHHS' 'B07W8772K4' 'B082Y...,JP
158890,['B00I9Q0RJ6' 'B000RYULUI'],JP
276375,['B08DJTYDHB' 'B088SYLJ97' 'B09CZZHCXD' 'B08DR...,UK
86084,['B0B17HX2G9' 'B07QTCGCV3' 'B0B17HX2G9' 'B07QT...,DE
113721,['B09CH44769' 'B09NMB5JWH'],JP
220624,['B09TNFJX22' 'B09TTWKQ3X' 'B09R9WBM1L' 'B09YM...,UK
169389,['B0895GFC7L' 'B08MTTLPK3'],JP
108191,['B08DCPJL9Q' 'B07FW6NC79' 'B075VZ3TDT' 'B07FV...,JP


In [10]:
from tqdm import tqdm 
# Flatten the task-1 test sessions into one interaction per row
# (`sess_id,product_id,timestamp,locale`): `sess_id` is the session's row
# number and `timestamp` is the item's position in the session.
num_test_sess = len(sess_test)

with open(os.path.join(recstudio_data_dir, 'test_inter_feat.csv'), 'w') as f:
    f.write('sess_id,product_id,timestamp,locale\n')

    # Walk the two columns in lockstep rather than calling `.iloc[i]`, which
    # builds a fresh Series for every one of the rows.
    test_rows = zip(sess_test['locale'], sess_test['prev_items'])
    for sess_id, (sess_locale, sess_prev_items) in enumerate(
            tqdm(test_rows, total=num_test_sess)):
        # `prev_items` looks like "['A' 'B' 'C']"; split on any whitespace so
        # wrapped lines or repeated spaces cannot yield empty or dirty product ids.
        product_list = [item.strip("'") for item in sess_prev_items.strip('[]').split()]

        f.writelines(
            f'{sess_id},{product_id},{j},{sess_locale}\n'
            for j, product_id in enumerate(product_list)
        )

100%|██████████| 316971/316971 [00:44<00:00, 7133.33it/s] 


In [11]:
# Read test_inter_feat.csv back in and eyeball a few rows as a sanity check.
# NOTE(review): `sample` is called without `random_state`, so the rows shown differ between runs.
test_data_recstduio = pd.read_csv(os.path.join(recstudio_data_dir, 'test_inter_feat.csv'), sep=',')
test_data_recstduio.sample(10)

Unnamed: 0,sess_id,product_id,timestamp,locale
523854,120923,B07C533XCW,9,JP
817807,186103,B0BGPFCXQ5,2,JP
1282568,300001,B01NCE5ZXR,7,UK
689569,157685,B08Z8TZ95V,0,JP
592978,136233,B099N5JCKQ,0,JP
243234,56486,B084RDB8GN,1,DE
768,180,B095WV7MJF,2,DE
1126722,261166,B07YSH91XR,9,UK
1044604,240664,B08L5R7WVJ,1,UK
1051935,242573,B01HZ6YAUW,0,UK


In [6]:
# Column names to use in place of the header row stored in products_train.csv.
name_list = ['product_id', 'locale', 'title', 'price', 'brand', 'color', 'size', 'model', 'material', 'author', 'desc']
# `header=0` tells pandas the first line is a header to be replaced; `names=` supplies the replacement labels.
products_data_recstudio = pd.read_csv(os.path.join(recstudio_data_dir, 'products_train.csv'), sep=',', names=name_list, header=0)


In [7]:
products_data_recstudio.head(10)

Unnamed: 0,product_id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...
2,B09MBZJ48V,DE,"Sennheiser 508377 PC 5.2 Chat, Stilvolles Mult...",68.89,Sennheiser,Multi-Colour,One size,508377,Kunstleder,,3.5 MM BUCHSE - Kann problemlos an Geräte mit ...
3,B08ZN6F26S,DE,AmyBenton Auto ab 1 2 3 ahre - Baby Aufziehbar...,18.99,Amy & Benton,Animal Car,,2008B,aufziehauto 1 jahr,,【Auto aufziehbar】: Drücken Sie einfach leicht ...
4,B094DGRV7D,DE,PLAYMOBIL - 70522 - Cavaliere mit grauem Pony,7.17,PLAYMOBIL,Nicht Zutreffend.,OneSize,70522,Polypropylen,,Inhalt: 1 Stück
5,B09JNNBDH5,DE,"URBZUE Handwärmer, 10000mAh USB aufladbar und ...",20.99,URBZUE,Rosagold,13.5*9*5,Oro rosa,Aluminium,,Einstellbarer Temperaturmodus: Aufladbare Hand...
6,B08R62WZ1Y,DE,"Kinderkopfhörer Bluetooth, Mädchen Katzenohr K...",24.97,JYPS,Lila,,Kinderkopfhörer,,,🎁Drahtlose und kabelgebundene Kinderkopfhörer:...
7,B09WK4YNX8,DE,"8 Stück Herd Schalter Schutz, Küche Gasherd Kn...",12.49,FANSEZQ,Transparent,,,,,【BREITE ANWENDUNG】Diese Knopfschutzabdeckung i...
8,B07KTKFYYS,DE,AVANA Edelstahl Ausstechformen 12 Stück Ringe ...,13.99,AVANA,Silber,,,Edelstahl,,100% ZUFRIEDENHEITSGARANTIE - Falls Sie nicht ...
9,B0B9MPKYJK,DE,Amstory Stirnlampe LED Wiederaufladbar Joggen ...,25.99,Amstory,Schwarz,2 Stücke,LMHL-0006BK,ABS,,☀【1000 Lumen Superheller Kopflampe】 Der LED He...


In [4]:
# NOTE(review): despite the name, this holds the training *sessions* —
# read_train_data() loads sessions_train.csv, not a products file.
products_data = read_train_data()

In [5]:
products_data['locale'].unique()

array(['DE', 'JP', 'UK', 'ES', 'FR', 'IT'], dtype=object)

In [8]:
# Load the task-1 test sessions for the per-locale split below. The file is a
# test file, so resolve it through `test_data_dir` (as `read_test_data` does)
# rather than `train_data_dir`; both currently point at the same directory.
task1_test = pd.read_csv(os.path.join(test_data_dir, 'sessions_test_task1.csv'))
task1_test.head(10)

Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE
5,['B0BHT75TPQ' 'B0BHT7X2R6' 'B0BK5VMHND' 'B0BHT...,DE
6,['B071P9DVF6' 'B07BGHDRZH' 'B09S37TD4N'],DE
7,['B0B8D1V4QW' 'B0813KJ832' 'B099XL3VS4' 'B09V1...,DE
8,['B0B3BZFMCH' 'B0B3BW437K' 'B0B3C5P8N8' 'B0B3C...,DE
9,['B08F9GMLXM' 'B0B8D4CWZ4' 'B08L9CZ7BW' 'B08FB...,DE


In [14]:
# Number of task-1 test sessions per locale: DE 104568, JP 96467, UK 115936
# (presumably obtained by running the line below once per locale code).
(task1_test['locale'] == 'UK').sum()

115936

In [18]:
# Split the task-1 test sessions by locale. Selecting on the `locale` column
# (instead of slicing by hard-coded row counts) keeps the split correct even
# if the file's row order or per-locale counts change; row order within each
# locale is preserved.
task1_DE_test = task1_test[task1_test['locale'] == 'DE']
task1_JP_test = task1_test[task1_test['locale'] == 'JP']
task1_UK_test = task1_test[task1_test['locale'] == 'UK']


In [19]:
# Sanity check: the three per-locale frames together hold as many rows as the full test set.
len(task1_DE_test) + len(task1_JP_test) + len(task1_UK_test) == len(task1_test)

True

In [27]:
# Each per-locale frame must contain only its own locale. A notebook cell only
# displays its last expression, so the three checks are folded into a single
# boolean; otherwise the UK and JP results would be computed and discarded.
all([
    (task1_UK_test['locale'] == 'UK').all(),
    (task1_JP_test['locale'] == 'JP').all(),
    (task1_DE_test['locale'] == 'DE').all(),
])

True

In [29]:
# Persist each per-locale split as its own CSV, without the index column.
for locale_split, split_csv_path in (
    (task1_DE_test, '../data_for_recstudio/session_test_task1_DE.csv'),
    (task1_JP_test, '../data_for_recstudio/session_test_task1_JP.csv'),
    (task1_UK_test, '../data_for_recstudio/session_test_task1_UK.csv'),
):
    locale_split.to_csv(split_csv_path, index=False)

In [33]:
from tqdm import tqdm 
def transform_test_csv(test_sessions:pd.DataFrame, save_path):
    """Flatten test sessions into a one-interaction-per-row CSV at ``save_path``.

    Each session becomes one output line per product, in the form
    ``sess_id,product_id,timestamp,locale``, where ``sess_id`` is the
    session's positional row number in ``test_sessions`` (so it restarts at 0
    for every frame passed in) and ``timestamp`` is the product's 0-based
    position inside the session.

    Parameters
    ----------
    test_sessions : pandas.DataFrame
        Must contain ``locale`` and ``prev_items`` columns. ``prev_items``
        holds strings shaped like ``"['A' 'B' 'C']"``.
    save_path : str
        Destination CSV path; an existing file is overwritten.

    Notes
    -----
    A session whose ``prev_items`` is empty (``"[]"``) contributes no rows,
    instead of a row with an empty product id.
    """
    num_test = len(test_sessions)
    # Walk the columns in lockstep rather than calling `.iloc[i]`, which
    # builds a fresh Series for every single row.
    rows = zip(test_sessions['locale'], test_sessions['prev_items'])

    with open(save_path, 'w') as f:
        f.write('sess_id,product_id,timestamp,locale\n')

        for sess_id, (sess_locale, sess_prev_items) in enumerate(
                tqdm(rows, total=num_test)):
            # Split on any whitespace so wrapped lines or repeated spaces in
            # the array-style string cannot yield empty or dirty product ids.
            product_list = [
                item.strip("'") for item in sess_prev_items.strip('[]').split()
            ]

            f.writelines(
                f'{sess_id},{product_id},{j},{sess_locale}\n'
                for j, product_id in enumerate(product_list)
            )

In [35]:

# Flatten the DE split; sess_id restarts at 0 within this file.
transform_test_csv(task1_DE_test, '../data_for_recstudio/test_inter_feat_task1_DE.csv')

100%|██████████| 104568/104568 [00:03<00:00, 31364.43it/s]


In [37]:
# Flatten the JP split; sess_id restarts at 0 within this file.
transform_test_csv(task1_JP_test, '../data_for_recstudio/test_inter_feat_task1_JP.csv')

100%|██████████| 96467/96467 [00:03<00:00, 30458.88it/s]


In [38]:
# Flatten the UK split; sess_id restarts at 0 within this file.
transform_test_csv(task1_UK_test, '../data_for_recstudio/test_inter_feat_task1_UK.csv')

100%|██████████| 115936/115936 [00:03<00:00, 30526.84it/s]
