In [2]:
import warnings
warnings.simplefilter('ignore')

import gc
import re
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm

In [3]:
df_sess = pd.read_csv('./raw_data/sessions_train.csv')
df_sess

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE
...,...,...,...
3606244,['B086CYFSKW' 'B0874F9859' 'B086CYFSKW'],B07B5TYD76,IT
3606245,['B09NRZKZ7V' 'B08WJTPV93'],B08L1P4C3D,IT
3606246,['B085JFX7MP' 'B085JGHW8R'],B01MPWVD44,IT
3606247,['B00B0UING2' 'B00B0UING2'],B00D3HYEZ4,IT


In [4]:
df_test = pd.read_csv('./raw_data/sessions_test_task3.csv')
df_test

Unnamed: 0,prev_items,locale
0,['B082DLM3NZ' 'B089X86H73'],ES
1,['B071WPLND2' 'B08TMJ9SDZ' 'B07XRCLVYG'],ES
2,['B094V8G54H' 'B094V97YV8'],ES
3,['B0B3DQXY57' 'B0B6W3GGTM'],ES
4,['B0765BPD7T' 'B00V4PQY3C' 'B09HWV4MBK'],ES
...,...,...
56416,['B08GNG5FMW' 'B08Q7MJW8W'],UK
56417,['B09YH16XH1' 'B09YGY96ZM'],UK
56418,['B00EXKSNNE' 'B005DBORH8' 'B005DBORCS' 'B005D...,UK
56419,['B007CJVZ1A' 'B07GCSPHNK' 'B07GCVF3N3'],UK


In [5]:
def str2list(x):
    x = x.replace('[', '').replace(']', '').replace("'", '').replace('\n', ' ').replace('\r', ' ')
    l = [i for i in x.split() if i]
    return l

In [6]:
next_item_dict = defaultdict(list)

for _, row in tqdm(df_sess.iterrows(), total=len(df_sess)):
    prev_items = str2list(row['prev_items'])
    next_item = row['next_item']
    prev_items_length = len(prev_items)
    if prev_items_length <= 1:
        next_item_dict[prev_items[0]].append(next_item)
    else:
        for i, item in enumerate(prev_items[:-1]):
            next_item_dict[item].append(prev_items[i+1])
        next_item_dict[prev_items[-1]].append(next_item)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3606249/3606249 [02:43<00:00, 22087.37it/s]


In [13]:
next_item_map = {}

for item in tqdm(next_item_dict):
    counter = Counter(next_item_dict[item])
    top1_list = counter.most_common(1)
    if len(top1_list) == 0:
        next_item_map[item] = ''
    else:
        next_item_map[item] = top1_list[0][0]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1323630/1323630 [00:09<00:00, 136106.64it/s]


In [14]:
next_item_map

{'B09W9FND7K': 'B09JSPLN1M',
 'B09JSPLN1M': 'B09W9FND7K',
 'B076THCGSG': 'B007MO8IME',
 'B007MO8IME': 'B007MO8IME',
 'B08MF65MLV': 'B0798N5C1L',
 'B001B4TKA0': 'B001B4THSA',
 'B0B1LGXWDS': 'B00AZYORS2',
 'B00AZYORS2': 'B0B1LGXWDS',
 'B09XMTWDVT': 'B0B4MZZ8MB',
 'B0B4MZZ8MB': 'B0B7HZ2GWX',
 'B0B7HZ2GWX': 'B09XMTWDVT',
 'B0B71CHT1L': 'B0B71GSJ2R',
 'B09Y5CSL3T': 'B09Y5DPTXN',
 'B09Y5DPTXN': 'B09Y5CSL3T',
 'B09FKD61R8': 'B0BGVBKWGZ',
 'B0749V8TC7': 'B0749V17QP',
 'B0749W93VC': 'B0749V8TC7',
 'B0749TX4YP': 'B0749VF4LM',
 'B09SMK3R8H': 'B01N4ND0F9',
 'B01N4ND0F9': 'B09SMK3R8H',
 'B09B2W5S9R': 'B09B2YFY6M',
 'B09B2YFY6M': 'B09B2W5S9R',
 'B09B2WGPRB': 'B097CX2V3L',
 'B01MQOR80Q': 'B095HS8R62',
 'B095HS8R62': 'B07HY9KBMZ',
 'B09B31WTVY': 'B09B32SSDT',
 '3649625660': '3649625660',
 'B07N3SNQW5': 'B099JZ9L9Y',
 'B099JZ9L9Y': 'B07Q2CFPGH',
 'B07Q2CFPGH': 'B099KCMQ92',
 'B099KCMQ92': 'B07Q2CFPGH',
 '3848520974': '3848520974',
 'B07H1GQB36': 'B08DTZ3PTY',
 'B08DTZ3PTY': 'B08G4DFMNN',
 'B0927GXJPB':

In [16]:
k = []
v = []

for item in next_item_dict:
    k.append(item)
    v.append(next_item_dict[item])
    
df_next = pd.DataFrame({'item': k, 'next_item': v})
df_next = df_next.explode('next_item').reset_index(drop=True)
df_next

Unnamed: 0,item,next_item
0,B09W9FND7K,B09JSPLN1M
1,B09W9FND7K,B09JSPLN1M
2,B09W9FND7K,B09JSPLN1M
3,B09W9FND7K,B09JSPLN1M
4,B09W9FND7K,B078WW2WN5
...,...,...
15306178,B09V7T4HXD,B09QHXKZXC
15306179,B0B14HBDHX,B092471PYL
15306180,B0B14HBDHX,B09X9DSQ7V
15306181,B07P6QPKNL,B07P6QGKTV


In [26]:
top1 = df_next['next_item'].value_counts().index.tolist()[0]

In [27]:
df_test['last_item'] = df_test['prev_items'].apply(lambda x: str2list(x)[-1])
df_test['next_item_prediction'] = df_test['last_item'].map(next_item_map)
df_test

Unnamed: 0,prev_items,locale,last_item,next_item_prediction
0,['B082DLM3NZ' 'B089X86H73'],ES,B089X86H73,B082DL9QJZ
1,['B071WPLND2' 'B08TMJ9SDZ' 'B07XRCLVYG'],ES,B07XRCLVYG,B00KBOJ8AI
2,['B094V8G54H' 'B094V97YV8'],ES,B094V97YV8,B094V2FVPV
3,['B0B3DQXY57' 'B0B6W3GGTM'],ES,B0B6W3GGTM,B0B3DQXY57
4,['B0765BPD7T' 'B00V4PQY3C' 'B09HWV4MBK'],ES,B09HWV4MBK,B08N17D24Y
...,...,...,...,...
56416,['B08GNG5FMW' 'B08Q7MJW8W'],UK,B08Q7MJW8W,B08GN9FP13
56417,['B09YH16XH1' 'B09YGY96ZM'],UK,B09YGY96ZM,B09YH16XH1
56418,['B00EXKSNNE' 'B005DBORH8' 'B005DBORCS' 'B005D...,UK,B005DBORCS,B005DBORH8
56419,['B007CJVZ1A' 'B07GCSPHNK' 'B07GCVF3N3'],UK,B07GCVF3N3,B07GCSPHNK


In [28]:
pd.isna(df_test['next_item_prediction']).sum()

1275

In [29]:
top1

'B07QPV9Z7X'

In [30]:
preds = []

for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    pred_orig = row['next_item_prediction']
    pred = pred_orig
    prev_items = str2list(row['prev_items'])
    if type(pred) == float:
        pred = top1
    preds.append(pred)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 56421/56421 [00:02<00:00, 25229.62it/s]


In [31]:
df_test['next_item_prediction'] = preds
df_test

Unnamed: 0,prev_items,locale,last_item,next_item_prediction
0,['B082DLM3NZ' 'B089X86H73'],ES,B089X86H73,B082DL9QJZ
1,['B071WPLND2' 'B08TMJ9SDZ' 'B07XRCLVYG'],ES,B07XRCLVYG,B00KBOJ8AI
2,['B094V8G54H' 'B094V97YV8'],ES,B094V97YV8,B094V2FVPV
3,['B0B3DQXY57' 'B0B6W3GGTM'],ES,B0B6W3GGTM,B0B3DQXY57
4,['B0765BPD7T' 'B00V4PQY3C' 'B09HWV4MBK'],ES,B09HWV4MBK,B08N17D24Y
...,...,...,...,...
56416,['B08GNG5FMW' 'B08Q7MJW8W'],UK,B08Q7MJW8W,B08GN9FP13
56417,['B09YH16XH1' 'B09YGY96ZM'],UK,B09YGY96ZM,B09YH16XH1
56418,['B00EXKSNNE' 'B005DBORH8' 'B005DBORCS' 'B005D...,UK,B005DBORCS,B005DBORH8
56419,['B007CJVZ1A' 'B07GCSPHNK' 'B07GCVF3N3'],UK,B07GCVF3N3,B07GCSPHNK


In [35]:
df_test['next_item_prediction'].apply(len).describe()

count    56421.0
mean        10.0
std          0.0
min         10.0
25%         10.0
50%         10.0
75%         10.0
max         10.0
Name: next_item_prediction, dtype: float64

In [38]:
products_train = pd.read_csv('./raw_data/products_train.csv')
products_train.head(5)

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verf√ºgbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,üå± NAT√úRLICHE S√úSSE DURCH ERYTHRIT - Wir stelle...
2,B09MBZJ48V,DE,"Sennheiser 508377 PC 5.2 Chat, Stilvolles Mult...",68.89,Sennheiser,Multi-Colour,One size,508377,Kunstleder,,3.5 MM BUCHSE - Kann problemlos an Ger√§te mit ...
3,B08ZN6F26S,DE,AmyBenton Auto ab 1 2 3 ahre - Baby Aufziehbar...,18.99,Amy & Benton,Animal Car,,2008B,aufziehauto 1 jahr,,„ÄêAuto aufziehbar„Äë: Dr√ºcken Sie einfach leicht ...
4,B094DGRV7D,DE,PLAYMOBIL - 70522 - Cavaliere mit grauem Pony,7.17,PLAYMOBIL,Nicht Zutreffend.,OneSize,70522,Polypropylen,,Inhalt: 1 St√ºck


In [50]:
id2title_dict = defaultdict(str)
for i, product in tqdm(products_train.iterrows(), total=len(products_train)):
    k = f"{product['id']}_{product['locale']}"
    if type(product['title']) != str:
        v = ''
    else:
        v = product['title']
    id2title_dict[k] = v
id2title_dict

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1551057/1551057 [01:09<00:00, 22315.86it/s]


defaultdict(str,
            {'B005ZSSN10_DE': 'RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wolfram Profi Dartpfeile Set mit Flights und Sch√§fte',
             'B08PRYN6LD_DE': 'Simply Keto Lower Carb* Schokodrops ohne Zuckerzusatz (750g) - Vollmilch Schoko Drops zum Naschen oder Backen - Ges√º√üt mit Erythrit statt Zucker - Ideal f√ºr Low-Carb & Ketogene Ern√§hrung',
             'B09MBZJ48V_DE': 'Sennheiser 508377 PC 5.2 Chat, Stilvolles Multi-Plattform On-Ear Headset PC, Kopfh√∂rer mit Kabel & Unterricht, f√ºr Laptop, Telefon & PC & EPOS I Sennheiser PC 8 USB Headset, Schwarz',
             'B08ZN6F26S_DE': 'AmyBenton Auto ab 1 2 3 ahre - Baby Aufziehbares Auto ab 1 Jahr - 4pcs Spielzeugautos Set f√ºr M√§dchen',
             'B094DGRV7D_DE': 'PLAYMOBIL - 70522 - Cavaliere mit grauem Pony',
             'B09JNNBDH5_DE': 'URBZUE Handw√§rmer, 10000mAh USB aufladbar und Digitalanzeige, elektrischer Taschenw√§rmer mit Mehreren Heizstufen von 40-60 ¬∞ C, tragbares warmes Wintereschenk f√

In [51]:
len(id2title_dict)

1551057

In [58]:
pred_titles = []

for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    pred_titles.append(id2title_dict[f"{row['next_item_prediction']}_{row['locale']}"])

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 56421/56421 [00:06<00:00, 8923.83it/s]


In [66]:
df_test['next_item_prediction'] = pred_titles

In [67]:
df_test.head(5)

Unnamed: 0,prev_items,locale,last_item,next_item_prediction,next_item_title
0,['B082DLM3NZ' 'B089X86H73'],ES,B089X86H73,Denver BTS-110 Black- Altavoz Port√°til. Sinton...,Denver BTS-110 Black- Altavoz Port√°til. Sinton...
1,['B071WPLND2' 'B08TMJ9SDZ' 'B07XRCLVYG'],ES,B07XRCLVYG,MAGEFESA Dynamic Olla a presi√≥n Super r√°pida d...,MAGEFESA Dynamic Olla a presi√≥n Super r√°pida d...
2,['B094V8G54H' 'B094V97YV8'],ES,B094V97YV8,,
3,['B0B3DQXY57' 'B0B6W3GGTM'],ES,B0B6W3GGTM,"Rizador Pelo Sin Calor, Rizador de Pelo Sin Ca...","Rizador Pelo Sin Calor, Rizador de Pelo Sin Ca..."
4,['B0765BPD7T' 'B00V4PQY3C' 'B09HWV4MBK'],ES,B09HWV4MBK,UCMDA Alfombrilla Rat√≥n con Coj√≠n de Mu√±eca Al...,UCMDA Alfombrilla Rat√≥n con Coj√≠n de Mu√±eca Al...


In [68]:
df_test[['locale', 'next_item_prediction']].to_parquet('submission_task3.parquet', engine='pyarrow')

In [69]:
df_test3 = pd.read_parquet('submission_task3.parquet')
df_test3.head(5)

Unnamed: 0,locale,next_item_prediction
0,ES,Denver BTS-110 Black- Altavoz Port√°til. Sinton...
1,ES,MAGEFESA Dynamic Olla a presi√≥n Super r√°pida d...
2,ES,
3,ES,"Rizador Pelo Sin Calor, Rizador de Pelo Sin Ca..."
4,ES,UCMDA Alfombrilla Rat√≥n con Coj√≠n de Mu√±eca Al...
