In [1]:
import gc
import polars as pl
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Load the train and test event tables. Later cells read the 'session',
# 'aid' and 'type' columns from these frames.
train, test = (
    pl.read_parquet(path)
    for path in ('../data/train.parquet', '../data/test.parquet')
)

In [2]:
# Build one "sentence" per session: the list of aids that occur in it,
# pooled over train and test so every aid seen in test is in the vocabulary.
sentences_df = (
    pl.concat([train, test])
    .groupby('session')
    .agg(pl.col('aid').alias('sentence'))
)

# Word2Vec is fed a plain Python list of lists below.
sentences = sentences_df.get_column('sentence').to_list()

# The grouped frame is no longer needed once the lists are materialised.
del sentences_df
gc.collect()

0

In [5]:
%%time

w2vec = Word2Vec(sentences=sentences, vector_size= 64, window = 3, negative = 8, ns_exponent = 0.2, sg = 1, min_count=1, workers=4)

CPU times: user 3h 18min 54s, sys: 1min 34s, total: 3h 20min 29s
Wall time: 1h 58min 42s


In [6]:
%%time

from annoy import AnnoyIndex

aid2idx = {aid: i for i, aid in enumerate(w2vec.wv.index_to_key)}
index = AnnoyIndex(64, 'euclidean')

for aid, idx in aid2idx.items():
    index.add_item(idx, w2vec.wv.vectors[idx])
    
index.build(32)

CPU times: user 2min 28s, sys: 7.09 s, total: 2min 35s
Wall time: 42.5 s


True

In [7]:
import pandas as pd
import numpy as np

from collections import defaultdict
import collections

session_types = ['clicks', 'carts', 'orders']

# Convert the polars frame to pandas once and group it once; both per-session
# series are derived from the same grouping. (A freshly converted frame already
# has a default RangeIndex, so no index reset is needed.)
test_by_session = test.to_pandas().groupby('session')

# Per session: list of aids and the parallel list of event types, in row order.
test_session_AIDs = test_by_session['aid'].apply(list)
test_session_types = test_by_session['type'].apply(list)

# Drop the handle so the intermediate pandas copy of `test` can be freed.
del test_by_session

In [8]:
labels = []

# Maximum number of aids emitted per session.
N_PREDICTIONS = 20

# Weight applied per event type code.
# NOTE(review): presumably 0/1/2 = clicks/carts/orders -- confirm against the data prep.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

session_num = len(test_session_AIDs)

for AIDs, types in zip(test_session_AIDs[:session_num], test_session_types[:session_num]):
    if len(AIDs) >= N_PREDICTIONS:
        # Enough events in the session: no candidate generation, just rank the
        # session's own aids. Weights grow from ~0.07 to 1 along the list, so
        # later events count more; each is scaled by its event-type multiplier.
        # NOTE(review): the length check is on raw events, so a session with
        # many repeats can still yield fewer than N_PREDICTIONS unique aids.
        weights = np.logspace(0.1, 1, len(AIDs), base=2, endpoint=True) - 1
        aids_temp = defaultdict(lambda: 0)
        for aid, w, t in zip(AIDs, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]

        sorted_aids = [k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:N_PREDICTIONS])
    else:
        # Not enough events: pad with word2vec nearest neighbours.
        # Reverse (last event first) and de-duplicate, keeping first occurrence.
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        session_aids = set(AIDs)

        # Query neighbours for up to the 3 leading (most recent) aids.
        recent_len = max(min(3, len(AIDs)), 1)

        # Neighbours requested per query aid; +2 over-fetches to leave room for
        # the query item itself and a little overlap.
        AIDs_num = round((N_PREDICTIONS - len(AIDs)) / recent_len) + 2

        # Collect neighbours, skipping anything already in the session. This
        # also removes the query item wherever Annoy ranks it, and prevents
        # duplicate aids in the final label list.
        nns_it = []
        for recent_aid in AIDs[:recent_len]:
            for i in index.get_nns_by_item(aid2idx[recent_aid], AIDs_num):
                neighbor = w2vec.wv.index_to_key[i]
                if neighbor not in session_aids:
                    nns_it.append(neighbor)

        # Neighbours returned for more than one query aid go first, then the
        # rest; Counter preserves first-seen order within each group.
        neighbor_counts = collections.Counter(nns_it)
        nns_repeated = [item for item, count in neighbor_counts.items() if count > 1]
        nns_once = [item for item, count in neighbor_counts.items() if count == 1]

        # Session aids first, then neighbour candidates, capped at N_PREDICTIONS.
        nns = (nns_repeated + nns_once)[:N_PREDICTIONS]
        labels.append((AIDs + nns)[:N_PREDICTIONS])

In [9]:
# One space-separated string of aids per test session.
labels_as_strings = [' '.join(map(str, lls)) for lls in labels]

predictions = pd.DataFrame({'session_type': test_session_AIDs.index, 'labels': labels_as_strings})

# The same label list is emitted for every event type; only the
# "<session>_<type>" key differs between the three copies.
prediction_dfs = []
for st in session_types:
    modified_predictions = predictions.assign(
        session_type=predictions['session_type'].astype('str') + f'_{st}'
    )
    prediction_dfs.append(modified_predictions)

submission = pd.concat(prediction_dfs, ignore_index=True)
# NOTE(review): this writes to 'sample_submission.csv' in the data directory and
# would overwrite an existing file of that name -- confirm that is intended.
submission.to_csv('../data/sample_submission.csv', index=False)

# Free the intermediate structures; only `submission` is kept.
del labels, labels_as_strings, predictions, prediction_dfs
gc.collect()

0