In [15]:
import polars as pl
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from collections import defaultdict
from collections import Counter
import gc
import glob

# Run mode switch: True -> validation split, False -> full dataset.
is_validation = True

# Each mode selects a (parquet glob, word2vec model file) pair.
if is_validation:
    data_path, word2vec_model = './val_data/*_parquet/*', './model/val_w2v.model'
else:
    data_path, word2vec_model = './data/*_parquet/*', './model/w2v.model'

def read_file_into_mem(f):
    """Read one parquet file and return it with compact `ts` and `type` columns.

    `ts` is divided by 1000 and cast to Int32 (presumably milliseconds ->
    seconds; confirm against the raw data). `type` is mapped through the
    module-level `type_labels` dict and cast to Int8.

    Note: `type_labels` is looked up when the function is called, not when it
    is defined, so it must exist in the notebook namespace before the first call.
    """
    ts_expr = (pl.col('ts') / 1000).cast(pl.Int32).alias('ts')
    type_expr = (
        pl.col('type')
        .apply(lambda label: type_labels[label])
        .cast(pl.Int8)
        .alias('type')
    )
    frame = pl.read_parquet(f)
    return frame.with_columns([ts_expr, type_expr])

## In this notebook we train a word2vec model on aids, so that each aid can be converted to a vector in the later training phase
### Loading training data

In [6]:
%%time
# init variables
type_labels = {'clicks':0, 'carts':1, 'orders':2}
df = None
# grab files; sorted() because glob returns matches in arbitrary order,
# which would otherwise make the row order of df differ between runs
files = sorted(glob.glob(data_path))
# fail fast with a clear message instead of leaving df as None for later cells
if not files:
    raise FileNotFoundError(f"No parquet files matched {data_path!r}")
# log
print(f"Total loaded {len(files)} files, start to load into memory")
# load each file into memory and stack it under the rows read so far
for f in files:
    chunk = read_file_into_mem(f)
    df = chunk if df is None else df.vstack(chunk)
df

Total loaded 120 files, start to load into memory
CPU times: user 21.9 s, sys: 9.01 s, total: 30.9 s
Wall time: 34.7 s


session,aid,ts,type
i32,i32,i32,i8
11098528,11830,1661119200,0
11098529,1105029,1661119200,0
11098530,264500,1661119200,0
11098530,264500,1661119288,0
11098530,409236,1661119369,0
11098530,409236,1661119441,0
11098530,409236,1661120165,0
11098530,409236,1661120532,1
11098531,452188,1661119200,0
11098531,1239060,1661119227,0


### Group data by session

In [7]:
%%time
# One row per session: gather that session's aids into a list column named 'sentence'.
sentence_expr = pl.col('aid').alias('sentence')
sentences_df = df.groupby('session').agg([sentence_expr])

CPU times: user 13.6 s, sys: 2.24 s, total: 15.8 s
Wall time: 2.36 s


In [8]:
sentences_df

session,sentence
i32,list[i32]
1067968,"[962549, 654542, ... 621759]"
12215904,"[1513166, 1460571, ... 540804]"
2226272,"[1679951, 127183]"
5946336,"[373932, 576642, ... 319880]"
7133184,"[1382328, 838434, ... 720326]"
12651392,"[1836933, 894653, 823573]"
1333760,"[207783, 305361, ... 181624]"
2231616,"[1181049, 1319491]"
1388160,"[1477663, 445539, 445539]"
12322528,[567420]


In [9]:
# The raw event frame is no longer needed once sentences_df exists; drop the
# reference and force a collection so its memory is released before training.
# Note: re-running this cell raises NameError because df is already deleted.
del df
_ = gc.collect()

In [10]:
# Materialise the list column as plain Python lists (one list of aids per
# session), the iterable-of-token-lists form passed to Word2Vec below.
sentences = sentences_df.get_column('sentence').to_list()

In [11]:
len(sentences)

12899779

In [12]:
sentences[0]

[962549, 654542, 1182276, 1532535, 1325984, 701671, 376637, 1018338, 621759]

### Train a word to vec model

In [13]:
%%time
# Train aid embeddings; every session is treated as one "sentence" of aids.
w2vec = Word2Vec(
    sentences=sentences,
    vector_size=50,   # embedding dimension
    window=5,         # context window within a session
    sg=1,             # 1 = skip-gram
    ns_exponent=1,    # negative sampling distribution exponent
    sample=1e-3,      # down-sampling threshold for frequent aids
    min_count=1,      # keep every aid, even ones seen once
    epochs=10,
    workers=30,       # training threads
)

CPU times: user 2h 42min 43s, sys: 10.6 s, total: 2h 42min 54s
Wall time: 14min 21s


In [14]:
w2vec

<gensim.models.word2vec.Word2Vec at 0x7f7e1c28d340>

In [16]:
# Save the trained model to the path chosen in the config cell
# (word2vec_model differs between validation and full-dataset runs).
w2vec.save(word2vec_model)

In [18]:
%%time
# Map every aid in the vocabulary to its position in w2vec.wv.index_to_key.
aid2idx = {aid: i for i, aid in enumerate(w2vec.wv.index_to_key)}

# Show only the vocabulary size; echoing the whole dict would flood the output.
len(aid2idx)

CPU times: user 239 ms, sys: 35.9 ms, total: 275 ms
Wall time: 275 ms


509836

In [21]:
w2vec.wv.vectors[0]

array([ 0.00947055,  0.11907542,  0.09810802, -0.9399097 , -0.67572165,
       -0.64162326,  0.8951291 ,  1.0294598 , -1.0759444 , -1.0120237 ,
        0.5598956 , -0.50950134, -0.48135412,  0.49968833, -0.02785579,
       -0.5318959 ,  1.1839541 , -0.17174588, -0.75058633, -0.78881526,
        1.1445032 ,  0.7482379 ,  0.3780507 , -0.3281455 ,  0.5215576 ,
        0.05472098,  0.5621725 , -0.29557467, -0.74253094, -0.01433093,
       -0.57048124,  0.5189472 ,  0.5218271 , -0.312915  , -0.20674533,
        0.43123105, -0.04728171, -0.46908718, -0.8367278 , -0.28841847,
       -0.2606032 ,  0.21006046, -0.64531505, -0.13224085,  1.2341723 ,
        0.45935577, -0.62455   , -0.6435101 , -0.8026427 ,  0.1799721 ],
      dtype=float32)

In [None]:
# model = Word2Vec.load("word2vec.model")

In [14]:
# NOTE(review): this whole cell is commented out, yet later cells use `index`
# (`index.save(...)`, `index.get_nns_by_item(...)`), so they raise NameError on a
# fresh kernel. Before re-enabling it: `AnnoyIndex` is not imported in the visible
# imports cell, and the statements below are out of order -- `index` has to be
# constructed before `add_item` is called, with `build` as the last step.
# %%time
# aid2idx = {aid: i for i, aid in enumerate(w2vec.wv.index_to_key)}


# for aid, idx in aid2idx.items():
#     index.add_item(idx, w2vec.wv.vectors[idx])


# index = AnnoyIndex(50, 'euclidean')
# index.build(10)

CPU times: user 50.4 s, sys: 445 ms, total: 50.8 s
Wall time: 16.2 s


True

In [19]:
# save model and index file
# NOTE(review): `index` is not created by any live cell visible in this notebook
# (its only construction is commented out) -- confirm where it comes from, or this
# line fails on Restart & Run All.
index.save('otto.ann')

True

In [24]:
# Persist the aid -> index mapping so it can be reloaded alongside the saved index.
with open('aid2idx.pickle', mode='wb') as out_file:
    pickle.dump(aid2idx, out_file, pickle.HIGHEST_PROTOCOL)

### Create submission file

In [15]:
%%time
# NOTE(review): `test_df` is not defined by any cell visible in this notebook --
# confirm which cell loads it before a Restart & Run All.
# Convert to pandas and group ONCE; both per-session lists come from the same
# grouping (previously the conversion and group-by were executed twice).
test_sessions = test_df.to_pandas().reset_index(drop=True).groupby('session')
test_session_AIDs = test_sessions['aid'].apply(list)
test_session_types = test_sessions['type'].apply(list)
# Per-event-type multiplier applied to the recency weight when re-ranking.
# NOTE(review): the keys are type names; read_file_into_mem encodes `type` as
# Int8 codes (0/1/2), which would not match these keys -- confirm how test_df
# was loaded.
type_weight_multipliers = {'clicks': 0.5, 'carts': 9, 'orders': 0.5}


CPU times: user 41.5 s, sys: 678 ms, total: 42.2 s
Wall time: 42.1 s


In [None]:
%%time
labels = []

# zip() has no length, so pass total= to get a real progress bar with an ETA.
for AIDs, types in tqdm(zip(test_session_AIDs, test_session_types), total=len(test_session_AIDs)):

    # The session already has 20+ events: re-rank its own aids, no candidates needed.
    if len(AIDs) >= 20:
        # Position weights grow from 2**0.1 - 1 (~0.07) for the first event to
        # 2**1 - 1 (= 1.0) for the last one (assumes events are in chronological
        # order, so later events count more).
        weights = np.logspace(0.1, 1, len(AIDs), base=2, endpoint=True) - 1
        aid_scores = defaultdict(float)
        for aid, w, t in zip(AIDs, weights, types):
            aid_scores[aid] += w * type_weight_multipliers[t]

        # Highest accumulated score first.
        sorted_aids = [k for k, v in sorted(aid_scores.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:20])

    # Fewer than 20 events: top up with nearest neighbours in embedding space.
    else:
        # De-duplicate while keeping order, last event first.
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        nns = []
        # Query neighbours for the 5 most recent distinct aids.
        for n_aid in AIDs[:5]:
            item_idx = aid2idx.get(n_aid)
            if item_idx is None:
                # aid is not in the word2vec vocabulary -> nothing to query
                continue
            # Ask for 21 and drop the first hit (presumably the query item itself).
            nns += [w2vec.wv.index_to_key[i] for i in index.get_nns_by_item(item_idx, 21)[1:]]

        # Rank neighbours by how often they were returned, skipping aids the
        # session already contains so no label slot is wasted on a duplicate.
        already_in_session = set(AIDs)
        top_candidates = [
            aid for aid, _count in Counter(nns).most_common()
            if aid not in already_in_session
        ][:20]
        # Session aids first, then candidates, capped at 20 labels.
        labels.append((AIDs + top_candidates)[:20])

1225929it [03:00, 6683.33it/s]

In [None]:
%%time
# Render each session's label list as one space-separated string.
labels_as_strings = [' '.join(map(str, session_labels)) for session_labels in labels]

# One row per session, keyed by the session id taken from the grouped index.
predictions = pd.DataFrame({'session_type': test_session_AIDs.index, 'labels': labels_as_strings})

# The same labels are submitted for every event type; only the key suffix differs.
prediction_dfs = [
    predictions.assign(session_type=predictions['session_type'].astype('str') + f'_{st}')
    for st in ['clicks', 'carts', 'orders']
]
submission = pd.concat(prediction_dfs).reset_index(drop=True)
submission.to_csv('submission.csv', index=False)

In [None]:
!kaggle competitions submit -c otto-recommender-system -f ./submission.csv -m "word2vec-rec"