In [1]:
from tqdm import tqdm
from pathlib import Path

import cudf
import numpy as np
from annoy import AnnoyIndex
from gensim.models import KeyedVectors

In [2]:
# All directories are resolved relative to the notebook's working directory:
# the two input locations live under its parent, the scratch/output
# directory lives directly inside it.
_parent_dir = Path.cwd().parent

data_path = _parent_dir / 'preprocess/data'
model_path = _parent_dir / 'word2vec/data'
temp_path = Path.cwd() / 'data'

# Create the local output directory on first run; a no-op when it already
# exists (still raises if the path exists but is not a directory).
temp_path.mkdir(exist_ok=True)

## LB

In [3]:
# Locations used by the LB section: session input, trained word2vec model,
# and the directory this notebook reads/writes its candidate files in.
# NOTE(review): the input is taken from 'cv' while the model and output use
# 'lb' — confirm this mix is intentional.
lb_in = data_path / 'cv'
lb_model = model_path / 'lb'
lb_out = temp_path / 'lb'

# Make sure the output directory exists (no-op when already present).
lb_out.mkdir(exist_ok=True)

In [4]:
# Load the saved word vectors memory-mapped in read-only mode.
wv = KeyedVectors.load((lb_model / 'w2vec.wordvectors').as_posix(), mmap='r')

# Map every vocabulary key (aid) to its row in `wv.vectors`; the same row
# number doubles as the Annoy item id, so `wv.index_to_key[item_id]` maps a
# neighbour back to its aid.
aid2idx = {aid: i for i, aid in enumerate(wv.index_to_key)}

# Take the dimensionality from the loaded model rather than hard-coding it,
# so the index always matches whatever vector size the model was saved with.
index = AnnoyIndex(wv.vector_size, 'euclidean')

for idx in aid2idx.values():
    index.add_item(idx, wv.vectors[idx])

# Build the forest with 20 trees; kept as the last expression so the cell
# still displays the build result.
index.build(20)

True

In [5]:
# Number of nearest aids retrieved from the Annoy index per session vector.
N_NEIGHBORS = 20

# NOTE(review): this cell reads `cand.parquet` and overwrites the same file at
# the end, so it is not idempotent — re-running it merges against the already
# augmented candidates (and, because of the merge suffixes, would add a
# `w2v_rank_new` column instead of refreshing `w2v_rank`).
cand = cudf.read_parquet((lb_out / 'cand.parquet').as_posix())

data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Order events newest-first inside each session.
recency_order = dict(by=['session', 'ts'], ascending=[True, False], ignore_index=True)
data = data.sort_values(**recency_order)

# Keep only the most recent event per (session, aid). The result has to be
# assigned: `drop_duplicates` returns a new frame, so calling it without
# assignment leaves the duplicates in place and lets repeated aids inflate
# both the recency rank and their share of the session vector.
data = data.drop_duplicates(['session', 'aid'], keep='first')

# Re-sort while `ts` is still available so the recency rank below does not
# depend on the row order returned by `drop_duplicates`.
data = data.sort_values(**recency_order)
data = data.drop(['ts', 'type'], axis=1)

# Recency weight: 1 / log2(rank + 2), i.e. 1.0 for the newest aid of a
# session and decaying for older ones.
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

session_vectors = {}

new_cands = []
for session, group in tqdm(data.groupby('session')):
    session_aids = group['aid'].to_arrow().to_pylist()
    weights = group['wgt'].to_arrow().to_pylist()

    # Scale each aid's embedding by its recency weight.
    vectors = [
        np.array(index.get_item_vector(aid2idx[aid])) * weight
        for aid, weight in zip(session_aids, weights)
    ]

    # Session embedding = mean of the weighted aid embeddings.
    session_vector = np.mean(vectors, axis=0)
    session_vectors[session] = session_vector

    # Nearest aids to the session embedding; the list position is the rank.
    neighbors = index.get_nns_by_vector(session_vector, N_NEIGHBORS)

    new_cands.extend(
        {'session': session, 'candidate': wv.index_to_key[idx], 'w2v_rank': rank}
        for rank, idx in enumerate(neighbors)
    )
new_cands = cudf.DataFrame(new_cands)

# Outer merge: existing candidates gain a `w2v_rank` where they match, and
# neighbours that were not candidates yet are appended as new rows.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)
# -1 marks candidates that are not among the word2vec neighbours; rows that
# only come from word2vec have no `past` value and are set to False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)

sessions_vectors = cudf.DataFrame({
    'session': list(session_vectors.keys()),
    'vector': [list(v) for v in session_vectors.values()]
})

sessions_vectors.to_parquet((lb_out / 'sessions_vectors.parquet').as_posix())
cand.to_parquet((lb_out / 'cand.parquet').as_posix())

1783737it [11:52, 2503.19it/s]
