In [1]:
from pathlib import Path

import cudf
import cupy as cp
import numpy as np
from gensim.models import KeyedVectors

In [2]:
# Input folders of the sibling pipeline stages, resolved relative to the
# parent of the current working directory.
parent_dir = Path.cwd().parent
data_path = parent_dir / 'preprocess/data'
matrix_path = parent_dir / 'covisit/data'
w2v_path = parent_dir / 'word2vec/data'

# Local output folder of this notebook; created on first run.
temp_path = Path.cwd() / 'data'
temp_path.mkdir(exist_ok=True)

## LB

In [3]:
# Folders used by the LB section: session input, co-visitation matrices,
# word2vec vectors, and this notebook's own LB output.
# NOTE(review): `lb_in` points at the 'cv' sub-folder while every other LB
# path uses 'lb' - confirm this is intended.
lb_in = data_path / 'cv'
lb_model = matrix_path / 'lb'
lb_w2v = w2v_path / 'lb'
lb_out = temp_path / 'lb'

# Create the output folder on first run; an existing directory is left alone.
lb_out.mkdir(exist_ok=True)

In [4]:
# Session vectors stored as parquet in the LB output folder.
# NOTE(review): neither `sessions_vectors` nor `wv` is referenced by the
# cells visible below - confirm they are still needed.
sessions_vectors_file = lb_out / 'sessions_vectors.parquet'
sessions_vectors = cudf.read_parquet(sessions_vectors_file.as_posix())

# Word2vec keyed vectors, opened with mmap='r'.
wv_file = lb_w2v / 'w2vec.wordvectors'
wv = KeyedVectors.load(wv_file.as_posix(), mmap='r')

In [5]:
# Base candidate table that the following cells extend with co-visitation
# weight columns.
cand_file = lb_out / 'cand.parquet'
cand = cudf.read_parquet(cand_file.as_posix())

In [6]:
# Build the `wgt_all` feature: score the candidates listed in covisit.parquet
# for every test session (all event types) and attach them to `cand`.
covisit_all = cudf.read_parquet((lb_model / 'covisit.parquet').as_posix())

data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
# Downstream artifacts must be regenerated after this change.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Left-join the matrix on `aid`; the matrix-side weight column receives the
# '_all' suffix (presumably it is named `wgt` - verify against the parquet).
data = data.merge(
    covisit_all.drop('rank', axis=1),
    how='left',
    suffixes=['', '_all'],
    on='aid'
)
data['wgt_all'] = data['wgt'] * data['wgt_all']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_all'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)

In [7]:
# Build the `wgt_cl2cl` feature: score the candidates listed in
# covisit_cl2cl.parquet from the click events of every test session and
# attach them to `cand`.
covisit_cl2cl = cudf.read_parquet((lb_model / 'covisit_cl2cl.parquet').as_posix())
data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)

# Only click events feed this feature.
data = data.loc[data['type'] == 'clicks']
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Join against the cl2cl matrix loaded above. Merging `covisit_all` here
# would just reproduce the all-events weights under the `wgt_cl2cl` name.
# NOTE(review): assumes covisit_cl2cl.parquet has the same columns as
# covisit.parquet (including `rank`) - confirm.
# Downstream artifacts must be regenerated after this change.
data = data.merge(
    covisit_cl2cl.drop('rank', axis=1),
    how='left',
    suffixes=['', '_cl2cl'],
    on='aid'
)
data['wgt_cl2cl'] = data['wgt'] * data['wgt_cl2cl']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_cl2cl'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)
cand['wgt_cl2cl'] = cand['wgt_cl2cl'].fillna(-1)

In [8]:
# Persist the candidate table built for clicks.
click_cand_file = lb_out / 'click_cand.parquet'
cand.to_parquet(click_cand_file.as_posix())

In [9]:
# Reload the base candidate table, replacing the `cand` frame extended by
# the preceding cells, so the next feature set starts from the stored table.
cand_file = lb_out / 'cand.parquet'
cand = cudf.read_parquet(cand_file.as_posix())

In [10]:
# Build the `wgt_all` feature: score the candidates listed in covisit.parquet
# for every test session (all event types) and attach them to `cand`.
covisit_all = cudf.read_parquet((lb_model / 'covisit.parquet').as_posix())

data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
# Downstream artifacts must be regenerated after this change.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Left-join the matrix on `aid`; the matrix-side weight column receives the
# '_all' suffix (presumably it is named `wgt` - verify against the parquet).
data = data.merge(
    covisit_all.drop('rank', axis=1),
    how='left',
    suffixes=['', '_all'],
    on='aid'
)
data['wgt_all'] = data['wgt'] * data['wgt_all']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_all'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)

In [11]:
# Build the `wgt_cl2cr` feature: score the candidates listed in
# covisit_cl2cr.parquet from the click events of every test session and
# attach them to `cand`.
covisit_cl2cr = cudf.read_parquet((lb_model / 'covisit_cl2cr.parquet').as_posix())
data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)

# Only click events feed this feature.
data = data.loc[data['type'] == 'clicks']
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Join against the cl2cr matrix loaded above. Merging `covisit_all` here
# would just reproduce the all-events weights under the `wgt_cl2cr` name.
# NOTE(review): assumes covisit_cl2cr.parquet has the same columns as
# covisit.parquet (including `rank`) - confirm.
# Downstream artifacts must be regenerated after this change.
data = data.merge(
    covisit_cl2cr.drop('rank', axis=1),
    how='left',
    suffixes=['', '_cl2cr'],
    on='aid'
)
data['wgt_cl2cr'] = data['wgt'] * data['wgt_cl2cr']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_cl2cr'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)
cand['wgt_cl2cr'] = cand['wgt_cl2cr'].fillna(-1)

In [12]:
# Build the `wgt_cr&or` feature: score the candidates listed in
# covisit_cr&or.parquet from the cart and order events of every test session
# and attach them to `cand`.
covisit_cror = cudf.read_parquet((lb_model / 'covisit_cr&or.parquet').as_posix())
data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)

# Only cart and order events feed this feature.
data = data.loc[data['type'].isin(['carts', 'orders'])]
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Join against the cr&or matrix loaded above. Merging `covisit_all` here
# would just reproduce the all-events weights under the `wgt_cr&or` name.
# NOTE(review): assumes covisit_cr&or.parquet has the same columns as
# covisit.parquet (including `rank`) - confirm.
# Downstream artifacts must be regenerated after this change.
data = data.merge(
    covisit_cror.drop('rank', axis=1),
    how='left',
    suffixes=['', '_cr&or'],
    on='aid'
)
data['wgt_cr&or'] = data['wgt'] * data['wgt_cr&or']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_cr&or'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)
cand['wgt_cl2cr'] = cand['wgt_cl2cr'].fillna(-1)
cand['wgt_cr&or'] = cand['wgt_cr&or'].fillna(-1)

In [13]:
# Persist the candidate table built for carts.
cart_cand_file = lb_out / 'cart_cand.parquet'
cand.to_parquet(cart_cand_file.as_posix())

In [14]:
# Reload the base candidate table, replacing the `cand` frame extended by
# the preceding cells, so the next feature set starts from the stored table.
cand_file = lb_out / 'cand.parquet'
cand = cudf.read_parquet(cand_file.as_posix())

In [15]:
# Build the `wgt_all` feature: score the candidates listed in covisit.parquet
# for every test session (all event types) and attach them to `cand`.
covisit_all = cudf.read_parquet((lb_model / 'covisit.parquet').as_posix())

data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
# Downstream artifacts must be regenerated after this change.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Left-join the matrix on `aid`; the matrix-side weight column receives the
# '_all' suffix (presumably it is named `wgt` - verify against the parquet).
data = data.merge(
    covisit_all.drop('rank', axis=1),
    how='left',
    suffixes=['', '_all'],
    on='aid'
)
data['wgt_all'] = data['wgt'] * data['wgt_all']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_all'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)

In [16]:
# Build the `wgt_cr&or` feature: score the candidates listed in
# covisit_cr&or.parquet from the cart and order events of every test session
# and attach them to `cand`.
covisit_cror = cudf.read_parquet((lb_model / 'covisit_cr&or.parquet').as_posix())
data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)

# Only cart and order events feed this feature.
data = data.loc[data['type'].isin(['carts', 'orders'])]
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Join against the cr&or matrix loaded above. Merging `covisit_all` here
# would just reproduce the all-events weights under the `wgt_cr&or` name.
# NOTE(review): assumes covisit_cr&or.parquet has the same columns as
# covisit.parquet (including `rank`) - confirm.
# Downstream artifacts must be regenerated after this change.
data = data.merge(
    covisit_cror.drop('rank', axis=1),
    how='left',
    suffixes=['', '_cr&or'],
    on='aid'
)
data['wgt_cr&or'] = data['wgt'] * data['wgt_cr&or']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_cr&or'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)
cand['wgt_cr&or'] = cand['wgt_cr&or'].fillna(-1)

In [17]:
# Build the `wgt_or2or` feature: score the candidates listed in
# covisit_or2or.parquet from the order events of every test session and
# attach them to `cand`.
covisit_or2or = cudf.read_parquet((lb_model / 'covisit_or2or.parquet').as_posix())
data = cudf.read_parquet((lb_in / 'test_sessions.parquet').as_posix())
data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

# Within each session, order rows by descending `ts`.
data = data.sort_values(
    ['session', 'ts'], ascending=[True, False], ignore_index=True
)

# Only order events feed this feature.
data = data.loc[data['type'] == 'orders']
data = data.drop(['ts', 'type'], axis=1)

# Keep only the first row (highest `ts`) of every (session, aid) pair.
# drop_duplicates returns a new frame, so its result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): relies on drop_duplicates keeping the surviving rows in their
# existing order (pandas semantics) - confirm for the installed cuDF version.
data = data.drop_duplicates(
    ['session', 'aid'], keep='first'
).reset_index(drop=True)

# Positional weight: position 0 gets 1.0, then 1 / log2(position + 2).
data['n'] = data.groupby('session').cumcount()
data['wgt'] = 1 / np.log2(data['n'] + 2)
data = data.drop('n', axis=1)

# Join against the or2or matrix loaded above. Merging `covisit_all` here
# would just reproduce the all-events weights under the `wgt_or2or` name.
# NOTE(review): assumes covisit_or2or.parquet has the same columns as
# covisit.parquet (including `rank`) - confirm.
# Downstream artifacts must be regenerated after this change.
data = data.merge(
    covisit_or2or.drop('rank', axis=1),
    how='left',
    suffixes=['', '_or2or'],
    on='aid'
)
data['wgt_or2or'] = data['wgt'] * data['wgt_or2or']
new_cands = data.drop(['aid', 'wgt'], axis=1)

# Sum the weighted scores per (session, candidate) and keep the 20
# highest-scoring candidates of each session.
new_cands = new_cands.groupby(['session', 'candidate']).sum().reset_index()
new_cands = new_cands.sort_values(
    ['session', 'wgt_or2or'], ascending=[True, False]
)
new_cands['rank'] = new_cands.groupby('session').candidate.cumcount()
new_cands = new_cands[new_cands['rank'] < 20].reset_index(drop=True)
new_cands = new_cands.drop('rank', axis=1)

# The outer merge keeps candidates that exist on only one side.
cand = cand.merge(
    new_cands,
    on=['session', 'candidate'],
    how='outer',
    suffixes=('', '_new')
).reset_index(drop=True)

# Rows that came from only one side have nulls in the other side's columns;
# replace them with the sentinels -1 / False.
cand['w2v_rank'] = cand['w2v_rank'].fillna(-1)
cand['past'] = cand['past'].fillna(False)
cand['wgt_all'] = cand['wgt_all'].fillna(-1)
cand['wgt_cr&or'] = cand['wgt_cr&or'].fillna(-1)
# Fill `wgt_or2or` in place. Assigning to a misspelled `wgt_cor2or` would
# leave the nulls in `wgt_or2or` and add a stray extra column to the output.
# NOTE(review): any consumer that reads `wgt_cor2or` from order_cand.parquet
# has to switch to `wgt_or2or`.
cand['wgt_or2or'] = cand['wgt_or2or'].fillna(-1)

In [18]:
# Persist the candidate table built for orders.
order_cand_file = lb_out / 'order_cand.parquet'
cand.to_parquet(order_cand_file.as_posix())