In [None]:
!pip install polars

In [2]:
import polars as pl
pl.Config.set_tbl_rows(50)
import numpy as np
import glob
from gensim.models import Word2Vec
from annoy import AnnoyIndex
import gc

In [3]:
def make_df_vector_MF(phase, method, ndim):
    """
    Convert the user and item embeddings obtained from matrix factorization
    into DataFrames and save them as parquet files.

    Reads ``input/embedding_{method}_{phase}/user_factors.npy`` and
    ``item_factors.npy`` and writes ``user_vec_{method}_{phase}.parquet`` and
    ``item_vec_{method}_{phase}.parquet`` to the working directory.

    Each output frame holds an Int32 id column (``session`` for users, ``aid``
    for items; the id is the row index of the factor matrix), ``ndim`` vector
    columns (``user_vec{i}`` / ``item_vec{i}``) and the L2 norm of the vector
    (``user_vec_norm`` / ``item_vec_norm``).

    ``ndim`` is used as the number of vector columns, so it has to equal the
    width of the stored factor matrices.
    """
    def factors_to_frame(kind, id_col):
        # kind is 'user' or 'item'; row i of the matrix is the vector of id i.
        matrix = np.load(f'input/embedding_{method}_{phase}/{kind}_factors.npy')
        vec_cols = [f'{kind}_vec{i}' for i in range(ndim)]
        ids = pl.DataFrame({id_col: list(range(len(matrix)))})
        frame = pl.concat([ids, pl.DataFrame(matrix, columns=vec_cols)], how='horizontal')
        # Euclidean norm of every row: sqrt of the sum of squared components.
        squares = [pl.col(name) * pl.col(name) for name in vec_cols]
        frame = frame.with_column(np.sqrt(np.sum(squares)).alias(f'{kind}_vec_norm'))
        return frame.with_column(pl.col(id_col).cast(pl.Int32))

    factors_to_frame('user', 'session').write_parquet(f'user_vec_{method}_{phase}.parquet')
    factors_to_frame('item', 'aid').write_parquet(f'item_vec_{method}_{phase}.parquet')
        
def make_df_top20_MF(phase, df_log, method, ndim):
    """
    From the item embeddings obtained by matrix factorization, compute for each
    item up to 20 items with the highest cosine similarity and save them.
    The matrix factorization itself is computed in
    user-item_matrix_factorization.ipynb.

    Reads ``input/embedding_{method}_{phase}/item_factors.npy`` and writes
    ``top20_{method}_{phase}.parquet`` with Int32 columns ``aid`` and
    ``top_20`` (one row per (item, neighbour) pair). Neighbours are looked up
    only for the aids that occur in ``df_log['aid']``; the Annoy index itself
    is built over every row of the factor matrix, with the row index as id.
    """
    item_factor = np.load(f'input/embedding_{method}_{phase}/item_factors.npy')

    index = AnnoyIndex(ndim, 'angular')
    for aid, vector in enumerate(item_factor):
        index.add_item(aid, vector)
    index.build(100)

    aids = []
    neighbours = []
    for aid in df_log['aid'].unique():
        aids.append(aid)
        # Ask for 21 hits and drop the first one
        # (presumably the query item itself - TODO confirm for approximate search).
        neighbours.append(index.get_nns_by_item(aid, 21)[1:])

    top20 = pl.DataFrame({'aid': aids, 'top_20': neighbours}).explode('top_20')
    top20 = top20.with_column(pl.col('*').cast(pl.Int32))
    top20.write_parquet(f'top20_{method}_{phase}.parquet')

def make_df_top20_w2vec(phase):
    """
    From the item embeddings obtained by word2vec, compute for each item up to
    20 items with the highest cosine similarity and save them.
    The word2vec model is trained with the following public notebook:
    https://www.kaggle.com/code/radek1/word2vec-how-to-training-and-submission

    Reads ``input/word2vec_{phase}/word2vec.model`` and writes
    ``top20_w2vec_{phase}.parquet`` with Int32 columns ``aid`` and ``top_20``
    (one row per (item, neighbour) pair).

    The dimensionality of the Annoy index is taken from the loaded model
    instead of being hard-coded, so models trained with any ``vector_size``
    can be used.
    """
    w2vec = Word2Vec.load(f'input/word2vec_{phase}/word2vec.model')
    # The index dimension has to equal the width of the vectors added below.
    index_w2vec = AnnoyIndex(w2vec.wv.vector_size, 'angular')

    # Annoy ids are the positions in the model vocabulary, not the aids.
    aid2idx = {aid: i for i, aid in enumerate(w2vec.wv.index_to_key)}
    for aid, idx in aid2idx.items():
        index_w2vec.add_item(idx, w2vec.wv.vectors[idx])
    index_w2vec.build(100)

    # Ask for 21 hits and drop the first one (presumably the query item itself),
    # then map the vocabulary positions back to aids.
    top20_w2vec = {
        aid: [w2vec.wv.index_to_key[i] for i in index_w2vec.get_nns_by_item(idx, 21)[1:]]
        for aid, idx in aid2idx.items()
    }
    top20_w2vec = pl.DataFrame({'aid': list(top20_w2vec.keys()), 'top_20': list(top20_w2vec.values())}).explode('top_20')
    top20_w2vec = top20_w2vec.with_column(pl.col('*').cast(pl.Int32))
    top20_w2vec.write_parquet(f'top20_w2vec_{phase}.parquet')


In [4]:
def add_log_recency_score(df, exp_min):
    """
    Add a ``log_recency_score`` column that gives a larger weight to the items
    a user clicked, added to cart or ordered most recently.
    Taken from https://www.kaggle.com/code/radek1/polars-proof-of-concept-lgbm-ranker

    Within every session an exponent is interpolated linearly from ``exp_min``
    (first row of the session) to 1 (last row of the session) and the score is
    ``2**exponent - 1`` cast to Float32. Rows are taken in the order they have
    in ``df`` (presumably chronological - verify against caller).

    For a session with a single row the interpolation divides by zero and
    yields NaN; the trailing ``fill_nan(1)`` turns it into 1. Note that
    ``fill_nan`` is applied to the whole frame, not only to the new column.

    Returns ``df`` with the extra ``log_recency_score`` column; the two helper
    columns created here are dropped again.
    """
    df = df.select([
        pl.col('*'),
        # Position counted from the end of the session: last row -> 0.
        pl.col('session').cumcount().reverse().over('session').alias('action_num_reverse_chrono'),
        # Number of rows in the session.
        pl.col('session').count().over('session').alias('session_length'),
    ])

    length = df['session_length']
    steps_from_end = df['action_num_reverse_chrono']
    linear_interpolation = exp_min + ((1-exp_min) / (length-1)) * (length-steps_from_end-1)

    score = pl.Series(2**linear_interpolation - 1).alias('log_recency_score').cast(pl.Float32)
    scored = df.with_columns([score])
    return scored.fill_nan(1).drop(['action_num_reverse_chrono', 'session_length'])

In [5]:
# Session log of the local-validation split; it is used as the input log of the 'train' phase.
df_test = pl.read_parquet('input/otto_train_and_test_data_for_local_validation/test.parquet')

# Build and save the top-20 neighbour tables and the embedding frames for the 'train' phase.
# The last argument is the embedding width and must equal the number of columns of the
# stored factor matrices (the same 101 / 102 / 50 values are repeated inside make_candidate).
make_df_top20_MF('train', df_test, 'bpr', 101)
make_df_top20_MF('train', df_test, 'lmf', 102)
make_df_vector_MF('train', 'bpr', 101)
make_df_vector_MF('train', 'lmf', 102)
make_df_vector_MF('train', 'als', 50)
make_df_top20_w2vec('train')

# Load the parquet files written by the calls above.
top20_w2vec_train = pl.read_parquet('top20_w2vec_train.parquet')
top20_bpr_train = pl.read_parquet('top20_bpr_train.parquet')
user_vec_bpr_train = pl.read_parquet('user_vec_bpr_train.parquet')
item_vec_bpr_train = pl.read_parquet('item_vec_bpr_train.parquet')
top20_lmf_train = pl.read_parquet('top20_lmf_train.parquet')
user_vec_lmf_train = pl.read_parquet('user_vec_lmf_train.parquet')
item_vec_lmf_train = pl.read_parquet('item_vec_lmf_train.parquet')
user_vec_als_train = pl.read_parquet('user_vec_als_train.parquet')
item_vec_als_train = pl.read_parquet('item_vec_als_train.parquet')

In [None]:
# A co-visitation matrix is obtained by counting, for every pair of items, how many times
# both were clicked, added to cart or ordered by the same user (the co-occurrence count).
# It tells us, for example, which items a user who clicked a given item is likely to click.
# The matrices were computed with the public notebook below, keeping for each item up to
# 20 items with the largest co-occurrence count.
# https://www.kaggle.com/code/cdeotte/candidate-rerank-model-lb-0-575
DISK_PIECES = 4
VER = 6

# Every table gets the same clean-up: drop the pandas index column, rename the
# neighbour column aid_y to aid and cast both item columns to Int32.
covisit_clicks_train = (
    pl.concat([
        pl.read_parquet(f'input/covisitation_matrix_train/top_20_clicks_v{VER}_{i}.pqt')
        for i in range(DISK_PIECES)
    ])
    .drop(['__index_level_0__'])
    .rename({'aid_y':'aid'})
    .with_column(pl.col(['aid_x', 'aid']).cast(pl.Int32))
)
covisit_carts_orders_train = (
    pl.concat([
        pl.read_parquet(f'input/covisitation_matrix_train/top_20_carts_orders_v{VER}_{i}.pqt')
        for i in range(DISK_PIECES)
    ])
    .drop(['__index_level_0__'])
    .rename({'aid_y':'aid'})
    .with_column(pl.col(['aid_x', 'aid']).cast(pl.Int32))
)
# The buy2buy matrix is stored as a single piece.
covisit_buy2buy_train = (
    pl.read_parquet(f'input/covisitation_matrix_train/top_20_buy2buy_v{VER}_0.pqt')
    .drop(['__index_level_0__'])
    .rename({'aid_y':'aid'})
    .with_column(pl.col(['aid_x', 'aid']).cast(pl.Int32))
)

In [13]:
def user_item_similarity(phase, df, user_vector, item_vector, ndim, name):
    """
    Compute the cosine similarity between user and item from the embeddings
    obtained by matrix factorization and save it.

    ``df`` supplies the (session, aid) pairs. ``user_vector`` is joined on
    ``session`` and must provide ``user_vec0..user_vec{ndim-1}`` and
    ``user_vec_norm``; ``item_vector`` is joined on ``aid`` and must provide
    ``item_vec0..item_vec{ndim-1}`` and ``item_vec_norm``. Both joins are left
    joins, so pairs without an embedding stay in the output.

    The result (columns ``session``, ``aid``, ``user-item_similarity``) is
    written to ``user_item_similarity_{name}_{phase}.parquet``.
    """
    dot_product = np.sum([pl.col(f'item_vec{i}')*pl.col(f'user_vec{i}') for i in range(ndim)])
    cosine_similarity = dot_product / pl.col('user_vec_norm') / pl.col('item_vec_norm')

    pairs = df[['session', 'aid']]
    sessions = df['session'].unique().to_list()

    # The sessions are processed in 10 chunks so the wide joined frame
    # (2 * ndim vector columns) only ever exists for a part of the data.
    results = []
    for chunk in np.array_split(sessions, 10):
        part = pairs.filter(pl.col('session').is_in(list(chunk)))
        part = part.join(user_vector, on='session', how='left')
        part = part.join(item_vector, on='aid', how='left')
        part = part.with_column(cosine_similarity.alias('user-item_similarity'))
        results.append(part[['session', 'aid', 'user-item_similarity']])

    pl.concat(results).write_parquet(f'user_item_similarity_{name}_{phase}.parquet')

In [42]:
def _save_user_item_similarities(phase, candidates, embeddings, name):
    """Write the user-item cosine similarity of ``candidates`` once per embedding (bpr, lmf, als)."""
    for suffix, (user_vector, item_vector, ndim) in embeddings:
        user_item_similarity(phase, candidates, user_vector, item_vector, ndim, f'{name}{suffix}')


def _top20_candidates(df, top20, score_col):
    """Per session, keep the 20 neighbour items (from ``top20``) with the largest summed recency score."""
    scored = df.join(top20, on='aid', how='left').groupby(['session', 'top_20'], maintain_order=True).agg(pl.col('log_recency_score').sum().alias(score_col))
    scored = scored.sort(['session', score_col], reverse=[False, True]).rename({'top_20':'aid'})
    return scored.groupby('session', maintain_order=True).head(20)


def make_candidate(phase, action_type, df, covisit_clicks, covisit_carts_orders, covisit_buy2buy,
                   top20_w2vec, top20_bpr, top20_lmf, user_vector_bpr, item_vector_bpr, user_vector_lmf, item_vector_lmf,
                  user_vector_als, item_vector_als):
    """
    For every user, select the candidate items the user is expected to click,
    add to cart or order next. The selected items are:
    (1) items the user clicked, added to cart or ordered in the past
    (2) items with a large co-occurrence count with the items in (1)
    (3) items with a high cosine similarity to the items in (1)

    Parameters
    ----------
    phase : str
        Tag used in every output file name (the notebook passes 'train' / 'test').
    action_type : str
        'click', 'cart' or 'order'; selects the recency weighting and the
        co-visitation matrices. Any other value raises ``KeyError``.
    df : session log with columns session, aid, ts and type.
    covisit_clicks, covisit_carts_orders, covisit_buy2buy :
        Co-visitation tables with columns aid_x, aid and wgt.
    top20_w2vec, top20_bpr, top20_lmf :
        Neighbour tables with columns aid and top_20.
    user_vector_*, item_vector_* :
        Embedding frames in the layout expected by ``user_item_similarity``.

    Nothing is returned. The function writes
    ``aids_{covisit|w2vec|bpr|lmf}_{action_type}_{phase}.parquet`` and, for the
    user log and each of those four candidate sets, one
    ``user_item_similarity_*`` file per embedding (bpr without suffix,
    ``_lmf`` and ``_als``).
    """

    ndim_als = 50
    ndim_lmf = 102
    ndim_bpr = 101
    # (file-name suffix, (user vectors, item vectors, embedding width)).
    # The width is passed explicitly for every embedding: user_item_similarity
    # has no default for it, and leaving it out raises TypeError.
    embeddings = (
        ('', (user_vector_bpr, item_vector_bpr, ndim_bpr)),
        ('_lmf', (user_vector_lmf, item_vector_lmf, ndim_lmf)),
        ('_als', (user_vector_als, item_vector_als, ndim_als)),
    )

    exp_min = {'click':0.05, 'cart':0.1, 'order':0.4}
    df_scored = add_log_recency_score(df, exp_min[action_type])

    # One row per (session, aid), keeping the last occurrence; [::-1] reverses the row order.
    df = df_scored.drop(['type', 'ts']).unique(subset=['session', 'aid'], keep='last')[::-1]
    _save_user_item_similarities(phase, df, embeddings, 'userlog')

    # Select items with a large co-occurrence count with the items the user viewed in the past
    if action_type == 'click':
        aids_covisit = df.rename({'aid':'aid_x'}).join(covisit_clicks, on='aid_x', how='left').drop_nulls(subset='aid')
        aids_covisit = aids_covisit.groupby(['session', 'aid'], maintain_order=True).agg([pl.col('log_recency_score').sum().alias('co-visit'), (pl.col('wgt')*pl.col('log_recency_score')).sum()]).sort(['session', 'co-visit'], reverse=[False, True])
        aids_covisit = aids_covisit.groupby('session', maintain_order=True).head(60)
    else:
        # Rows of type 1 or 2 only (presumably carts and orders - TODO confirm the encoding);
        # they additionally feed the buy2buy matrix.
        df_carts_orders = df_scored.filter(pl.col('type').is_in([1, 2])).drop(['type', 'ts']).unique(subset=['session', 'aid'], keep='last')[::-1]
        aids_covisit = df.rename({'aid':'aid_x'}).join(covisit_carts_orders, on='aid_x', how='left').drop_nulls(subset='aid')
        aids_covisit_buy2buy = df_carts_orders.rename({'aid':'aid_x'}).join(covisit_buy2buy, on='aid_x', how='left').drop_nulls(subset='aid')
        aids_covisit = pl.concat([aids_covisit, aids_covisit_buy2buy])
        aids_covisit = aids_covisit.groupby(['session', 'aid'], maintain_order=True).agg([pl.col('log_recency_score').sum().alias('co-visit'), pl.col('wgt').sum()]).sort(['session', 'co-visit'], reverse=[False, True])
        aids_covisit = aids_covisit.groupby('session', maintain_order=True).head(80)
    aids_covisit.write_parquet(f'aids_covisit_{action_type}_{phase}.parquet')
    _save_user_item_similarities(phase, aids_covisit, embeddings, f'covisit_{action_type}')

    # Select items whose word2vec / bpr / logistic matrix factorization embedding has a
    # high cosine similarity to the items the user viewed in the past
    for source, top20 in (('w2vec', top20_w2vec), ('bpr', top20_bpr), ('lmf', top20_lmf)):
        candidates = _top20_candidates(df, top20, f'num_recommended_by_{source}')
        candidates.write_parquet(f'aids_{source}_{action_type}_{phase}.parquet')
        _save_user_item_similarities(phase, candidates, embeddings, f'{source}_{action_type}')

In [17]:
def join_candidate(phase, action_type, df):
    """
    Join the DataFrames of items selected by each criterion into a single DataFrame.

    Reads the ``aids_*_{action_type}_{phase}.parquet`` and
    ``user_item_similarity_*_{phase}.parquet`` files from the working directory
    (the files ``make_candidate`` writes for the same arguments).

    The de-duplicated user log is outer-joined on (session, aid) with the
    w2vec, bpr, lmf and co-visitation candidates, then left-joined with the
    bpr / lmf / als user-item similarities. The result is sorted by session,
    written to ``candidate_{action_type}_{phase}.parquet`` and returned.
    """
    keys = ['session', 'aid']
    df = df.drop(['type', 'ts']).unique(subset=keys, keep='last')[::-1]

    aids_covisit = pl.read_parquet(f'aids_covisit_{action_type}_{phase}.parquet')
    aids_w2vec = pl.read_parquet(f'aids_w2vec_{action_type}_{phase}.parquet')
    aids_bpr = pl.read_parquet(f'aids_bpr_{action_type}_{phase}.parquet')
    aids_lmf = pl.read_parquet(f'aids_lmf_{action_type}_{phase}.parquet')

    namelist = ['userlog', f'covisit_{action_type}', f'w2vec_{action_type}', f'bpr_{action_type}', f'lmf_{action_type}']

    def load_similarity(file_tag, column):
        # Stack the similarity files of every candidate source, keep one row
        # per (session, aid) and give the score an embedding-specific name.
        frames = [
            pl.read_parquet(f'user_item_similarity_{name}{file_tag}_{phase}.parquet')
            for name in namelist
        ]
        return pl.concat(frames).unique(subset=keys).rename({'user-item_similarity': column})

    # The bpr files carry no tag in their name; lmf and als do.
    user2item_similarity = load_similarity('', 'user-item_similarity_bpr')
    user2item_similarity_lmf = load_similarity('_lmf', 'user-item_similarity_lmf')
    user2item_similarity_als = load_similarity('_als', 'user-item_similarity_als')

    for candidates in (aids_w2vec, aids_bpr, aids_lmf, aids_covisit):
        df = df.join(candidates, on=keys, how='outer')
    for similarity in (user2item_similarity, user2item_similarity_lmf, user2item_similarity_als):
        df = df.join(similarity, on=keys, how='left')
    df = df.sort('session')

    df.write_parquet(f'candidate_{action_type}_{phase}.parquet')

    return df
    

In [None]:
%%time
# Generate the candidate files of the 'train' phase for each action type.
# make_candidate returns nothing; it only writes parquet files.
make_candidate('train', 'click', df_test, covisit_clicks_train, covisit_carts_orders_train, covisit_buy2buy_train,
                top20_w2vec_train, top20_bpr_train, top20_lmf_train, user_vec_bpr_train, item_vec_bpr_train, user_vec_lmf_train, item_vec_lmf_train, user_vec_als_train, item_vec_als_train)
make_candidate('train', 'cart', df_test, covisit_clicks_train, covisit_carts_orders_train, covisit_buy2buy_train,
                top20_w2vec_train, top20_bpr_train, top20_lmf_train, user_vec_bpr_train, item_vec_bpr_train, user_vec_lmf_train, item_vec_lmf_train, user_vec_als_train, item_vec_als_train)
make_candidate('train', 'order', df_test, covisit_clicks_train, covisit_carts_orders_train, covisit_buy2buy_train,
                top20_w2vec_train, top20_bpr_train, top20_lmf_train, user_vec_bpr_train, item_vec_bpr_train, user_vec_lmf_train, item_vec_lmf_train, user_vec_als_train, item_vec_als_train)
# Merge the files written above into one candidate frame per action type.
candidate_click_train = join_candidate('train', 'click', df_test)
candidate_cart_train = join_candidate('train', 'cart', df_test)
candidate_order_train = join_candidate('train', 'order', df_test)

In [None]:
# Ground-truth labels of the local-validation split: one row per (session, type, aid)
# after exploding the ground_truth list, flagged with gt = 1 and with the join keys
# cast to Int32.
train_label = (
    pl.read_parquet('input/otto_train_and_test_data_for_local_validation/test_labels.parquet')
    .explode('ground_truth')
    .rename({'ground_truth':'aid'})
    .with_column(pl.lit(1).alias('gt'))
    .with_column(pl.col(['session', 'aid']).cast(pl.Int32))
)

In [None]:
%%time
# Attach the ground-truth flag to the candidates of each action type.
# The join is a left join, so candidates that are not in the labels get a null gt.
candidate_click_train = candidate_click_train.join(train_label.filter(pl.col('type')=='clicks')[['session', 'aid', 'gt']], on=['session', 'aid'], how='left')
candidate_cart_train = candidate_cart_train.join(train_label.filter(pl.col('type')=='carts')[['session', 'aid', 'gt']], on=['session', 'aid'], how='left')
candidate_order_train = candidate_order_train.join(train_label.filter(pl.col('type')=='orders')[['session', 'aid', 'gt']], on=['session', 'aid'], how='left')

In [22]:
# Recall over all candidates: the sum of the gt flags among the candidates divided by
# the number of ground-truth rows of that type.
print(candidate_click_train['gt'].sum() / len(train_label.filter(pl.col('type')=='clicks')))
print(candidate_cart_train['gt'].sum() / len(train_label.filter(pl.col('type')=='carts')))
print(candidate_order_train['gt'].sum() / len(train_label.filter(pl.col('type')=='orders')))

0.6044559661049003
0.49199834026896594
0.700609194926454


In [None]:
# Drop the local-validation candidate frames and the validation log, then run the
# garbage collector, before the test-phase data is loaded under the same df_test name.
del candidate_click_train, candidate_cart_train, candidate_order_train, df_test
gc.collect()

In [27]:
# Session log of the full data set's test split; it is used as the input log of the 'test' phase.
df_test = pl.read_parquet('input/Otto_Full_Optimized_Memory_Footprint/test.parquet')

# Build and save the top-20 neighbour tables and the embedding frames for the 'test' phase.
# The last argument is the embedding width and must equal the number of columns of the
# stored factor matrices (the same 101 / 102 / 50 values are repeated inside make_candidate).
make_df_top20_MF('test', df_test, 'bpr', 101)
make_df_top20_MF('test', df_test, 'lmf', 102)
make_df_vector_MF('test', 'bpr', 101)
make_df_vector_MF('test', 'lmf', 102)
make_df_vector_MF('test', 'als', 50)
make_df_top20_w2vec('test')

# Load the parquet files written by the calls above.
top20_bpr_test = pl.read_parquet('top20_bpr_test.parquet')
user_vec_bpr_test = pl.read_parquet('user_vec_bpr_test.parquet')
item_vec_bpr_test = pl.read_parquet('item_vec_bpr_test.parquet')
top20_w2vec_test = pl.read_parquet('top20_w2vec_test.parquet')
top20_lmf_test = pl.read_parquet('top20_lmf_test.parquet')
user_vec_lmf_test = pl.read_parquet('user_vec_lmf_test.parquet')
item_vec_lmf_test = pl.read_parquet('item_vec_lmf_test.parquet')
user_vec_als_test = pl.read_parquet('user_vec_als_test.parquet')
item_vec_als_test = pl.read_parquet('item_vec_als_test.parquet')

In [None]:
# Co-visitation tables of the 'test' phase (file version 5, clicks and carts_orders
# stored in DISK_PIECES pieces).
DISK_PIECES = 4
VER = 5

# Every table gets the same clean-up: drop the pandas index column, rename the
# neighbour column aid_y to aid and cast both item columns to Int32.
covisit_clicks_test = (
    pl.concat([
        pl.read_parquet(f'input/covisitation_matrix_test/top_20_clicks_v{VER}_{i}.pqt')
        for i in range(DISK_PIECES)
    ])
    .drop(['__index_level_0__'])
    .rename({'aid_y':'aid'})
    .with_column(pl.col(['aid_x', 'aid']).cast(pl.Int32))
)
covisit_carts_orders_test = (
    pl.concat([
        pl.read_parquet(f'input/covisitation_matrix_test/top_20_carts_orders_v{VER}_{i}.pqt')
        for i in range(DISK_PIECES)
    ])
    .drop(['__index_level_0__'])
    .rename({'aid_y':'aid'})
    .with_column(pl.col(['aid_x', 'aid']).cast(pl.Int32))
)
# The buy2buy matrix is stored as a single piece.
covisit_buy2buy_test = (
    pl.read_parquet(f'input/covisitation_matrix_test/top_20_buy2buy_v{VER}_0.pqt')
    .drop(['__index_level_0__'])
    .rename({'aid_y':'aid'})
    .with_column(pl.col(['aid_x', 'aid']).cast(pl.Int32))
)

In [None]:
%%time
# Generate the candidate files of the 'test' phase for each action type.
# make_candidate returns nothing; it only writes parquet files.
make_candidate('test', 'click', df_test, covisit_clicks_test, covisit_carts_orders_test, covisit_buy2buy_test,
                top20_w2vec_test, top20_bpr_test, top20_lmf_test, user_vec_bpr_test, item_vec_bpr_test, user_vec_lmf_test, item_vec_lmf_test, user_vec_als_test, item_vec_als_test)
make_candidate('test', 'cart', df_test, covisit_clicks_test, covisit_carts_orders_test, covisit_buy2buy_test,
                top20_w2vec_test, top20_bpr_test, top20_lmf_test, user_vec_bpr_test, item_vec_bpr_test, user_vec_lmf_test, item_vec_lmf_test, user_vec_als_test, item_vec_als_test)
make_candidate('test', 'order', df_test, covisit_clicks_test, covisit_carts_orders_test, covisit_buy2buy_test,
                top20_w2vec_test, top20_bpr_test, top20_lmf_test, user_vec_bpr_test, item_vec_bpr_test, user_vec_lmf_test, item_vec_lmf_test, user_vec_als_test, item_vec_als_test)
# Merge the files written above into one candidate frame per action type.
candidate_click_test = join_candidate('test', 'click', df_test)
candidate_cart_test = join_candidate('test', 'cart', df_test)
candidate_order_test = join_candidate('test', 'order', df_test)