In [1]:
import polars as pl
import numpy as np
import scipy
from implicit.nearest_neighbours import bm25_weight
from implicit.bpr import BayesianPersonalizedRanking
from evaluate import evaluate
from time import sleep

In [3]:
class self:
    # Plain namespace of run settings; the cells below read them as self.<option>.

    # Label of this candidate source; also used as the rank column name.
    name = 'bpr'

    # Input location: parquet files are read from f'{data_path}raw/{fold}...'.
    data_path = '../data/'
    fold = 'valid2__'

    # When True, the train parquet is stacked under the test parquet.
    add_train = True

    # Weight substituted for each raw value of the `type` column.
    mapper = {
        0: 1,
        1: 6,
        2: 9,
    }

    # Number of items requested from the model for every session.
    max_cands = 20

In [4]:
# Build the session x aid interaction matrix and fit the BPR model.
# Reads:   self.data_path, self.fold, self.add_train, self.mapper
# Defines: df, session_aid, model (used by the following cells)

# --- Load events ------------------------------------------------------------
# Every row gets a `test` flag: 1 for rows read from the test parquet,
# 0 for rows read from the train parquet.
if self.add_train:
    df1 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df2 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}train.parquet')
    df1 = df1.with_column(pl.lit(1, pl.Int8).alias('test'))
    df2 = df2.with_column(pl.lit(0, pl.Int8).alias('test'))
    df = pl.concat([df1, df2], how='vertical')
    del df1, df2  # the source frames are no longer needed once concatenated
else:
    df = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df = df.with_column(pl.lit(1, pl.Int8).alias('test'))

# --- Interaction weights ----------------------------------------------------
# Replace each `type` value by its weight from self.mapper, then collapse
# duplicate (session, aid, test) rows by summing the remaining columns
# (including the mapped `type` weight).
df = df.drop('ts')
df = df.with_column(pl.col('type').apply(lambda x: self.mapper[x]))
df = df.groupby(['session', 'aid', 'test']).sum()

# --- Item frequency filter --------------------------------------------------
# Keeps aids that occur in at least `threshold` distinct sessions. With the
# threshold at 0 every aid passes, so this step currently removes nothing.
aid_cnt = df.groupby('aid').agg(
    pl.col('session').n_unique().alias('cnt'))
aid_cnt = aid_cnt.filter(pl.col('cnt') >= 0)
df = df.join(aid_cnt, on='aid').drop('cnt')
del aid_cnt

# --- Dense indices ----------------------------------------------------------
# Dense rank minus one yields contiguous 0-based ids usable as matrix indices.
df = df.with_column(
    (pl.col('session').rank('dense') - 1).alias('session_idx'))
df = df.with_column((pl.col('aid').rank('dense') - 1).alias('aid_idx'))
values = df.select(pl.col('type')).to_numpy().ravel()
session_idx = df.select(pl.col('session_idx')).to_numpy().ravel()
aid_idx = df.select(pl.col('aid_idx')).to_numpy().ravel()

# --- Sparse matrix ----------------------------------------------------------
# The dense ids are contiguous from 0, so (largest id + 1) equals the number of
# distinct ids; this sizes the matrix without two full np.unique passes.
n_aids = int(aid_idx.max()) + 1
n_sessions = int(session_idx.max()) + 1
aid_session = scipy.sparse.coo_matrix(
    (values, (aid_idx, session_idx)), shape=(n_aids, n_sessions))
aid_session = aid_session.tocsr()
# BM25 re-weighting of the stored interaction values.
aid_session = bm25_weight(aid_session, K1=100, B=0.8)
# Transpose so rows are sessions and columns are aids.
session_aid = aid_session.T.tocsr()

# --- Model ------------------------------------------------------------------
# NOTE(review): 100 is the first positional argument (presumably the number of
# latent factors) and no random_state is passed, so scores are presumably not
# reproducible between runs -- confirm against the installed implicit version.
model = BayesianPersonalizedRanking(100)
model.fit(session_aid)

  0%|          | 0/100 [00:00<?, ?it/s]

In [5]:
# Unique dense ids of the sessions whose rows carry test == 1; only these
# sessions receive recommendations.
test_rows = df.filter(pl.col('test') == 1)
session_idx = np.unique(
    test_rows.select(pl.col('session_idx')).to_numpy().ravel())
del test_rows

# Request self.max_cands items per test session. The second return value is
# discarded, and items already present in a session are not filtered out.
batch_aids, _ = model.recommend(
    session_idx,
    session_aid[session_idx],
    N=self.max_cands,
    filter_already_liked_items=False,
)

In [6]:
# Flatten the per-session recommendation matrix into one row per
# (session, recommended item) pair.
# Fresh names are used instead of overwriting `session_idx` / `batch_aids`:
# overwriting `session_idx` with its repeated form made this cell fail on a
# second execution (the repeat was applied twice and np.hstack no longer lined
# up), so the cell is now safe to re-run.
aid_col = batch_aids.ravel().reshape(-1, 1)
session_col = np.repeat(session_idx, self.max_cands).reshape(-1, 1)
result = pl.DataFrame(
    np.hstack([session_col, aid_col]), columns=['session_idx', 'aid_idx'])
del aid_col, session_col

# 0-based position of each recommendation within its session (rows are still
# in the model's output order here), stored in the column named by self.name.
result = result.with_column(pl.lit(1).alias('one'))
result = result.with_column((pl.col('one').cumsum().over(
    'session_idx') - 1).alias(self.name)).drop('one')

# Lookup tables from the dense ids back to the original session / aid values.
session_inv = df.select(pl.col(['session', 'session_idx'])).unique()
aid_inv = df.select(pl.col(['aid', 'aid_idx'])).unique()

# Cast every column to Int32, presumably so the join keys share one dtype.
# NOTE(review): assumes all ids fit in Int32 -- confirm for the full dataset.
session_inv = session_inv.select(pl.col('*').cast(pl.Int32))
aid_inv = aid_inv.select(pl.col('*').cast(pl.Int32))
result = result.select(pl.col('*').cast(pl.Int32))

# Swap the dense ids for the original ids and drop the helper columns.
result = result.join(session_inv, on='session_idx')
result = result.join(aid_inv, on='aid_idx')
result = result.drop(['session_idx', 'aid_idx'])

In [7]:
# Build the submission file from `result` and score it.

# Sort by session and by the model rank column (named by self.name) before
# collecting the per-session lists. The row order left by the earlier joins is
# not something to rely on, and the `x[:20]` truncation below must keep the
# best-ranked items first.
reco = result.sort(['session', self.name]).groupby(
    'session').agg(pl.col('aid'))

# One row per session: '<session>_clicks' plus a space-separated list of at
# most 20 aids.
reco_clicks = reco.select([
    (pl.col('session').cast(str) + pl.lit('_clicks')).alias('session_type'),
    pl.col('aid').apply(
        lambda x: ' '.join([str(i) for i in x[:20]])).alias('labels'),
])

# The same labels are reused for the carts and orders rows.
reco_carts = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'carts'))
reco_orders = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'orders'))
reco_fin = pl.concat([reco_clicks, reco_carts, reco_orders])

# Derive both paths from the config so that changing self.fold, self.data_path
# or self.name cannot silently score against another fold's labels. With the
# current settings these resolve to the previously hard-coded values.
submission_path = f'{self.name}_test.csv'
labels_path = f'{self.data_path}raw/{self.fold}test_labels.jsonl'
reco_fin.write_csv(submission_path)
scores = evaluate(labels_path, submission_path)

evaluating solution


In [8]:
# Display the metrics returned by evaluate() in the previous cell.
scores

{'clicks': 0.09508767104385196,
 'carts': 0.07384254139419459,
 'orders': 0.09475026259914521,
 'total': 0.0885116870821307}

In [2]:
class self:
    # Plain namespace of run settings; the cells below read them as self.<option>.

    # Label of this candidate source; also used as the rank column name.
    name = 'bpr'

    # Input location: parquet files are read from f'{data_path}raw/{fold}...'.
    data_path = '../data/'
    fold = 'valid2__'

    # When True, the train parquet is stacked under the test parquet.
    add_train = True

    # Weight substituted for each raw value of the `type` column.
    mapper = {
        0: 1,
        1: 2,
        2: 3,
    }

    # Number of items requested from the model for every session.
    max_cands = 20

In [3]:
# Build the session x aid interaction matrix and fit the BPR model.
# Reads:   self.data_path, self.fold, self.add_train, self.mapper
# Defines: df, session_aid, model (used by the following cells)

# --- Load events ------------------------------------------------------------
# Every row gets a `test` flag: 1 for rows read from the test parquet,
# 0 for rows read from the train parquet.
if self.add_train:
    df1 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df2 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}train.parquet')
    df1 = df1.with_column(pl.lit(1, pl.Int8).alias('test'))
    df2 = df2.with_column(pl.lit(0, pl.Int8).alias('test'))
    df = pl.concat([df1, df2], how='vertical')
    del df1, df2  # the source frames are no longer needed once concatenated
else:
    df = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df = df.with_column(pl.lit(1, pl.Int8).alias('test'))

# --- Interaction weights ----------------------------------------------------
# Replace each `type` value by its weight from self.mapper, then collapse
# duplicate (session, aid, test) rows by keeping the maximum of the remaining
# columns (including the mapped `type` weight).
df = df.drop('ts')
df = df.with_column(pl.col('type').apply(lambda x: self.mapper[x]))
df = df.groupby(['session', 'aid', 'test']).max()

# --- Item frequency filter --------------------------------------------------
# Keeps aids that occur in at least `threshold` distinct sessions. With the
# threshold at 0 every aid passes, so this step currently removes nothing.
aid_cnt = df.groupby('aid').agg(
    pl.col('session').n_unique().alias('cnt'))
aid_cnt = aid_cnt.filter(pl.col('cnt') >= 0)
df = df.join(aid_cnt, on='aid').drop('cnt')
del aid_cnt

# --- Dense indices ----------------------------------------------------------
# Dense rank minus one yields contiguous 0-based ids usable as matrix indices.
df = df.with_column(
    (pl.col('session').rank('dense') - 1).alias('session_idx'))
df = df.with_column((pl.col('aid').rank('dense') - 1).alias('aid_idx'))
values = df.select(pl.col('type')).to_numpy().ravel()
session_idx = df.select(pl.col('session_idx')).to_numpy().ravel()
aid_idx = df.select(pl.col('aid_idx')).to_numpy().ravel()

# --- Sparse matrix ----------------------------------------------------------
# The dense ids are contiguous from 0, so (largest id + 1) equals the number of
# distinct ids; this sizes the matrix without two full np.unique passes.
n_aids = int(aid_idx.max()) + 1
n_sessions = int(session_idx.max()) + 1
aid_session = scipy.sparse.coo_matrix(
    (values, (aid_idx, session_idx)), shape=(n_aids, n_sessions))
aid_session = aid_session.tocsr()
# BM25 re-weighting of the stored interaction values.
aid_session = bm25_weight(aid_session, K1=100, B=0.8)
# Transpose so rows are sessions and columns are aids.
session_aid = aid_session.T.tocsr()

# --- Model ------------------------------------------------------------------
# NOTE(review): 100 is the first positional argument (presumably the number of
# latent factors) and no random_state is passed, so scores are presumably not
# reproducible between runs -- confirm against the installed implicit version.
model = BayesianPersonalizedRanking(100)
model.fit(session_aid)

  0%|          | 0/100 [00:00<?, ?it/s]

In [4]:
# Unique dense ids of the sessions whose rows carry test == 1; only these
# sessions receive recommendations.
test_rows = df.filter(pl.col('test') == 1)
session_idx = np.unique(
    test_rows.select(pl.col('session_idx')).to_numpy().ravel())
del test_rows

# Request self.max_cands items per test session. The second return value is
# discarded, and items already present in a session are not filtered out.
batch_aids, _ = model.recommend(
    session_idx,
    session_aid[session_idx],
    N=self.max_cands,
    filter_already_liked_items=False,
)

In [5]:
# Flatten the per-session recommendation matrix into one row per
# (session, recommended item) pair.
# Fresh names are used instead of overwriting `session_idx` / `batch_aids`:
# overwriting `session_idx` with its repeated form made this cell fail on a
# second execution (the repeat was applied twice and np.hstack no longer lined
# up), so the cell is now safe to re-run.
aid_col = batch_aids.ravel().reshape(-1, 1)
session_col = np.repeat(session_idx, self.max_cands).reshape(-1, 1)
result = pl.DataFrame(
    np.hstack([session_col, aid_col]), columns=['session_idx', 'aid_idx'])
del aid_col, session_col

# 0-based position of each recommendation within its session (rows are still
# in the model's output order here), stored in the column named by self.name.
result = result.with_column(pl.lit(1).alias('one'))
result = result.with_column((pl.col('one').cumsum().over(
    'session_idx') - 1).alias(self.name)).drop('one')

# Lookup tables from the dense ids back to the original session / aid values.
session_inv = df.select(pl.col(['session', 'session_idx'])).unique()
aid_inv = df.select(pl.col(['aid', 'aid_idx'])).unique()

# Cast every column to Int32, presumably so the join keys share one dtype.
# NOTE(review): assumes all ids fit in Int32 -- confirm for the full dataset.
session_inv = session_inv.select(pl.col('*').cast(pl.Int32))
aid_inv = aid_inv.select(pl.col('*').cast(pl.Int32))
result = result.select(pl.col('*').cast(pl.Int32))

# Swap the dense ids for the original ids and drop the helper columns.
result = result.join(session_inv, on='session_idx')
result = result.join(aid_inv, on='aid_idx')
result = result.drop(['session_idx', 'aid_idx'])

In [6]:
# Build the submission file from `result` and score it.

# Sort by session and by the model rank column (named by self.name) before
# collecting the per-session lists. The row order left by the earlier joins is
# not something to rely on, and the `x[:20]` truncation below must keep the
# best-ranked items first.
reco = result.sort(['session', self.name]).groupby(
    'session').agg(pl.col('aid'))

# One row per session: '<session>_clicks' plus a space-separated list of at
# most 20 aids.
reco_clicks = reco.select([
    (pl.col('session').cast(str) + pl.lit('_clicks')).alias('session_type'),
    pl.col('aid').apply(
        lambda x: ' '.join([str(i) for i in x[:20]])).alias('labels'),
])

# The same labels are reused for the carts and orders rows.
reco_carts = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'carts'))
reco_orders = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'orders'))
reco_fin = pl.concat([reco_clicks, reco_carts, reco_orders])

# Derive both paths from the config so that changing self.fold, self.data_path
# or self.name cannot silently score against another fold's labels. With the
# current settings these resolve to the previously hard-coded values.
submission_path = f'{self.name}_test.csv'
labels_path = f'{self.data_path}raw/{self.fold}test_labels.jsonl'
reco_fin.write_csv(submission_path)
scores = evaluate(labels_path, submission_path)

evaluating solution


In [7]:
# Display the metrics returned by evaluate() in the previous cell.
scores

{'clicks': 0.0959503279994925,
 'carts': 0.07467856293322539,
 'orders': 0.0953066484365059,
 'total': 0.0891825907418204}

In [8]:
class self:
    # Plain namespace of run settings; the cells below read them as self.<option>.

    # Label of this candidate source; also used as the rank column name.
    name = 'bpr'

    # Input location: parquet files are read from f'{data_path}raw/{fold}...'.
    data_path = '../data/'
    fold = 'valid2__'

    # When True, the train parquet is stacked under the test parquet.
    add_train = True

    # Weight substituted for each raw value of the `type` column.
    mapper = {
        0: 1,
        1: 2,
        2: 3,
    }

    # Number of items requested from the model for every session.
    max_cands = 20

In [9]:
# Build the session x aid interaction matrix and fit the BPR model.
# Reads:   self.data_path, self.fold, self.add_train, self.mapper
# Defines: df, session_aid, model (used by the following cells)

# --- Load events ------------------------------------------------------------
# Every row gets a `test` flag: 1 for rows read from the test parquet,
# 0 for rows read from the train parquet.
if self.add_train:
    df1 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df2 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}train.parquet')
    df1 = df1.with_column(pl.lit(1, pl.Int8).alias('test'))
    df2 = df2.with_column(pl.lit(0, pl.Int8).alias('test'))
    df = pl.concat([df1, df2], how='vertical')
    del df1, df2  # the source frames are no longer needed once concatenated
else:
    df = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df = df.with_column(pl.lit(1, pl.Int8).alias('test'))

# --- Interaction weights ----------------------------------------------------
# Replace each `type` value by its weight from self.mapper, then collapse
# duplicate (session, aid, test) rows by keeping the maximum of the remaining
# columns (including the mapped `type` weight).
df = df.drop('ts')
df = df.with_column(pl.col('type').apply(lambda x: self.mapper[x]))
df = df.groupby(['session', 'aid', 'test']).max()

# --- Item frequency filter --------------------------------------------------
# Keeps aids that occur in at least `threshold` distinct sessions. With the
# threshold at 0 every aid passes, so this step currently removes nothing.
aid_cnt = df.groupby('aid').agg(
    pl.col('session').n_unique().alias('cnt'))
aid_cnt = aid_cnt.filter(pl.col('cnt') >= 0)
df = df.join(aid_cnt, on='aid').drop('cnt')
del aid_cnt

# --- Dense indices ----------------------------------------------------------
# Dense rank minus one yields contiguous 0-based ids usable as matrix indices.
df = df.with_column(
    (pl.col('session').rank('dense') - 1).alias('session_idx'))
df = df.with_column((pl.col('aid').rank('dense') - 1).alias('aid_idx'))
values = df.select(pl.col('type')).to_numpy().ravel()
session_idx = df.select(pl.col('session_idx')).to_numpy().ravel()
aid_idx = df.select(pl.col('aid_idx')).to_numpy().ravel()

# --- Sparse matrix ----------------------------------------------------------
# The dense ids are contiguous from 0, so (largest id + 1) equals the number of
# distinct ids; this sizes the matrix without two full np.unique passes.
n_aids = int(aid_idx.max()) + 1
n_sessions = int(session_idx.max()) + 1
aid_session = scipy.sparse.coo_matrix(
    (values, (aid_idx, session_idx)), shape=(n_aids, n_sessions))
aid_session = aid_session.tocsr()
# BM25 re-weighting is disabled for this run: the mapped weights are passed to
# the model unchanged. Re-enable the next line to restore it.
# aid_session = bm25_weight(aid_session, K1=100, B=0.8)
# Transpose so rows are sessions and columns are aids.
session_aid = aid_session.T.tocsr()

# --- Model ------------------------------------------------------------------
# NOTE(review): 100 is the first positional argument (presumably the number of
# latent factors) and no random_state is passed, so scores are presumably not
# reproducible between runs -- confirm against the installed implicit version.
model = BayesianPersonalizedRanking(100)
model.fit(session_aid)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Unique dense ids of the sessions whose rows carry test == 1; only these
# sessions receive recommendations.
test_rows = df.filter(pl.col('test') == 1)
session_idx = np.unique(
    test_rows.select(pl.col('session_idx')).to_numpy().ravel())
del test_rows

# Request self.max_cands items per test session. The second return value is
# discarded, and items already present in a session are not filtered out.
batch_aids, _ = model.recommend(
    session_idx,
    session_aid[session_idx],
    N=self.max_cands,
    filter_already_liked_items=False,
)

In [None]:
# Flatten the per-session recommendation matrix into one row per
# (session, recommended item) pair.
# Fresh names are used instead of overwriting `session_idx` / `batch_aids`:
# overwriting `session_idx` with its repeated form made this cell fail on a
# second execution (the repeat was applied twice and np.hstack no longer lined
# up), so the cell is now safe to re-run.
aid_col = batch_aids.ravel().reshape(-1, 1)
session_col = np.repeat(session_idx, self.max_cands).reshape(-1, 1)
result = pl.DataFrame(
    np.hstack([session_col, aid_col]), columns=['session_idx', 'aid_idx'])
del aid_col, session_col

# 0-based position of each recommendation within its session (rows are still
# in the model's output order here), stored in the column named by self.name.
result = result.with_column(pl.lit(1).alias('one'))
result = result.with_column((pl.col('one').cumsum().over(
    'session_idx') - 1).alias(self.name)).drop('one')

# Lookup tables from the dense ids back to the original session / aid values.
session_inv = df.select(pl.col(['session', 'session_idx'])).unique()
aid_inv = df.select(pl.col(['aid', 'aid_idx'])).unique()

# Cast every column to Int32, presumably so the join keys share one dtype.
# NOTE(review): assumes all ids fit in Int32 -- confirm for the full dataset.
session_inv = session_inv.select(pl.col('*').cast(pl.Int32))
aid_inv = aid_inv.select(pl.col('*').cast(pl.Int32))
result = result.select(pl.col('*').cast(pl.Int32))

# Swap the dense ids for the original ids and drop the helper columns.
result = result.join(session_inv, on='session_idx')
result = result.join(aid_inv, on='aid_idx')
result = result.drop(['session_idx', 'aid_idx'])

In [None]:
# Build the submission file from `result` and score it.

# Sort by session and by the model rank column (named by self.name) before
# collecting the per-session lists. The row order left by the earlier joins is
# not something to rely on, and the `x[:20]` truncation below must keep the
# best-ranked items first.
reco = result.sort(['session', self.name]).groupby(
    'session').agg(pl.col('aid'))

# One row per session: '<session>_clicks' plus a space-separated list of at
# most 20 aids.
reco_clicks = reco.select([
    (pl.col('session').cast(str) + pl.lit('_clicks')).alias('session_type'),
    pl.col('aid').apply(
        lambda x: ' '.join([str(i) for i in x[:20]])).alias('labels'),
])

# The same labels are reused for the carts and orders rows.
reco_carts = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'carts'))
reco_orders = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'orders'))
reco_fin = pl.concat([reco_clicks, reco_carts, reco_orders])

# Derive both paths from the config so that changing self.fold, self.data_path
# or self.name cannot silently score against another fold's labels. With the
# current settings these resolve to the previously hard-coded values.
submission_path = f'{self.name}_test.csv'
labels_path = f'{self.data_path}raw/{self.fold}test_labels.jsonl'
reco_fin.write_csv(submission_path)
scores = evaluate(labels_path, submission_path)

In [None]:
class self:
    # Plain namespace of run settings; the cells below read them as self.<option>.

    # Label of this candidate source; also used as the rank column name.
    name = 'bpr'

    # Input location: parquet files are read from f'{data_path}raw/{fold}...'.
    data_path = '../data/'
    fold = 'valid2__'

    # When True, the train parquet is stacked under the test parquet.
    add_train = True

    # Weight substituted for each raw value of the `type` column; in this run
    # every type maps to the same weight.
    mapper = {
        0: 1,
        1: 1,
        2: 1,
    }

    # Number of items requested from the model for every session.
    max_cands = 20

In [None]:
# Build the session x aid interaction matrix and fit the BPR model.
# Reads:   self.data_path, self.fold, self.add_train, self.mapper
# Defines: df, session_aid, model (used by the following cells)

# --- Load events ------------------------------------------------------------
# Every row gets a `test` flag: 1 for rows read from the test parquet,
# 0 for rows read from the train parquet.
if self.add_train:
    df1 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df2 = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}train.parquet')
    df1 = df1.with_column(pl.lit(1, pl.Int8).alias('test'))
    df2 = df2.with_column(pl.lit(0, pl.Int8).alias('test'))
    df = pl.concat([df1, df2], how='vertical')
    del df1, df2  # the source frames are no longer needed once concatenated
else:
    df = pl.read_parquet(
        f'{self.data_path}raw/{self.fold}test.parquet')
    df = df.with_column(pl.lit(1, pl.Int8).alias('test'))

# --- Interaction weights ----------------------------------------------------
# Replace each `type` value by its weight from self.mapper, then collapse
# duplicate (session, aid, test) rows by keeping the maximum of the remaining
# columns (including the mapped `type` weight).
df = df.drop('ts')
df = df.with_column(pl.col('type').apply(lambda x: self.mapper[x]))
df = df.groupby(['session', 'aid', 'test']).max()

# --- Item frequency filter --------------------------------------------------
# Keeps aids that occur in at least `threshold` distinct sessions. With the
# threshold at 0 every aid passes, so this step currently removes nothing.
aid_cnt = df.groupby('aid').agg(
    pl.col('session').n_unique().alias('cnt'))
aid_cnt = aid_cnt.filter(pl.col('cnt') >= 0)
df = df.join(aid_cnt, on='aid').drop('cnt')
del aid_cnt

# --- Dense indices ----------------------------------------------------------
# Dense rank minus one yields contiguous 0-based ids usable as matrix indices.
df = df.with_column(
    (pl.col('session').rank('dense') - 1).alias('session_idx'))
df = df.with_column((pl.col('aid').rank('dense') - 1).alias('aid_idx'))
values = df.select(pl.col('type')).to_numpy().ravel()
session_idx = df.select(pl.col('session_idx')).to_numpy().ravel()
aid_idx = df.select(pl.col('aid_idx')).to_numpy().ravel()

# --- Sparse matrix ----------------------------------------------------------
# The dense ids are contiguous from 0, so (largest id + 1) equals the number of
# distinct ids; this sizes the matrix without two full np.unique passes.
n_aids = int(aid_idx.max()) + 1
n_sessions = int(session_idx.max()) + 1
aid_session = scipy.sparse.coo_matrix(
    (values, (aid_idx, session_idx)), shape=(n_aids, n_sessions))
aid_session = aid_session.tocsr()
# BM25 re-weighting is disabled for this run: the mapped weights are passed to
# the model unchanged. Re-enable the next line to restore it.
# aid_session = bm25_weight(aid_session, K1=100, B=0.8)
# Transpose so rows are sessions and columns are aids.
session_aid = aid_session.T.tocsr()

# --- Model ------------------------------------------------------------------
# NOTE(review): 100 is the first positional argument (presumably the number of
# latent factors) and no random_state is passed, so scores are presumably not
# reproducible between runs -- confirm against the installed implicit version.
model = BayesianPersonalizedRanking(100)
model.fit(session_aid)

In [None]:
# Unique dense ids of the sessions whose rows carry test == 1; only these
# sessions receive recommendations.
test_rows = df.filter(pl.col('test') == 1)
session_idx = np.unique(
    test_rows.select(pl.col('session_idx')).to_numpy().ravel())
del test_rows

# Request self.max_cands items per test session. The second return value is
# discarded, and items already present in a session are not filtered out.
batch_aids, _ = model.recommend(
    session_idx,
    session_aid[session_idx],
    N=self.max_cands,
    filter_already_liked_items=False,
)

In [None]:
# Flatten the per-session recommendation matrix into one row per
# (session, recommended item) pair.
# Fresh names are used instead of overwriting `session_idx` / `batch_aids`:
# overwriting `session_idx` with its repeated form made this cell fail on a
# second execution (the repeat was applied twice and np.hstack no longer lined
# up), so the cell is now safe to re-run.
aid_col = batch_aids.ravel().reshape(-1, 1)
session_col = np.repeat(session_idx, self.max_cands).reshape(-1, 1)
result = pl.DataFrame(
    np.hstack([session_col, aid_col]), columns=['session_idx', 'aid_idx'])
del aid_col, session_col

# 0-based position of each recommendation within its session (rows are still
# in the model's output order here), stored in the column named by self.name.
result = result.with_column(pl.lit(1).alias('one'))
result = result.with_column((pl.col('one').cumsum().over(
    'session_idx') - 1).alias(self.name)).drop('one')

# Lookup tables from the dense ids back to the original session / aid values.
session_inv = df.select(pl.col(['session', 'session_idx'])).unique()
aid_inv = df.select(pl.col(['aid', 'aid_idx'])).unique()

# Cast every column to Int32, presumably so the join keys share one dtype.
# NOTE(review): assumes all ids fit in Int32 -- confirm for the full dataset.
session_inv = session_inv.select(pl.col('*').cast(pl.Int32))
aid_inv = aid_inv.select(pl.col('*').cast(pl.Int32))
result = result.select(pl.col('*').cast(pl.Int32))

# Swap the dense ids for the original ids and drop the helper columns.
result = result.join(session_inv, on='session_idx')
result = result.join(aid_inv, on='aid_idx')
result = result.drop(['session_idx', 'aid_idx'])

In [None]:
# Build the submission file from `result` and score it.

# Sort by session and by the model rank column (named by self.name) before
# collecting the per-session lists. The row order left by the earlier joins is
# not something to rely on, and the `x[:20]` truncation below must keep the
# best-ranked items first.
reco = result.sort(['session', self.name]).groupby(
    'session').agg(pl.col('aid'))

# One row per session: '<session>_clicks' plus a space-separated list of at
# most 20 aids.
reco_clicks = reco.select([
    (pl.col('session').cast(str) + pl.lit('_clicks')).alias('session_type'),
    pl.col('aid').apply(
        lambda x: ' '.join([str(i) for i in x[:20]])).alias('labels'),
])

# The same labels are reused for the carts and orders rows.
reco_carts = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'carts'))
reco_orders = reco_clicks.with_column(pl.col('session_type').str.replace('clicks', 'orders'))
reco_fin = pl.concat([reco_clicks, reco_carts, reco_orders])

# Derive both paths from the config so that changing self.fold, self.data_path
# or self.name cannot silently score against another fold's labels. With the
# current settings these resolve to the previously hard-coded values.
submission_path = f'{self.name}_test.csv'
labels_path = f'{self.data_path}raw/{self.fold}test_labels.jsonl'
reco_fin.write_csv(submission_path)
scores = evaluate(labels_path, submission_path)