In [None]:
import os
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse as sp

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tqdm.auto import tqdm

#Set up Screen output
pd.set_option('display.max_columns', 200)

# Игнорируем назойливые предупреждения
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
DATA_DIR = "/content/drive/MyDrive/hack_the_crack/"

hist_data = pd.read_csv(os.path.join(DATA_DIR, "hist_data.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Part 0. Loading data

hist_data.csv - исторические данные о покупках
- buyer_id - id покупателя
- pav_order_id - id заказа
- created - время добавления в заказ
- item_id - id item'а
- count - количество выбранного товара в заказе
- price_sold - цена за 1 item
- flag_weight_goods - бинарный флаг того, является ли товар весовым
- weight - вес заказа

test.csv - текущее состояние корзины (последний заказ пользователя)
- buyer_id - id покупателя
- pav_order_id - id заказа
- created - время добавления в заказ
- item_id - id item'а (товара)
- count - количество выбранного товара в заказе
- price_sold - цена за 1 item
- flag_weight_goods - бинарный флаг того, является ли товар весовым

In [None]:
def simple_info(df: pd.DataFrame) -> None: 
    display(
        df.shape,
        pd.concat((
            pd.concat((df.head(2), df.sample(3))),
            df.tail(2)
        ))
    )

In [None]:
simple_info(hist_data)

(4529889, 8)

Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods,weight
0,95203091,98506637863,2021-07-01 00:03:44,202808329,1.0,79.99,False,11.14
1,95203091,98506637863,2021-07-01 00:03:44,202953905,1.072,44.945,True,11.14
2532363,95090304,98513613476,2021-08-03 19:08:08,203408257,1.0,49.99,False,4.39
2449868,94762672,98513390539,2021-08-02 16:28:52,203999545,1.0,49.99,False,5.87
1599613,96094843,98511225652,2021-07-22 11:53:11,203269249,1.0,13.58,False,10.71
4529887,95619205,4620221347,2021-09-07 18:48:29,203566418,1.0,62.19,False,9.84
4529888,95619205,4620221347,2021-09-07 18:48:29,204351820,1.0,99.99,False,9.84


In [None]:
simple_info(test)

(1081420, 7)

Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods
0,94640077,98519243164,2021-08-30 17:56:31,203053459,1.0,67.62,False
1,95865222,98512083628,2021-07-26 16:17:21,202967705,1.14,406.8,True
875127,94761197,98517242515,2021-08-21 23:10:04,203480880,1.0,169.89,False
856732,95092891,98512896248,2021-07-31 08:16:58,203101233,1.0,419.89,False
373493,95864944,98514748200,2021-08-09 15:48:57,203439079,1.0,116.49,False
1081418,95688424,98509396234,2021-07-13 14:41:37,203423005,1.0,46.74,False
1081419,96135931,98513272242,2021-08-02 06:41:34,202808313,1.0,80.1,False


In [None]:
# метрики оцениваются для вектора релевантности. пример:
# реальные item_id, которые приобрел покупатель: [1 ,4, 5, 69]
# рекомендованные алгоритмом item_id: [4, 6, 7, 8, 1, 2, 67, 90]
# тогда вектор релеватности будет выглядеть следующим образом: [1, 0, 0, 0, 1, 0, 0, 0]
# и уже по не му будет расчитываться ndcg
def dcg(
    y_relevance: np.ndarray
) -> float:
    
    return np.sum([(2**i - 1) / np.log2(k + 1) for (k, i) in enumerate(y_relevance, start=1)])

def ndcg(
    y_relevance: np.ndarray,
    k: int
) -> float:
    
    if y_relevance.sum() == 0:
        return 0.0
    DCG = dcg(y_relevance[:k])
    IDCG = dcg(-np.sort(-y_relevance)[:k])
    return DCG / IDCG

# Part I. Preparation

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp


class BuyerEncoder:
    def __init__(self, buyer_ids: np.array):
        """Class for encoding buyer ids"""
        self.buyers_mapping = {}
        self.buyers_inv_mapping = {}
        for idx, bid in enumerate(buyer_ids):
            self.buyers_mapping[bid] = idx
            self.buyers_inv_mapping[idx] = bid

    def toIdx(self, bid: int) -> int:
        return self.buyers_mapping[bid]

    def toId(self, idx: int) -> int:
        return self.buyers_inv_mapping[idx]

    @property
    def num_buyers(self):
        return len(self.buyers_mapping)


class ItemEncoder:
    def __init__(self, item_ids: np.array):
        """Class for encoding item ids"""
        self.items_mapping = {}
        self.items_inv_mapping = {}
        for idx, iid in enumerate(item_ids):
            self.items_mapping[iid] = idx
            self.items_inv_mapping[idx] = iid

    def toIdx(self, iid: int) -> int:
        return self.items_mapping[iid]

    def toId(self, idx: int) -> int:
        return self.items_inv_mapping[idx]

    @property
    def num_items(self):
        return len(self.items_mapping)


def make_coo_matrix(
        df: pd.DataFrame,        
        buyer_encoder: BuyerEncoder, 
        item_encoder: ItemEncoder, 
        buyer_col: str='buyer_id', 
        item_col: str='item_id',
        weights_col=None, 
        normalize=False
):
    """https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html"""
    if weights_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weights_col].astype(np.float32)

    sparse_matrix = sp.coo_matrix((
        weights,
        (
            df[buyer_col].map(buyer_encoder.buyers_mapping.get), 
            df[item_col].map(item_encoder.items_mapping.get)
        )
        ), shape=(buyer_encoder.num_buyers, item_encoder.num_items)
    )

    if normalize:
        sparse_matrix = sp.coo_matrix(
            np.divide(
                sparse_matrix.toarray().T, 
                np.sum(sparse_matrix.toarray(), axis=1, dtype=int)
            ).T
        )

    return sparse_matrix

In [None]:
buyer_encoder = BuyerEncoder(np.unique(hist_data.buyer_id))
item_encoder = ItemEncoder(np.unique(hist_data.item_id))

In [None]:
%time
interactions_matrix = make_coo_matrix(
    hist_data, 
    buyer_encoder=buyer_encoder, item_encoder=item_encoder
).tocsr()
interactions_matrix

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.3 µs


<63925x52472 sparse matrix of type '<class 'numpy.float32'>'
	with 3644166 stored elements in Compressed Sparse Row format>

In [None]:
interactions_matrix.size / np.prod(interactions_matrix.shape)

0.0010864251164320992

# Part III: Item Based Recommendations

## BM25Recommender Implicit

In [None]:
!pip install -q implicit
from implicit.nearest_neighbours import BM25Recommender



In [None]:
interactions_matrix.shape

(63925, 52472)

Параметры ${K1, B}$ позаимствованы отсюда [BM25](https://ru.wikipedia.org/wiki/Okapi_BM25)

In [None]:
bm25_model = BM25Recommender(K=34, K1=2.0, B=0.75)
bm25_model.fit(interactions_matrix.T)

  0%|          | 0/63925 [00:00<?, ?it/s]

In [131]:
TOP_N = 20

def get_recs(buyer_id):
    buyer_idx = buyer_encoder.toIdx(buyer_id)
    recs = bm25_model.recommend(
        buyer_idx, 
        interactions_matrix, 
        N=TOP_N,
        filter_already_liked_items=True
    )[0]
    return [item_encoder.toId(item) for item in recs]

In [147]:
list(item_encoder.items_inv_mapping.keys())[-1]

52471

In [132]:
for bid in test['buyer_id'].unique():
    print(get_recs(bid))

KeyError: ignored

In [128]:
%time
recs = pd.DataFrame({
    'pav_order_id': test['pav_order_id'].unique()
})
recs['item_id'] = [get_recs(bid) for bid in test['buyer_id'].unique()]
recs

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


TypeError: ignored

In [121]:
def _recs_mapper(buyer_id):
    buyer_idx = buyer_encoder.toIdx(buyer_id)
    recs = bm25_model.recommend(
        buyer_idx, 
        interactions_matrix, 
        N=TOP_N,
        filter_already_liked_items=True
    )[0]
    return [item for item in recs]
    # return [item_encoder.toId(item) for item, _ in recs]

In [None]:
recs

Unnamed: 0,pav_order_id
0,98519243164
1,98512083628
2,98519972197
3,98518646272
4,98510857920
...,...
80239,98519541004
80240,98519131074
80241,98516851043
80242,98515690157
