User-Item матрица. Загрузка данных.

In [42]:
import pandas as pd
import numpy as np
df = pd.read_csv('user_item_matrix_1.csv')
df.head()

Unnamed: 0,user_id,item_id,qty,price
0,1,12224,1,1137.15
1,1,14816,1,700.0
2,1,17872,1,672.6
3,1,23481,1,1965.0
4,2,1197,3,986.0


Sparce matrix.
Создание разряжённой матрицы из датафрейма.

In [43]:
from typing import Dict

import pandas as pd
from scipy.sparse import csr_matrix


class UserItemMatrix:
    def __init__(self, sales_data: pd.DataFrame):
        """Class initialization. You can make necessary
        calculations here.

        Args:
            sales_data (pd.DataFrame): Sales dataset.

        Example:
            sales_data (pd.DataFrame):

               user_id  item_id  qty    price
            0        1      118    1   626.66
            1        1      285    1  1016.57
            2        2     1229    3   518.99
            3        4     1688    2   940.84
            4        5     2068    1   571.36
            ...

        """
        self._data = sales_data
        
    @property
    def sales_data(self):
        return self._data
    
    @property
    def user_count(self) -> int:
        """
        Returns:
            int: the number of users in sales_data.
        """
        return self._data['user_id'].nunique()

    @property
    def item_count(self) -> int:
        """
        Returns:
            int: the number of items in sales_data.
        """
        return self._data['item_id'].nunique()
    
    @property
    def user_map(self) -> Dict[int, int]:
        """Creates a mapping from user_id to matrix rows indexes.

        Example:
            sales_data (pd.DataFrame):

                user_id  item_id  qty    price
            0        1      118    1   626.66
            1        1      285    1  1016.57
            2        2     1229    3   518.99
            3        4     1688    2   940.84
            4        5     2068    1   571.36

            user_map (Dict[int, int]):
                {1: 0, 2: 1, 4: 2, 5: 3}

        Returns:
            Dict[int, int]: User map
        """
        
        return {user:i for i, user in enumerate (self._data['user_id'].unique())}

    @property
    def item_map(self) -> Dict[int, int]:
        """Creates a mapping from user_id to matrix rows indexes.

        Example:
            sales_data (pd.DataFrame):

                user_id  item_id  qty    price
            0        1      118    1   626.66
            1        1      285    1  1016.57
            2        2     1229    3   518.99
            3        4     1688    2   940.84
            4        5     2068    1   571.36

            item_map (Dict[int, int]):
                {118: 0, 285: 1, 1229: 2, 1688: 3, 2068: 4}

        Returns:
            Dict[int, int]: Item map
        """
        return {item:i for i, item in enumerate (self._data['item_id'].unique())}

    @property
    def csr_matrix(self) -> csr_matrix:
        """User items matrix in form of CSR matrix.

        User row_ind, col_ind as
        rows and cols indecies (mapped from user/item map).

        Returns:
            csr_matrix: CSR matrix
        """
        user_item_matrix = self._data.pivot(index='user_id', 
                                  columns='item_id', 
                                  values='qty').fillna(0)
        return csr_matrix(user_item_matrix)

Matrix Normalization

In [44]:
user_item = UserItemMatrix(df)
matrix = user_item.csr_matrix

In [4]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

class Normalization:
    @staticmethod
    def by_column(matrix: csr_matrix) -> csr_matrix:
        """Normalization by column

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """
        norm_matrix = normalize(matrix, norm='l1', axis=0)
        return norm_matrix

    @staticmethod
    def by_row(matrix: csr_matrix) -> csr_matrix:
        """Normalization by row

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """
        norm_matrix = normalize(matrix, norm='l1', axis=1)
        return norm_matrix

    @staticmethod
    def tf_idf(matrix: csr_matrix) -> csr_matrix:
        """Normalization using tf-idf

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """
        tf = normalize(matrix, norm='l1', axis=1)
        idf = normalize(matrix, norm='l1', axis=0).log1p()
        norm_matrix = tf.multiply(idf)

        return norm_matrix

    @staticmethod
    def bm_25(
        matrix: csr_matrix, k1: float = 2.0, b: float = 0.75
    ) -> csr_matrix:
        """Normalization based on BM-25

        Args:
            matrix (csr_matrix): User-Item matrix of size (N, M)

        Returns:
            csr_matrix: Normalized matrix of size (N, M)
        """
        avgdl = matrix.sum(axis=1).mean()
        d = matrix.sum(axis=1)
        tf = normalize(matrix, norm='l1', axis=1)
        idf = normalize(matrix, norm='l1', axis=0).log1p()
        norm_matrix = idf.multiply((tf.multiply(k1 + 1)) / (tf + k1 * (1 - b + b * (d / avgdl))))
        return norm_matrix


In [45]:
n = Normalization()
norm_matrix = n.bm_25(matrix)

Построение эмбедингов

In [65]:
import numpy as np
import implicit
from scipy.sparse import csr_matrix


def items_embeddings(ui_matrix: csr_matrix, dim: int, item_map: dict) -> np.ndarray:
    """Build items embedding using factorization model.
    The order of items should be the same in the output matrix.

    Args:
        ui_matrix (csr_matrix): User-Item matrix of size (N, M)
        dim (int): Dimention of embedding vectors
        item_map (dict): items indexes

    Returns:
        np.ndarray: Items embeddings matrix of size (M, dim)
    """
    model = implicit.nearest_neighbours.CosineRecommender(K=dim)
    model.fit(ui_matrix.T.tocsr())    
    items = np.array(list(item_map.values()))
    items_vec = model.similar_items(itemid=items, N=dim, item_users=ui_matrix.tocsr())[0]    
    return items_vec

In [66]:
item_map = user_item.item_map
items_embeddings(norm_matrix, 5, item_map)



HBox(children=(IntProgress(value=0, max=38101), HTML(value='')))




array([[    0, 34139,   662, 15624,  9291],
       [    1, 33905,   253,   155, 31710],
       [    2, 13174, 13916, 13162, 13883],
       ...,
       [27750, 33636, 15382, 20396, 26145],
       [27751, 25659, 30847, 37605, 18898],
       [27752, 30971, 26122, 29756, 21883]])