# Вебинар 1. Введение, примеры задач, бизнес- и ML-метрики 

In [1]:
import pandas as pd
import numpy as np

# ML-метрики качества

In [2]:
recommended_list = [156, 1134, 27, 991, 143, 1543, 3345, 533, 11, 43] #id товаров
bought_list = [521, 32, 143, 991]

### 1. Hit rate

Hit rate = был ли хотя бы 1 релевантный товар среди рекомендованных

- Иногда применяется, когда продаются достаточно дрогие товары (например, бытовая техника) 

----
Hit rate = (был ли хотя бы 1 релевантный товар среди рекомендованных)   

Hit rate@k = (был ли хотя бы 1 релевантный товар среди топ-k рекомендованных)

In [3]:
def hit_rate(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    hit_rate = (flags.sum() > 0) * 1
    
    return hit_rate


def hit_rate_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    top_k = recommended_list[:k]

    flags = np.isin(bought_list, top_k)
    
    hit_rate = (flags.sum() > 0) * 1
    
    return hit_rate

In [4]:
hit_rate(recommended_list, bought_list)

1

In [5]:
hit_rate_at_k(recommended_list, bought_list, k=3)

0

### 2. Precision

*Precision* - доля релевантных товаров среди рекомендованных = Какой % рекомендованных товаров  юзер купил

- Пожалуй, самая приближенная к бизнес-метрикам и самая популярная метрика

---

Precision= (# of recommended items that are relevant) / (# of recommended items)  

Precision@k = (# of recommended items @k that are relevant) / (# of recommended items @k)

Money Precision@k = (revenue of recommended items @k that are relevant) / (revenue of recommended items @k)  

**Note:** Обычно k в precision@k достаточно невелико (5-20) и определяется из бизнес-логики. Например, 5 товаров в e-mail рассылке, 20 ответов на первой странице google и т.д

Красная рыба - 400 руб  
Молоко - 60 руб  
Хлеб = 40 руб  
Гречка = 40 руб  
Шоколад = 90 руб  

------  
Варенье - 240 руб  
...  

**Case 1**  
prices_resommended = [400, 60, 40, 40 , 90]  
flags = [1, 0, 0, 0 , 1]  

$precison@5 = \frac{1 + 0 + 0 +0 + 1}{1+1+1+1+1} = 40\%$  
$money precision@5 = \frac{1*400 + 0*60 + ... + 1*90}{1*400 + 1*60 + ... + 1*90} = 77.7\%$  

  
**Case 2**   
prices_resommended = [400, 60, 40, 40 , 90]  
flags = [0, 1, 0, 0 , 1]  

$precison@5 = \frac{0 + 1 + 0 +0 + 1}{1+1+1+1+1} = 40\%$  
$money precision@5 = \frac{0*400 + 1*60 + ... + 1*90}{1*400 + 1*60 + ... + 1*90} = 15.8\%$

In [6]:
def precision(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)
    
    return precision


def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    precision = flags.sum() / len(recommended_list)    
    
    return precision


def money_precision_at_k(recommended_list, bought_list, prices_recommended, k=5):
        
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    prices_recommended = np.array(prices_recommended)

    recommended_list = recommended_list[:k]
    prices_recommended = prices_recommended[:k]
    
    # flags по спискам товаров
    bought_in_recommended = np.isin(bought_list, recommended_list)
    recommended_in_bought = np.isin(recommended_list, bought_list[bought_in_recommended])

    precision = np.sum(prices_recommended[recommended_in_bought])/np.sum(prices_recommended)
    
    return precision

In [7]:
recommended_list = [156, 1134, 27, 7, 991, 1] #id товаров
prices_recommended = [400, 60, 40, 40, 90, 240]
bought_list = [156, 32, 14, 3, 991, 78]

In [8]:
precision(recommended_list, bought_list)

0.3333333333333333

In [9]:
precision_at_k(recommended_list, bought_list, k=5)

0.4

In [10]:
money_precision_at_k(recommended_list, bought_list, prices_recommended, k=5)

0.7777777777777778

### 3. Recall

*Recall* - доля рекомендованных товаров среди релевантных = Какой % купленных товаров был среди рекомендованных

- Обычно используется для моделей пре-фильтрации товаров (убрать те товары, которые точно не будем рекомендовать)

---

Recall= (# of recommended items that are relevant) / (# of relevant items)  

Recall@k = (# of recommended items @k that are relevant) / (# of relevant items)

Money Recall@k = (revenue of recommended items @k that are relevant) / (revenue of relevant items)  

    
  
**Note:** в recall@k число k обычно достаточно большое (50-200), больше чем покупок у среднестатистического юзера

In [11]:
def recall(recommended_list, bought_list):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)
    
    return recall


def recall_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    
    recall = flags.sum() / len(bought_list)    
    
    return recall


def money_recall_at_k(recommended_list, bought_list, prices_recommended, prices_bought, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    prices_recommended = np.array(prices_recommended)

    recommended_list = recommended_list[:k]
    prices_recommended = prices_recommended[:k]
    
    # flags по спискам товаров
    bought_in_recommended = np.isin(bought_list, recommended_list)
    recommended_in_bought = np.isin(recommended_list, bought_list[bought_in_recommended])
    
    # меняем только знаменатель
    recall = np.sum(prices_recommended[recommended_in_bought])/np.sum(prices_bought)
    
    return recall

In [12]:
recommended_list = [156, 1134, 27, 7, 991, 1] #id товаров
prices_recommended = [400, 60, 40, 40, 90, 240]
bought_list = [156, 32, 14, 3, 991, 78, 93]

prices_bought = [400, 50, 180, 120, 90, 10, 5]

In [13]:
# Проверка совпадения нужных цен в списках
assert np.isin(bought_list, recommended_list).sum() == np.isin(prices_bought, prices_recommended).sum()

In [14]:
recall(recommended_list, bought_list)

0.2857142857142857

In [15]:
recall_at_k(recommended_list, bought_list, k=3)

0.14285714285714285

In [16]:
money_recall_at_k(recommended_list, bought_list, prices_recommended, prices_bought, k=5)

0.5730994152046783

---

# Метрики ранжирования

Если важен порядок рекомендаций. Подробнее можно почитать [здесь](https://habr.com/ru/company/econtenta/blog/303458/). Формулы в статье могут несколько отличаться от формул в лекции 

## AP@k
AP@k - average precision at k

$$AP@k = \frac{1}{r} \sum{[recommended_{relevant_i}] * precision@k}$$

- r - кол-во релевантный среди рекомендованных
- Суммируем по всем релевантным товарам
- Зависит от порядка реокмендаций

In [17]:
def ap_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    flags = np.isin(recommended_list, bought_list)
    
    if sum(flags) == 0:
        return 0
    
    sum_ = 0
    for i in range(1, k+1):
        
        if flags[i] == True:
            p_k = precision_at_k(recommended_list, bought_list, k=i)
            sum_ += p_k
            
    result = sum_ / sum(flags)
    
    return result

In [18]:
ap_k(recommended_list, bought_list, k=5)

0.125

### MAP@k

MAP@k (Mean Average Precision@k)  
Среднее AP@k по всем юзерам
- Показывает средневзвешенную точность рекомендаций

$$MAP@k = \frac{1}{|U|} \sum_u{AP_k}$$
  
|U| - кол-во юзеров

In [19]:
# Пусть были три пользователя - для каждого свой список рекомендаций и свой список покупок
recommended_list = [[156, 1134, 27, 7, 991, 1],
                    [1, 56, 99, 0, 89, 11],
                    [2, 156, 99, 8, 90, 32]]
bought_list = [[156, 32, 14, 3, 991, 78, 93],
               [1, 11],
               [67, 89, 90, 99, 88, 11, 156, 95]]

In [20]:
def map_k(recommended_list, bought_list, k=5):
    
    assert len(recommended_list) == len(bought_list)
    U = len(bought_list)

    sum_ = 0.
    for u in range(U):
        sum_ += ap_k(recommended_list[u], bought_list[u], k)
    result = sum_ / U
    
    return result

In [21]:
map_k(recommended_list, bought_list, k=5)

0.18611111111111112

### MRR@k
Mean Reciprocal Rank

- Считаем для первых k рекоммендаций
- Найти ранк первого релевантного предсказания $k_u$
- Посчитать reciprocal rank = $\frac{1}{k_u}$

$$MRR = mean(\frac{1}{k_u})$$

In [22]:
def reciprocal_rank(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list
    recommended_list = recommended_list[:k]
    
    ranks = np.where(np.isin(recommended_list, bought_list))[0]
    if ranks.size == 0:
        result = 0.
    else:
        result = 1. / (ranks[0] + 1)
    
    return result

In [23]:
recommended_list = [143, 156, 1134, 991, 27, 1543, 3345, 533, 11, 43] #id товаров
bought_list = [521, 32, 143, 991]

reciprocal_rank(recommended_list, bought_list, k=5)

1.0

In [24]:
recommended_list = [13, 156, 1134, 991, 27, 1543, 3345, 533, 11, 43] #id товаров
bought_list = [521, 32, 143, 991]

reciprocal_rank(recommended_list, bought_list, k=5)

0.25