In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from metrics import precision_at_k, recall_at_k

from sklearn.preprocessing import MaxAbsScaler

In [2]:
data = pd.read_csv('../../Lectures/Lecture_2/webinar_2/webinar_2/data/transaction_data.csv')
data.head(3)

Unnamed: 0,household_key,BASKET_ID,DAY,PRODUCT_ID,QUANTITY,SALES_VALUE,STORE_ID,RETAIL_DISC,TRANS_TIME,WEEK_NO,COUPON_DISC,COUPON_MATCH_DISC
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [3]:
data.columns = [col.lower() for col in data.columns]

In [4]:
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)

In [5]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [6]:
item_features = pd.read_csv('../../Lectures/Lecture_2/webinar_2/webinar_2/data/product.csv')
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [7]:
item_features.department.unique()

array(['GROCERY', 'MISC. TRANS.', 'PASTRY', 'DRUG GM', 'MEAT-PCKGD',
       'SEAFOOD-PCKGD', 'PRODUCE', 'NUTRITION', 'DELI', 'COSMETICS',
       'MEAT', 'FLORAL', 'TRAVEL & LEISUR', 'SEAFOOD', 'MISC SALES TRAN',
       'SALAD BAR', 'KIOSK-GAS', 'ELECT &PLUMBING', 'GRO BAKERY',
       'GM MERCH EXP', 'FROZEN GROCERY', 'COUP/STR & MFG', 'SPIRITS',
       'GARDEN CENTER', 'TOYS', 'CHARITABLE CONT', 'RESTAURANT', 'RX',
       'PROD-WHS SALES', 'MEAT-WHSE', 'DAIRY DELI', 'CHEF SHOPPE', 'HBC',
       'DELI/SNACK BAR', 'PORK', 'AUTOMOTIVE', 'VIDEO RENTAL', ' ',
       'CNTRL/STORE SUP', 'HOUSEWARES', 'POSTAL CENTER', 'PHOTO', 'VIDEO',
       'PHARMACY SUPPLY'], dtype=object)

#### Создадим датасет, куда будем записывать результаты работы алгоритмов разных моделей с разными параметрами

In [8]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65..."
1,3,"[823704, 834117, 840244, 913785, 917816, 93870..."


Определим самые популярные товары и сохраним в список их item_id.

In [9]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [10]:
popularity.head(5)

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1
3,26190,1
4,26355,2


In [11]:
popularity.sort_values('n_sold', ascending=False).head(5)

Unnamed: 0,item_id,n_sold
56632,6534178,207490068
56592,6533889,17706394
56627,6534166,13420640
56744,6544236,2620630
44379,1404121,1688052


In [12]:
item_features.loc[item_features['item_id'] == 6534178]

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
57221,6534178,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,


#### Заведем фиктивный item_id, которым пометим все товары которые не попали в группу топ-5000

In [13]:
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999_999

Создадим user_item таблицу

In [14]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват другие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

In [15]:
user_item_matrix.shape

(2500, 5001)

In [16]:
user_item_matrix[999_999].sum()

908677

Под этим item_id мы агрегировали все "не популярные" товары. Создадим две матрицы: одна будет содержать этот item_id, а вдругой исключим этот item_id и сравним результаты работы алгоритма на этих двух матрицах. Исключением мы добъемся не получение "не популярного" товара в рекомендации тем самым повысив метрику.

In [17]:
user_item_matrix_top = user_item_matrix.drop(columns = [999_999], axis = 1)
user_item_matrix_top.shape

(2500, 5000)

In [18]:
# Приведем матрицы к необходимому типу для implicit
user_item_matrix = user_item_matrix.astype(float) 
user_item_matrix_top = user_item_matrix_top.astype(float) 

In [19]:
# переведем в формат sparse matrix
sparse_user_item = csr_matrix(user_item_matrix)
sparse_user_item_top = csr_matrix(user_item_matrix_top)

In [20]:
def id_to_id(matrix):
    userids = matrix.index.values
    itemids = matrix.columns.values
    
    matrix_userids = np.arange(len(userids))
    matrix_itemids = np.arange(len(itemids))
    
    id_to_itemid = dict(zip(matrix_itemids, itemids))
    id_to_userid = dict(zip(matrix_userids, userids))
    
    itemid_to_id = dict(zip(itemids, matrix_itemids))
    userid_to_id = dict(zip(userids, matrix_userids))
    
    return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

In [21]:
id_to_itemid, id_to_userid, itemid_to_id, userid_to_id = id_to_id(user_item_matrix)
id_to_itemid_top, id_to_userid_top, itemid_to_id_top, userid_to_id_top = id_to_id(user_item_matrix_top)

In [29]:
type(userid_to_id)

dict

In [35]:
len(data_train)

2485538

In [34]:
len(user_item_matrix.index.values)

2500

In [31]:
userid_to_id[2000]

1999

# ALS

In [32]:
%%time

model = AlternatingLeastSquares(factors=64, 
                                regularization=0.05,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=10,
                                use_gpu=False)

CPU times: total: 0 ns
Wall time: 1 ms


In [23]:
%%time

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: total: 21.2 s
Wall time: 2.17 s


In [24]:
def get_recommendations(user, sparse_user_item, model, N=5):
    res = [id_to_itemid[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item,   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)]
    return res

In [25]:
def get_recommendations_top(user, sparse_user_item_top, model, N=5):
    res = [id_to_itemid_top[rec[0]] for rec in 
                    model.recommend(userid=userid_to_id_top[user], 
                                    user_items=sparse_user_item_top,   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=None, 
                                    recalculate_user=True)]
    return res

In [26]:
%%time
    
result['als'] = result['user_id'].apply(lambda x: get_recommendations(x, sparse_user_item, model=model, N=5))

CPU times: total: 7min 9s
Wall time: 53.7 s


In [27]:
result.head(3)

Unnamed: 0,user_id,actual,als
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65...","[1005186, 885290, 999999, 5569374, 9527290]"
1,3,"[823704, 834117, 840244, 913785, 917816, 93870...","[951590, 5569327, 883404, 1092026, 1106523]"
2,5,"[913077, 1118028, 1386668]","[999999, 1082185, 6534178, 1029743, 995242]"


#### Метрика

In [28]:
result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()

0.15499748869914406

--------------------------------------------------------------------
#### тоже для top-матрицы

In [33]:
%%time

model.fit(csr_matrix(user_item_matrix_top).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: total: 21.3 s
Wall time: 1.81 s


In [34]:
%%time
    
result['als_top'] = result['user_id'].apply(lambda x: get_recommendations_top(x, sparse_user_item_top, model=model, N=5))

CPU times: total: 4min 43s
Wall time: 35.5 s


#### Метрика

In [35]:
result.apply(lambda row: precision_at_k(row['als_top'], row['actual']), axis=1).mean()

0.1842290306378678

In [36]:
result.head(3)

Unnamed: 0,user_id,actual,als,als_top
0,1,"[879517, 934369, 1115576, 1124029, 5572301, 65...","[1005186, 885290, 999999, 5569374, 9527290]","[1100972, 1033142, 995242, 1051211, 1082185]"
1,3,"[823704, 834117, 840244, 913785, 917816, 93870...","[951590, 5569327, 883404, 1092026, 1106523]","[1044078, 951590, 1106523, 1133018, 5569327]"
2,5,"[913077, 1118028, 1386668]","[999999, 1082185, 6534178, 1029743, 995242]","[1126899, 1058997, 1082185, 1050851, 981760]"


In [63]:
round((1 - 0.1550/0.1842) * 100, 2)

15.85

### 2. TF-IDF взвешивание

In [38]:
user_item_matrix_tfidf = tfidf_weight(user_item_matrix.T).T  
user_item_matrix_tfidf_top = tfidf_weight(user_item_matrix_top.T).T  

In [39]:
%%time

model = AlternatingLeastSquares(factors=64, 
                                regularization=0.05,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=20)

CPU times: total: 0 ns
Wall time: 1 ms


In [40]:
model.fit(csr_matrix(user_item_matrix_tfidf).T.tocsr(),
          show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [41]:
result['als_tfidf'] = result['user_id'].apply(lambda x: get_recommendations(x, sparse_user_item, model=model, N=5))

result.apply(lambda row: precision_at_k(row['als_tfidf'], row['actual']), axis=1).mean()

0.16243093922651688

In [62]:
round((1 - 0.1550/0.1624) * 100, 2)

4.56

--------------------------------------------------------------------
#### тоже для top-матрицы

In [43]:
model = AlternatingLeastSquares(factors=64, 
                                regularization=0.05,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=20)

In [44]:
model.fit(csr_matrix(user_item_matrix_tfidf_top).T.tocsr(),
          show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [45]:
result['als_tfidf_top'] = result['user_id'].apply(lambda x: get_recommendations_top(x, sparse_user_item_top, model=model, N=5))

result.apply(lambda row: precision_at_k(row['als_tfidf_top'], row['actual']), axis=1).mean()

0.19748869914615502

In [61]:
round((1 - 0.1565/0.1975) * 100, 2)

20.76

### 3. BM25 взвешивание

In [47]:
user_item_matrix = bm25_weight(user_item_matrix.T).T
user_item_matrix_top = bm25_weight(user_item_matrix_top.T).T 

In [48]:
%%time

model = AlternatingLeastSquares(factors=128, 
                                regularization=0.05,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=20) # 

# model = AlternatingLeastSquares(factors=64, 
#                                 regularization=0.05,
#                                 iterations=15, 
#                                 calculate_training_loss=True, 
#                                 num_threads=10)

CPU times: total: 0 ns
Wall time: 0 ns


In [51]:
model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result['als_bm25'] = result['user_id'].apply(lambda x: get_recommendations(x, sparse_user_item, model=model, N=5))

  0%|          | 0/15 [00:00<?, ?it/s]

In [52]:
result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean()

0.19075841285785758

In [58]:
round((1 - 0.1565/0.1907) * 100, 2)

17.93

--------------------------------------------------------------------
#### тоже для top-матрицы

In [54]:
model = AlternatingLeastSquares(factors=128, 
                                regularization=0.05,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=20) # 

In [55]:
model.fit(csr_matrix(user_item_matrix_top).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result['als_bm25_top'] = result['user_id'].apply(lambda x: get_recommendations_top(x, sparse_user_item_top, model=model, N=5))

  0%|          | 0/15 [00:00<?, ?it/s]

In [56]:
result.apply(lambda row: precision_at_k(row['als_bm25_top'], row['actual']), axis=1).mean()

0.22682069311903277

In [57]:
round((1 - 0.1565/0.2268) * 100, 2)

31.0

In [59]:
model = AlternatingLeastSquares(factors=128, 
                                regularization=0.01,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=20) # 

In [60]:
model.fit(csr_matrix(user_item_matrix_top).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result['als_bm25_top_v2'] = result['user_id'].apply(lambda x: get_recommendations_top(x, sparse_user_item_top, model=model, N=5))

  0%|          | 0/15 [00:00<?, ?it/s]

In [64]:
result.apply(lambda row: precision_at_k(row['als_bm25_top_v2'], row['actual']), axis=1).mean()

0.22189854344550206

In [65]:
round((1 - 0.1565/0.2219) * 100, 2)

29.47

In [66]:
model = AlternatingLeastSquares(factors=128, 
                                regularization=0.01,
                                iterations=25, 
                                calculate_training_loss=True, 
                                num_threads=20) # 

In [67]:
model.fit(csr_matrix(user_item_matrix_top).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result['als_bm25_top_v3'] = result['user_id'].apply(lambda x: get_recommendations_top(x, sparse_user_item_top, model=model, N=5))

  0%|          | 0/25 [00:00<?, ?it/s]

In [68]:
result.apply(lambda row: precision_at_k(row['als_bm25_top_v3'], row['actual']), axis=1).mean()

0.2213962832747335

In [69]:
round((1 - 0.1565/0.2214) * 100, 2)

29.31

In [70]:
model = AlternatingLeastSquares(factors=128, 
                                regularization=0.08,
                                iterations=10, 
                                calculate_training_loss=True, 
                                num_threads=20) # 

In [71]:
model.fit(csr_matrix(user_item_matrix_top).T.tocsr(),  # На вход item-user matrix
          show_progress=True)

result['als_bm25_top_v4'] = result['user_id'].apply(lambda x: get_recommendations_top(x, sparse_user_item_top, model=model, N=5))

  0%|          | 0/10 [00:00<?, ?it/s]

In [72]:
result.apply(lambda row: precision_at_k(row['als_bm25_top_v4'], row['actual']), axis=1).mean()

0.22179809141134815

In [73]:
round((1 - 0.1565/0.2218) * 100, 2)

29.44

---
#### Лучшие метрики (precision_at_k) получили для модели ALS с предварительной обработкой данных методом "BM25 взвешивание" для данных из топ-5000 популярных товаров, с относительным приростом в метрике в 31%.

---
- А точно нужно сортировать по вероятности? ---
- Какую метрику использовать? - ту, которая наиболее отвечает потребности бизнеса на данный момент.
- Сколько раз в неделю отпрпавляем рассылку? - Зависит от опыта рекламных кампаний и временных откликов на них.
- В какое время отправляем рассылку? - За несколько дней до проведения акций, праздников, выходных.
- Будем отправлять одному юзеру много раз наши рекоммендации. Как добиться того, чтобы они хоть немного отличались? ---
- Нужно ли, чтобы в одной рассылке были *разные* товары? Как определить, что товары *разные*? Как добиться того, чтобы они были разными?
- И многое другое:)