In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [2]:
# Location of the training transactions file, kept in one named constant.
RETAIL_TRAIN_CSV = 'C:/Study/Рекомендательные системы/2/webinar_2/retail_train.csv'

# Load the raw purchase log and preview the first two rows.
data = pd.read_csv(RETAIL_TRAIN_CSV)
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Number of weeks subtracted from the last observed week to get the split boundary.
test_size_weeks = 3

# Compute the boundary once so both masks are guaranteed to use the same value.
# Rows whose week_no equals the boundary go to the test part (>=), so the test
# part covers week numbers from (max - test_size_weeks) up to max inclusive.
test_start_week = data['week_no'].max() - test_size_weeks

# .copy() gives each split its own frame: later cells modify data_train, and
# writing into a boolean-mask slice of `data` triggers SettingWithCopyWarning.
data_train = data[data['week_no'] < test_start_week].copy()
data_test = data[data['week_no'] >= test_start_week].copy()

In [4]:
# Ground truth for evaluation: the distinct items each user bought in the test part.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


### Функции метрик:

In [5]:
def precision_at_k(recommended_list, bought_list, k=5):
    """Share of the first ``k`` recommendations that were actually bought.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first. Only the first ``k`` are scored.
    bought_list : sequence
        Item ids the user actually bought. It is deliberately NOT truncated
        to ``k``: every purchase counts as relevant.
    k : int, default 5
        Cut-off rank.

    Returns
    -------
    float
        Hits among the scored recommendations divided by the number of scored
        recommendations (which is smaller than ``k`` when fewer than ``k``
        items were recommended). Returns 0.0 when nothing was recommended,
        instead of dividing by zero.
    """
    top_k = np.asarray(recommended_list)[:k]

    # Nothing recommended -> no hits; avoid the 0/0 that would yield NaN.
    if len(top_k) == 0:
        return 0.0

    hits = np.isin(top_k, np.asarray(bought_list))
    return hits.sum() / len(top_k)

In [6]:
def ap_at_k(recommended_list, bought_list, k=5):
    """Average precision over the first ``k`` recommendations.

    For every rank ``i`` (1-based) within the first ``k`` recommendations that
    is a hit, precision@i is accumulated; the sum is then divided by ``k``.

    Parameters
    ----------
    recommended_list : sequence
        Recommended item ids, best first. May be shorter than ``k``.
    bought_list : sequence
        Item ids the user actually bought.
    k : int, default 5
        Cut-off rank; also the normalising denominator.

    Returns
    -------
    float
        The accumulated precision divided by ``k``; 0.0 when there are no hits
        in the first ``k`` recommendations or when ``k`` is not positive.
    """
    if k <= 0:
        return 0.0

    # Score only the ranks that exist: a list shorter than k must not raise.
    top_k = np.asarray(recommended_list)[:k]
    hits = np.isin(top_k, np.asarray(bought_list))

    hit_count = 0
    precision_sum = 0.0
    for rank, is_hit in enumerate(hits, start=1):
        if is_hit:
            # precision@rank equals hits seen so far divided by rank, so a
            # running counter replaces re-scanning the prefix at every hit.
            hit_count += 1
            precision_sum += hit_count / rank

    return precision_sum / k

### Обработка TOP-5000 товаров:

In [7]:
# Total quantity sold per item over the training part.
popularity = (
    data_train
    .groupby('item_id')['quantity']
    .sum()
    .reset_index()
    .rename(columns={'quantity': 'n_sold'})
)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

# Replace every item outside the top list with the placeholder id 999999.
# A new frame is built instead of assigning through .loc, so no slice of the
# original `data` frame is written to (no SettingWithCopyWarning).
data_train = data_train.assign(
    item_id=data_train['item_id'].where(data_train['item_id'].isin(top_5000), 999999)
)

# Users x items table; each cell is the number of transaction rows for the pair.
user_item_matrix = pd.pivot_table(data_train,
                                  index='user_id', columns='item_id',
                                  values='quantity',
                                  aggfunc='count',
                                  fill_value=0
                                 )
user_item_matrix = user_item_matrix.astype(float)
sparse_user_item = csr_matrix(user_item_matrix)  # already CSR, no extra conversion needed

# Two-way mappings between original ids and matrix row/column positions.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))
id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))
itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

### 1. ALS:

In [8]:
def get_recommendations(user, model, sparse_user_item, N=5):
    """Return the top-``N`` recommended original item ids for one user.

    Parameters
    ----------
    user :
        Original user id; translated to a matrix row via ``userid_to_id``.
    model :
        Object exposing ``recommend(...)``; the first element of its return
        value is used as the sequence of recommended matrix item positions.
    sparse_user_item :
        Row-indexable user-item matrix; only the requested user's row is
        passed to the model.
    N : int, default 5
        Number of recommendations to request.

    Returns
    -------
    list
        Original item ids, looked up through ``id_to_itemid``.

    Notes
    -----
    The placeholder item 999999 is excluded through ``filter_items``, items the
    user already interacted with are not filtered out, and the call is made
    with ``recalculate_user=True``.
    """
    user_row = userid_to_id[user]
    recommended_positions = model.recommend(
        userid=user_row,
        user_items=sparse_user_item[user_row],
        N=N,
        filter_already_liked_items=False,
        filter_items=[itemid_to_id[999999]],
        recalculate_user=True,
    )[0]
    return [id_to_itemid[position] for position in recommended_positions]

In [9]:
# Baseline ALS on the raw interaction counts.
model = AlternatingLeastSquares(factors=100,
                                regularization=0.001,
                                iterations=15,
                                calculate_training_loss=True,
                                num_threads=4,random_state=42)
model.fit(sparse_user_item,
          show_progress=True)

# Sample call for original user id 2. Only that user's row is passed as
# user_items, the same way get_recommendations does it: recommend expects one
# user_items row per requested userid, so the full matrix would be rejected as
# soon as filtering or user recalculation is switched on.
sample_user_row = userid_to_id[2]
recs = model.recommend(userid=sample_user_row,
                        user_items=sparse_user_item[sample_user_row],
                        N=5,
                        filter_already_liked_items=False,
                        filter_items=[itemid_to_id[999999]],
                        recalculate_user=False)

  0%|          | 0/15 [00:00<?, ?it/s]

In [10]:
# Top-5 recommendations of the current model for every test user.
result['als'] = result['user_id'].map(
    lambda user_id: get_recommendations(user_id, model, sparse_user_item, N=5)
)

# Metrics averaged over users (precision_at_k uses its default cut-off, ap_at_k gets 5).
pak0 = result.apply(
    lambda user_row: precision_at_k(user_row['als'], user_row['actual']),
    axis=1,
).mean()
mapk0 = result.apply(
    lambda user_row: ap_at_k(user_row['als'], user_row['actual'], 5),
    axis=1,
).mean()

In [11]:
# ALS on the raw interaction counts with the alternative hyper-parameters.
model = AlternatingLeastSquares(factors=200,
                                regularization=0.005,
                                iterations=2,
                                calculate_training_loss=True,
                                num_threads=4,random_state=42)
model.fit(sparse_user_item,
          show_progress=True)

# Sample call for original user id 2. Only that user's row is passed as
# user_items, the same way get_recommendations does it: recommend expects one
# user_items row per requested userid, so the full matrix would be rejected as
# soon as filtering or user recalculation is switched on.
sample_user_row = userid_to_id[2]
recs = model.recommend(userid=sample_user_row,
                        user_items=sparse_user_item[sample_user_row],
                        N=5,
                        filter_already_liked_items=False,
                        filter_items=[itemid_to_id[999999]],
                        recalculate_user=False)

  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Top-5 recommendations of the current model for every test user.
# This overwrites the 'als' column written by the earlier evaluation cell.
result['als'] = result['user_id'].map(
    lambda user_id: get_recommendations(user_id, model, sparse_user_item, N=5)
)

# Metrics averaged over users (precision_at_k uses its default cut-off, ap_at_k gets 5).
pak1 = result.apply(
    lambda user_row: precision_at_k(user_row['als'], user_row['actual']),
    axis=1,
).mean()
mapk1 = result.apply(
    lambda user_row: ap_at_k(user_row['als'], user_row['actual'], 5),
    axis=1,
).mean()

### 2. TF-IDF взвешивание:

In [13]:
# Re-weight the interaction counts with TF-IDF and store the result as CSR.
# NOTE(review): the weighting is applied to the users x items table directly,
# while the BM25 cell weights the transposed table and transposes back —
# confirm which orientation is intended so the two schemes are comparable.
tfidf_weighted = tfidf_weight(user_item_matrix)
tfidf_user_item_matrix = tfidf_weighted.tocsr()

In [14]:
# ALS trained on the TF-IDF weighted matrix (first hyper-parameter set).
model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    iterations=10,
    calculate_training_loss=True,
    num_threads=4,
    random_state=42,
)
model.fit(tfidf_user_item_matrix, show_progress=True)

# NOTE(review): the model is fitted on tfidf_user_item_matrix, but
# get_recommendations receives the unweighted sparse_user_item and calls
# recommend with recalculate_user=True — confirm that recomputing the user
# factors from raw counts is intended.
result['als_tfidf'] = result['user_id'].map(
    lambda user_id: get_recommendations(user_id, model, sparse_user_item, N=5)
)

# Metrics averaged over users, both with the default cut-off.
pak2 = result.apply(
    lambda user_row: precision_at_k(user_row['als_tfidf'], user_row['actual']),
    axis=1,
).mean()
mapk2 = result.apply(
    lambda user_row: ap_at_k(user_row['als_tfidf'], user_row['actual']),
    axis=1,
).mean()

  0%|          | 0/10 [00:00<?, ?it/s]

In [15]:
# ALS trained on the TF-IDF weighted matrix (second hyper-parameter set).
model = AlternatingLeastSquares(
    factors=250,
    regularization=0.05,
    iterations=2,
    calculate_training_loss=True,
    num_threads=4,
    random_state=42,
)
model.fit(tfidf_user_item_matrix, show_progress=True)

# NOTE(review): the model is fitted on tfidf_user_item_matrix, but
# get_recommendations receives the unweighted sparse_user_item and calls
# recommend with recalculate_user=True — confirm that recomputing the user
# factors from raw counts is intended.
# This overwrites the 'als_tfidf' column written by the previous TF-IDF cell.
result['als_tfidf'] = result['user_id'].map(
    lambda user_id: get_recommendations(user_id, model, sparse_user_item, N=5)
)

# Metrics averaged over users, both with the default cut-off.
pak3 = result.apply(
    lambda user_row: precision_at_k(user_row['als_tfidf'], user_row['actual']),
    axis=1,
).mean()
mapk3 = result.apply(
    lambda user_row: ap_at_k(user_row['als_tfidf'], user_row['actual']),
    axis=1,
).mean()

  0%|          | 0/2 [00:00<?, ?it/s]

### 3. BM25 взвешивание:

In [16]:
# Apply BM25 weighting to the transposed (items x users) table, then transpose
# back to users x items and store the result as CSR.
item_user_bm25 = bm25_weight(user_item_matrix.T)
bm25_user_item_matrix = item_user_bm25.T.tocsr()

In [17]:
# ALS trained on the BM25 weighted matrix (first hyper-parameter set).
model = AlternatingLeastSquares(factors=200,
                                regularization=0.01,
                                iterations=13,
                                calculate_training_loss=True,
                                num_threads=4,
                                random_state=42)

# bm25_user_item_matrix is already produced with .tocsr(), so it is passed
# as-is instead of being wrapped in csr_matrix(...).tocsr() again.
model.fit(bm25_user_item_matrix,
          show_progress=True)

# NOTE(review): the model is fitted on bm25_user_item_matrix, but
# get_recommendations receives the unweighted sparse_user_item and calls
# recommend with recalculate_user=True — confirm that recomputing the user
# factors from raw counts is intended.
result['als_bm25'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N=5))

# Metrics averaged over users, both with the default cut-off.
pak4 = result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean()
mapk4 = result.apply(lambda row: ap_at_k(row['als_bm25'], row['actual']), axis=1).mean()

  0%|          | 0/13 [00:00<?, ?it/s]

In [18]:
# ALS trained on the BM25 weighted matrix (second hyper-parameter set).
model = AlternatingLeastSquares(factors=400,
                                regularization=0.005,
                                iterations=2,
                                calculate_training_loss=True,
                                num_threads=4,
                                random_state=42)

# bm25_user_item_matrix is already produced with .tocsr(), so it is passed
# as-is instead of being wrapped in csr_matrix(...).tocsr() again.
model.fit(bm25_user_item_matrix,
          show_progress=True)

# NOTE(review): the model is fitted on bm25_user_item_matrix, but
# get_recommendations receives the unweighted sparse_user_item and calls
# recommend with recalculate_user=True — confirm that recomputing the user
# factors from raw counts is intended.
# This overwrites the 'als_bm25' column written by the previous BM25 cell.
result['als_bm25'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N=5))

# Metrics averaged over users, both with the default cut-off.
pak5 = result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean()
mapk5 = result.apply(lambda row: ap_at_k(row['als_bm25'], row['actual']), axis=1).mean()

  0%|          | 0/2 [00:00<?, ?it/s]

### Результаты:

In [19]:
# Preview the ground truth next to the most recent recommendations of each model family.
result.head(n=2)

Unnamed: 0,user_id,actual,als,als_tfidf,als_bm25
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 9527290, 940947, 1033142, 1082185]","[1082185, 1033142, 1082212, 995242, 965766]","[995242, 1082185, 9527290, 940947, 1033142]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[952317, 910032, 1098066, 1101173, 1042438]","[1101010, 1133018, 1106523, 1098066, 1092026]","[1022003, 1133018, 1092026, 910032, 938700]"


In [20]:
# Summary of P@K / MAP@K for both hyper-parameter sets of every model family.
# The hyper-parameters in the labels are typed by hand, so they must be kept in
# sync with the model cells: the second ALS and BM25 runs use iterations=2
# (the labels previously said iterations=1).
print('ALS:')
print(f'Старые параметры: factors=100,regularization=0.001,iterations=15; P@K - {pak0}, MAP@K - {mapk0}')
print(f'Новые  параметры: factors=200,regularization=0.005,iterations=2;  P@K - {pak1}, MAP@K - {mapk1}')
print('TF-IDF:')
print(f'Старые параметры: factors=100,regularization=0.01,iterations=10;  P@K - {pak2},  MAP@K - {mapk2}')
print(f'Новые  параметры: factors=250,regularization=0.05,iterations=2;   P@K - {pak3}, MAP@K - {mapk3}')
print('BM25:')
print(f'Старые параметры: factors=200,regularization=0.01,iterations=13;  P@K - {pak4}, MAP@K - {mapk4}')
print(f'Новые  параметры: factors=400,regularization=0.005,iterations=2;  P@K - {pak5}, MAP@K - {mapk5}')

ALS:
Старые параметры: factors=100,regularization=0.001,iterations=15; P@K - 0.18099902056807052, MAP@K - 0.11083578191315703
Новые  параметры: factors=200,regularization=0.005,iterations=1;  P@K - 0.20773751224289913, MAP@K - 0.13107574273587985
TF-IDF:
Старые параметры: factors=100,regularization=0.01,iterations=10;  P@K - 0.2406464250734574,  MAP@K - 0.1642866470780281
Новые  параметры: factors=250,regularization=0.05,iterations=2;   P@K - 0.24554358472086193, MAP@K - 0.1663728370878224
BM25:
Старые параметры: factors=200,regularization=0.01,iterations=13;  P@K - 0.23016650342801176, MAP@K - 0.15431603003591252
Новые  параметры: factors=400,regularization=0.005,iterations=1;  P@K - 0.2378060724779628, MAP@K - 0.16143160300359127


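### Выводы: Лучшие показатели метрик у TF-IDF (P@K ≈ 0.246, MAP@K ≈ 0.166), на втором месте BM25 (P@K ≈ 0.238, MAP@K ≈ 0.161). Улучшение качества у всех трех моделей наблюдается при увеличении количества факторов и уменьшении числа итераций; регуляризация при этом была увеличена для ALS и TF-IDF и уменьшена для BM25.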