In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


# Написанные нами функции
from best_rec_lib.metrics import precision_at_k, recall_at_k, ap_k
from best_rec_lib.utils import prefilter_items
from best_rec_lib.recommenders import MainRecommender

import gc

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
item_features = pd.read_csv('../product.csv')
user_features = pd.read_csv('../hh_demographic.csv')

In [4]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [5]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [6]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, item_features=item_features, take_n_popular=5000)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [7]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробовать другие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,117847,279994,818981,819255,819308,819400,819487,819590,819594,819840,...,15926844,15926886,15972074,15972298,15972565,15972790,16100266,16729299,16729415,16770156
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

### Задание 1. Подбор оптимальных гиперпараметров для ALS

- Попробуйте улучшить базовый вариант ALS, изменяя следующие параметры
  - regularization, iterations
  - factors
  - Вес (TF_IDF, BM25  взвешивание)
  
- Посчитайте метрики (Precision@5, MAP@5) для разных наборов гиперпараметров и выберете лучший набор

In [9]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))
new_test_users = list(set(data_test['user_id']) - set(data_train['user_id']))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 2 новых юзеров


In [10]:
result = result[~result['user_id'].isin(new_test_users)]

In [12]:
def get_recommendations(user, model, sparse_user_item, N=350):
    res = [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item[userid_to_id[user]],   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]]
    return res

In [13]:
def als_base(prec_met, map_met, rec_met, result, sparse_user_item, factors, reg_st, iterations, N=350):
    
    for fact in factors:
        for reg in reg_st:
            for itrn in iterations:
                
                model = AlternatingLeastSquares(factors=fact, 
                                                regularization=reg,
                                                iterations=itrn, 
                                                calculate_training_loss=True, 
                                                num_threads=-1,
                                                random_state=42)

                model.fit(sparse_user_item,  # На вход item-user matrix
                          show_progress=False)

                result[f'als_{fact}_{reg}_{itrn}'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N))
                prec_met[f'als_{fact}_{reg}_{itrn}'] = result.apply(lambda row: precision_at_k(row[f'als_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                map_met[f'als_{fact}_{reg}_{itrn}'] = result.apply(lambda row: ap_k(row[f'als_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                rec_met[f'als_{fact}_{reg}_{itrn}'] = result.apply(lambda row: recall_at_k(row[f'als_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    
    return result, prec_met, map_met, rec_met



def tfidf(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_tfidf, iterations_tfidf, N=350):
    
    tfidf_user_item_matrix = tfidf_weight(user_item_matrix).tocsr()
    for fact in factors:
        for reg in reg_tfidf:
            for itrn in iterations_tfidf:
                
                if sum([fact == 150, reg == 0.01, (itrn in (5, 8))]) != 3:

                    model = AlternatingLeastSquares(factors=fact, 
                                    regularization=reg,
                                    iterations=itrn, 
                                    calculate_training_loss=True, 
                                    num_threads=-1,
                                    random_state=42)

                    model.fit(tfidf_user_item_matrix,  # На вход item-user matrix
                              show_progress=False)

                    result[f'als_tfidf_{fact}_{reg}_{itrn}'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N))
                    prec_met[f'als_tfidf_{fact}_{reg}_{itrn}'] = result.apply(lambda row: precision_at_k(row[f'als_tfidf_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    map_met[f'als_tfidf_{fact}_{reg}_{itrn}'] = result.apply(lambda row: ap_k(row[f'als_tfidf_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    rec_met[f'als_tfidf_{fact}_{reg}_{itrn}'] = result.apply(lambda row: recall_at_k(row[f'als_tfidf_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    
    return result, prec_met, map_met, rec_met

def tfidf_T(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=350):
    
    tfidf_user_item_matrix = tfidf_weight(user_item_matrix.T).T.tocsr()
    for fact in factors:
        for reg in reg_st:
            for itrn in iterations:

                model = AlternatingLeastSquares(factors=fact, 
                                regularization=reg,
                                iterations=itrn,
                                calculate_training_loss=True, 
                                num_threads=-1,
                                random_state=42)

                model.fit(tfidf_user_item_matrix,  # На вход item-user matrix
                          show_progress=False)

                result[f'als_tfidf_T_{fact}_{reg}_{itrn}'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N))
                prec_met[f'als_tfidf_T_{fact}_{reg}_{itrn}'] = result.apply(lambda row: precision_at_k(row[f'als_tfidf_T_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                map_met[f'als_tfidf_T_{fact}_{reg}_{itrn}'] = result.apply(lambda row: ap_k(row[f'als_tfidf_T_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                rec_met[f'als_tfidf_T_{fact}_{reg}_{itrn}'] = result.apply(lambda row: recall_at_k(row[f'als_tfidf_T_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    
    return result, prec_met, map_met, rec_met

def bm25(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_bm, iterations_bm, N=350):
    
    bm25_user_item_matrix = bm25_weight(user_item_matrix).tocsr()

    for fact in factors:
        for reg in reg_bm:
            for itrn in iterations_bm:
                
                if sum([fact == 150, reg == 0.01, (itrn in (3,))]) != 3:

                    model = AlternatingLeastSquares(factors=fact, 
                                    regularization=reg,
                                    iterations=itrn,
                                    calculate_training_loss=True, 
                                    num_threads=-1,
                                    random_state=42)

                    model.fit(bm25_user_item_matrix,  # На вход item-user matrix
                              show_progress=False)

                    result[f'als_bm25_{fact}_{reg}_{itrn}'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N))
                    prec_met[f'als_bm25_{fact}_{reg}_{itrn}'] = result.apply(lambda row: precision_at_k(row[f'als_bm25_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    map_met[f'als_bm25_{fact}_{reg}_{itrn}'] = result.apply(lambda row: ap_k(row[f'als_bm25_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    rec_met[f'als_bm25_{fact}_{reg}_{itrn}'] = result.apply(lambda row: recall_at_k(row[f'als_bm25_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()

                    
    return result, prec_met, map_met, rec_met

def bm25_T(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=350):
    
    bm25_user_item_matrix = bm25_weight(user_item_matrix.T).T.tocsr()
    for fact in factors:
        for reg in reg_st:
            for itrn in iterations:

                model = AlternatingLeastSquares(factors=fact, 
                                regularization=reg,
                                iterations=itrn,
                                calculate_training_loss=True, 
                                num_threads=-1,
                                random_state=42)

                model.fit(bm25_user_item_matrix,  # На вход item-user matrix
                          show_progress=False)

                result[f'als_bm25_T_{fact}_{reg}_{itrn}'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N))
                prec_met[f'als_bm25_T_{fact}_{reg}_{itrn}'] = result.apply(lambda row: precision_at_k(row[f'als_bm25_T_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                map_met[f'als_bm25_T_{fact}_{reg}_{itrn}'] = result.apply(lambda row: ap_k(row[f'als_bm25_T_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                rec_met[f'als_bm25_T_{fact}_{reg}_{itrn}'] = result.apply(lambda row: recall_at_k(row[f'als_bm25_T_{fact}_{reg}_{itrn}'], row['actual']), axis=1).mean()
                    
    return result, prec_met, map_met, rec_met

In [14]:
'''
Списки для функций пришлось создать разные, т.к. при попытке получения предсказаний
на не транспоннированной матрице user_item_matrix функци get_recommendations()
выпадает в ошибку по значениям регуляризации.
По той же причине в функциях сделаны и точечные исключения сочетания значений из списков.
'''

# Словари метрик
prec_met = dict()
map_met = dict()
rec_met = dict()

# Списки гиперпараметров для моделей
factors = [50, 150, 250, 350]
reg_st = [0.01, 0.03, 0.05]
iterations = [1, 2, 3, 5, 8, 10]

reg_bm = [0.01, 0.03, 0.05]
iterations_bm = [1, 2, 3]

reg_tfidf = [0.01, 0.03, 0.05]
iterations_tfidf = [1, 2, 3, 5]

In [15]:
%%time
print('Шаг 1/5. Обучаем дефолтную als модель')
# Дефолтная als модель
result, prec_met, map_met, rec_met = als_base(prec_met, map_met, rec_met, result, sparse_user_item, factors, reg_st, iterations, N=350)

print('Шаг 2/5. Обучаем als модель со взвешиванием tfidf без трансформации user_item_matrix')
# tfidf без трансформации user_item_matrix
result, prec_met, map_met, rec_met = tfidf(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_tfidf, iterations_tfidf, N=350)

print('Шаг 3/5. Обучаем als модель со взвешиванием tfidf c трансформацией user_item_matrix')
# tfidf c трансформацией user_item_matrix
result, prec_met, map_met, rec_met = tfidf_T(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=350)

print('Шаг 4/5. Обучаем als модель со взвешиванием bm25 без трансформации user_item_matrix')
# bm25 без трансформации user_item_matrix
result, prec_met, map_met, rec_met = bm25(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_bm, iterations_bm, N=350)

print('Шаг 5/5. Обучаем als модель со взвешиванием bm25 c трансформацией user_item_matrix')
# bm25 c трансформацией user_item_matrix
result, prec_met, map_met, rec_met = bm25_T(prec_met, map_met, rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=350)

Шаг 1/5. Обучаем дефолтную als модель
Шаг 2/5. Обучаем als модель со взвешиванием tfidf без трансформации user_item_matrix
Шаг 3/5. Обучаем als модель со взвешиванием tfidf c трансформацией user_item_matrix
Шаг 4/5. Обучаем als модель со взвешиванием bm25 без трансформации user_item_matrix
Шаг 5/5. Обучаем als модель со взвешиванием bm25 c трансформацией user_item_matrix
CPU times: total: 7h 23min 26s
Wall time: 53min 9s


In [16]:
data = [prec_met, map_met, rec_met]
data = pd.DataFrame(data, index =['Precision@5', 'MAP@5', 'Recall@5'])
data = data.T

In [17]:
data

Unnamed: 0,Precision@5,MAP@5,Recall@5
als_50_0.01_1,0.161176,0.109474,0.021135
als_50_0.01_2,0.164020,0.110317,0.022455
als_50_0.01_3,0.164804,0.109046,0.023216
als_50_0.01_5,0.162647,0.106301,0.023388
als_50_0.01_8,0.161765,0.105951,0.023212
...,...,...,...
als_bm25_T_350_0.05_2,0.211667,0.149587,0.030202
als_bm25_T_350_0.05_3,0.206373,0.142546,0.029219
als_bm25_T_350_0.05_5,0.198333,0.134399,0.028721
als_bm25_T_350_0.05_8,0.194118,0.131283,0.028918


In [18]:
# Лучшая модель по метрике Precision@5
data[data['Precision@5'] == data['Precision@5'].max()]

Unnamed: 0,Precision@5,MAP@5,Recall@5
als_bm25_T_350_0.01_1,0.223235,0.162536,0.03181


In [19]:
# Лучшая модель по метрике MAP@5
data[data['MAP@5'] == data['MAP@5'].max()]

Unnamed: 0,Precision@5,MAP@5,Recall@5
als_bm25_T_350_0.01_1,0.223235,0.162536,0.03181


In [20]:
# Лучшая модель по метрике Recall@k
data[data['Recall@5'] == data['Recall@5'].max()]

Unnamed: 0,Precision@5,MAP@5,Recall@5
als_bm25_T_350_0.01_1,0.223235,0.162536,0.03181
