In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


# Написанные нами функции
from best_rec_lib.metrics import precision_at_k, recall_at_k, ap_k
from best_rec_lib.utils import prefilter_items
from best_rec_lib.recommenders import MainRecommender

import gc

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Raw transactions
data = pd.read_csv('../retail_train.csv')

# Item and user feature tables: lower-case every column name, then rename the
# key column so that both tables use the item_id / user_id naming.
item_features = (
    pd.read_csv('../product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)
user_features = (
    pd.read_csv('../hh_demographic.csv')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)

In [3]:
test_size_weeks = 3

# Week number at which the hold-out period starts (inclusive).
# NOTE(review): if week_no is a run of consecutive integers, `>= max - 3`
# selects four week numbers (max-3 .. max), i.e. test_size_weeks + 1 - confirm
# that this is the intended window.
test_start_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < test_start_week]
data_test = data[data['week_no'] >= test_start_week]

In [4]:
# Ground truth: one row per test user, `actual` holds the array of distinct
# item ids that user has in the test period.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)

In [5]:
# Count distinct items before and after the project's prefilter_items helper.
# Note: data_train is overwritten here, so re-running this cell alone filters
# the already filtered frame.
n_items_before = data_train['item_id'].nunique()
data_train = prefilter_items(
    data_train,
    item_features=item_features,
    take_n_popular=3300,
)
n_items_after = data_train['item_id'].nunique()

summary_template = 'Decreased # items from {} to {}'
print(summary_template.format(n_items_before, n_items_after))

Decreased # items from 86865 to 3301


In [6]:
# Interaction matrix: one row per user_id, one column per item_id; each cell
# counts the non-null `quantity` entries for that pair (other aggregations can
# be tried), with 0 where the pair never occurs.
user_item_matrix = data_train.pivot_table(
    index='user_id',
    columns='item_id',
    values='quantity',
    aggfunc='count',
    fill_value=0,
).astype(float)  # float dtype, which the original note says implicit requires

# Same matrix in scipy CSR (sparse) format
sparse_user_item = csr_matrix(user_item_matrix)

In [7]:
# Original ids in the order of the matrix rows (users) and columns (items)
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional indices 0..n-1 along each axis
matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

# position -> original id
id_to_userid = dict(zip(matrix_userids, userids))
id_to_itemid = dict(zip(matrix_itemids, itemids))

# original id -> position (inverse of the maps above)
userid_to_id = {original: position for position, original in id_to_userid.items()}
itemid_to_id = {original: position for position, original in id_to_itemid.items()}

In [8]:
# Users that occur in the test period but not in the (pre-filtered) train data
new_test_users = list(set(data_test['user_id']) - set(data_train['user_id']))
test_users = result.shape[0]

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(len(new_test_users)))

В тестовом дата сете 2042 юзеров
В тестовом дата сете 2 новых юзеров


In [9]:
# Drop test users listed in new_test_users (they have no row in the training
# matrix, so no recommendation can be produced for them). The explicit .copy()
# detaches the filtered frame from the original: many columns are assigned to
# `result` later, and assigning into a boolean-mask slice is the pattern that
# triggers pandas' SettingWithCopyWarning.
result = result[~result['user_id'].isin(new_test_users)].copy()

In [10]:
def get_recommendations(user, model, sparse_user_item, N=5000, excluded_item_id=999999):
    """Return the original item ids recommended by `model` for one user.

    Relies on the notebook-level lookup dicts `userid_to_id`, `itemid_to_id`
    and `id_to_itemid` to translate between original ids and matrix positions.

    Parameters
    ----------
    user : original user id; must be a key of `userid_to_id` (KeyError otherwise).
    model : fitted model exposing `recommend(...)`, whose first returned element
        is the sequence of recommended item positions.
    sparse_user_item : sparse matrix indexed by user position; the row of
        `user` is passed to the model as that user's interactions.
    N : number of recommendations requested from the model.
    excluded_item_id : original item id that must never be recommended.
        Defaults to 999999, the id the original code hard-coded (presumably the
        placeholder item produced by prefilter_items - confirm). If the id is
        not in the matrix, nothing is filtered instead of raising KeyError.

    Returns
    -------
    list of original item ids, in the order returned by the model.
    """
    user_row = userid_to_id[user]

    # Translate the excluded item to its matrix position, if it exists at all
    excluded_row = itemid_to_id.get(excluded_item_id)
    filter_items = None if excluded_row is None else [excluded_row]

    recommended_rows = model.recommend(userid=user_row,
                                       user_items=sparse_user_item[user_row],  # this user's row only
                                       N=N,
                                       filter_already_liked_items=False,
                                       filter_items=filter_items,
                                       recalculate_user=True)[0]

    return [id_to_itemid[rec] for rec in recommended_rows]

In [11]:
def als_base(rec_met, result, sparse_user_item, factors, reg_st, iterations, N=500):
    """Grid-search ALS on the unweighted interaction matrix.

    For every (factors, regularization, iterations) combination a model is
    fitted on `sparse_user_item`; its recommendations are stored in `result`
    under the column 'als_{factors}_{reg}_{iterations}' and the mean of
    recall_at_k over all rows is stored in `rec_met` under the same key.

    Parameters
    ----------
    rec_met : dict-like collecting one mean recall value per configuration.
    result : frame with 'user_id' and 'actual' columns; gains one column of
        recommendation lists per configuration (modified in place).
    sparse_user_item : sparse matrix used both for fitting and for recommending.
    factors, reg_st, iterations : iterables of hyper-parameter values.
    N : number of recommendations per user; also passed to recall_at_k as its
        third argument (presumably the cut-off k).

    Returns
    -------
    (result, rec_met) - the same objects that were passed in.
    """
    for n_factors in factors:
        for reg_value in reg_st:
            for n_iter in iterations:
                run_name = f'als_{n_factors}_{reg_value}_{n_iter}'

                als_params = dict(factors=n_factors,
                                  regularization=reg_value,
                                  iterations=n_iter,
                                  calculate_training_loss=True,
                                  num_threads=-1,
                                  random_state=42)
                model = AlternatingLeastSquares(**als_params)
                model.fit(sparse_user_item, show_progress=False)

                def recommend_for(user_id):
                    return get_recommendations(user_id, model, sparse_user_item, N)

                def row_recall(row):
                    return recall_at_k(row[run_name], row['actual'], N)

                result[run_name] = result['user_id'].map(recommend_for)
                rec_met[run_name] = result.apply(row_recall, axis=1).mean()

    return result, rec_met



def tfidf(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_tfidf, iterations_tfidf, N=500):
    """Grid-search ALS fitted on the tfidf_weight-ed `user_item_matrix`.

    The weighting is applied to the matrix as given (no transposition). Each
    configuration's recommendations go to the `result` column
    'als_tfidf_{factors}_{reg}_{iterations}' and its mean recall_at_k to
    `rec_met` under the same key. Combinations with factors == 150,
    reg == 0.01 and iterations in (5, 8) are skipped.

    Parameters
    ----------
    rec_met : dict-like collecting one mean recall value per configuration.
    result : frame with 'user_id' and 'actual' columns (modified in place).
    sparse_user_item : sparse matrix handed to get_recommendations; note that
        this is the matrix passed in, not the weighted one used for fitting.
    user_item_matrix : matrix to be weighted and used for fitting.
    factors, reg_tfidf, iterations_tfidf : iterables of hyper-parameter values.
    N : number of recommendations per user; also the third argument of recall_at_k.

    Returns
    -------
    (result, rec_met) - the same objects that were passed in.
    """
    weighted_matrix = tfidf_weight(user_item_matrix).tocsr()

    for n_factors in factors:
        for reg_value in reg_tfidf:
            for n_iter in iterations_tfidf:
                # Explicitly excluded combinations (see the note in the settings cell)
                if n_factors == 150 and reg_value == 0.01 and n_iter in (5, 8):
                    continue

                run_name = f'als_tfidf_{n_factors}_{reg_value}_{n_iter}'

                als_params = dict(factors=n_factors,
                                  regularization=reg_value,
                                  iterations=n_iter,
                                  calculate_training_loss=True,
                                  num_threads=-1,
                                  random_state=42)
                model = AlternatingLeastSquares(**als_params)
                model.fit(weighted_matrix, show_progress=False)

                def recommend_for(user_id):
                    return get_recommendations(user_id, model, sparse_user_item, N)

                def row_recall(row):
                    return recall_at_k(row[run_name], row['actual'], N)

                result[run_name] = result['user_id'].map(recommend_for)
                rec_met[run_name] = result.apply(row_recall, axis=1).mean()

    return result, rec_met

def tfidf_T(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=500):
    """Grid-search ALS fitted on a TF-IDF weighting of the transposed matrix.

    `user_item_matrix` is transposed, passed through tfidf_weight, and
    transposed back before fitting. Each configuration's recommendations go to
    the `result` column 'als_tfidf_T_{factors}_{reg}_{iterations}' and its mean
    recall_at_k to `rec_met` under the same key.

    Parameters
    ----------
    rec_met : dict-like collecting one mean recall value per configuration.
    result : frame with 'user_id' and 'actual' columns (modified in place).
    sparse_user_item : sparse matrix handed to get_recommendations; note that
        this is the matrix passed in, not the weighted one used for fitting.
    user_item_matrix : matrix to be weighted and used for fitting.
    factors, reg_st, iterations : iterables of hyper-parameter values.
    N : number of recommendations per user; also the third argument of recall_at_k.

    Returns
    -------
    (result, rec_met) - the same objects that were passed in.
    """
    weighted_matrix = tfidf_weight(user_item_matrix.T).T.tocsr()

    for n_factors in factors:
        for reg_value in reg_st:
            for n_iter in iterations:
                run_name = f'als_tfidf_T_{n_factors}_{reg_value}_{n_iter}'

                als_params = dict(factors=n_factors,
                                  regularization=reg_value,
                                  iterations=n_iter,
                                  calculate_training_loss=True,
                                  num_threads=-1,
                                  random_state=42)
                model = AlternatingLeastSquares(**als_params)
                model.fit(weighted_matrix, show_progress=False)

                def recommend_for(user_id):
                    return get_recommendations(user_id, model, sparse_user_item, N)

                def row_recall(row):
                    return recall_at_k(row[run_name], row['actual'], N)

                result[run_name] = result['user_id'].map(recommend_for)
                rec_met[run_name] = result.apply(row_recall, axis=1).mean()

    return result, rec_met

def bm25(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_bm, iterations_bm, N=500):
    """Grid-search ALS fitted on the bm25_weight-ed `user_item_matrix`.

    The weighting is applied to the matrix as given (no transposition). Each
    configuration's recommendations go to the `result` column
    'als_bm25_{factors}_{reg}_{iterations}' and its mean recall_at_k to
    `rec_met` under the same key. The combination factors == 150,
    reg == 0.01, iterations == 3 is skipped.

    Parameters
    ----------
    rec_met : dict-like collecting one mean recall value per configuration.
    result : frame with 'user_id' and 'actual' columns (modified in place).
    sparse_user_item : sparse matrix handed to get_recommendations; note that
        this is the matrix passed in, not the weighted one used for fitting.
    user_item_matrix : matrix to be weighted and used for fitting.
    factors, reg_bm, iterations_bm : iterables of hyper-parameter values.
    N : number of recommendations per user; also the third argument of recall_at_k.

    Returns
    -------
    (result, rec_met) - the same objects that were passed in.
    """
    weighted_matrix = bm25_weight(user_item_matrix).tocsr()

    for n_factors in factors:
        for reg_value in reg_bm:
            for n_iter in iterations_bm:
                # Explicitly excluded combination (see the note in the settings cell)
                if n_factors == 150 and reg_value == 0.01 and n_iter == 3:
                    continue

                run_name = f'als_bm25_{n_factors}_{reg_value}_{n_iter}'

                als_params = dict(factors=n_factors,
                                  regularization=reg_value,
                                  iterations=n_iter,
                                  calculate_training_loss=True,
                                  num_threads=-1,
                                  random_state=42)
                model = AlternatingLeastSquares(**als_params)
                model.fit(weighted_matrix, show_progress=False)

                def recommend_for(user_id):
                    return get_recommendations(user_id, model, sparse_user_item, N)

                def row_recall(row):
                    return recall_at_k(row[run_name], row['actual'], N)

                result[run_name] = result['user_id'].map(recommend_for)
                rec_met[run_name] = result.apply(row_recall, axis=1).mean()

    return result, rec_met

def bm25_T(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=500):
    """Grid-search ALS fitted on a BM25 weighting of the transposed matrix.

    `user_item_matrix` is transposed, passed through bm25_weight, and
    transposed back before fitting. Each configuration's recommendations go to
    the `result` column 'als_bm25_T_{factors}_{reg}_{iterations}' and its mean
    recall_at_k to `rec_met` under the same key.

    Parameters
    ----------
    rec_met : dict-like collecting one mean recall value per configuration.
    result : frame with 'user_id' and 'actual' columns (modified in place).
    sparse_user_item : sparse matrix handed to get_recommendations; note that
        this is the matrix passed in, not the weighted one used for fitting.
    user_item_matrix : matrix to be weighted and used for fitting.
    factors, reg_st, iterations : iterables of hyper-parameter values.
    N : number of recommendations per user; also the third argument of recall_at_k.

    Returns
    -------
    (result, rec_met) - the same objects that were passed in.
    """
    weighted_matrix = bm25_weight(user_item_matrix.T).T.tocsr()

    for n_factors in factors:
        for reg_value in reg_st:
            for n_iter in iterations:
                run_name = f'als_bm25_T_{n_factors}_{reg_value}_{n_iter}'

                als_params = dict(factors=n_factors,
                                  regularization=reg_value,
                                  iterations=n_iter,
                                  calculate_training_loss=True,
                                  num_threads=-1,
                                  random_state=42)
                model = AlternatingLeastSquares(**als_params)
                model.fit(weighted_matrix, show_progress=False)

                def recommend_for(user_id):
                    return get_recommendations(user_id, model, sparse_user_item, N)

                def row_recall(row):
                    return recall_at_k(row[run_name], row['actual'], N)

                result[run_name] = result['user_id'].map(recommend_for)
                rec_met[run_name] = result.apply(row_recall, axis=1).mean()

    return result, rec_met

In [12]:
'''
Separate lists had to be created for the different functions because, when
trying to get predictions on the non-transposed user_item_matrix,
get_recommendations() fails with an error for certain regularization values.
For the same reason the functions contain targeted exclusions of specific
combinations of values from these lists.
'''

# Metric store: configuration name -> mean recall
rec_met = {}

# Hyper-parameter grid shared by the plain ALS and the transposed-weighting runs
factors = [50, 150, 250, 350]
reg_st = [0.01, 0.03, 0.05]
iterations = [1, 2, 3, 5, 8, 10]

# Grid for BM25 weighting without transposition
reg_bm = [0.01, 0.03, 0.05]
iterations_bm = [1, 2, 3]

# Grid for TF-IDF weighting without transposition
reg_tfidf = [0.01, 0.03, 0.05]
iterations_tfidf = [1, 2, 3, 5]

In [13]:
%%time
# Each entry: (progress message, zero-argument call that runs one model family).
# The lambdas read `result` and `rec_met` at call time, so every step receives
# the objects returned by the previous one.
training_steps = [
    # Plain ALS on the unweighted matrix
    ('Шаг 1/5. Обучаем дефолтную als модель',
     lambda: als_base(rec_met, result, sparse_user_item, factors, reg_st, iterations, N=500)),
    # TF-IDF weighting, matrix not transposed
    ('Шаг 2/5. Обучаем als модель со взвешиванием tfidf без трансформации user_item_matrix',
     lambda: tfidf(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_tfidf, iterations_tfidf, N=500)),
    # TF-IDF weighting on the transposed matrix
    ('Шаг 3/5. Обучаем als модель со взвешиванием tfidf c трансформацией user_item_matrix',
     lambda: tfidf_T(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=500)),
    # BM25 weighting, matrix not transposed
    ('Шаг 4/5. Обучаем als модель со взвешиванием bm25 без трансформации user_item_matrix',
     lambda: bm25(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_bm, iterations_bm, N=500)),
    # BM25 weighting on the transposed matrix
    ('Шаг 5/5. Обучаем als модель со взвешиванием bm25 c трансформацией user_item_matrix',
     lambda: bm25_T(rec_met, result, sparse_user_item, user_item_matrix, factors, reg_st, iterations, N=500)),
]

for step_message, run_step in training_steps:
    print(step_message)
    result, rec_met = run_step()

Шаг 1/5. Обучаем дефолтную als модель
Шаг 2/5. Обучаем als модель со взвешиванием tfidf без трансформации user_item_matrix
Шаг 3/5. Обучаем als модель со взвешиванием tfidf c трансформацией user_item_matrix
Шаг 4/5. Обучаем als модель со взвешиванием bm25 без трансформации user_item_matrix
Шаг 5/5. Обучаем als модель со взвешиванием bm25 c трансформацией user_item_matrix
CPU times: total: 6h 35min 51s
Wall time: 41min 9s


In [15]:
# One row per trained configuration, with a single column holding its mean recall.
# NOTE(review): the training cell passes N=500 to recall_at_k, so the
# 'Recall@5' label looks like a misnomer (presumably Recall@500) - confirm and
# rename together with the cells that read this column.
# NOTE(review): this rebinds `data`, which held the raw transactions frame.
data = pd.DataFrame([rec_met], index=['Recall@5']).T

In [19]:
# Configurations ordered from highest to lowest recall
data.sort_values(by='Recall@5', ascending=False)

Unnamed: 0,Recall@5
als_tfidf_150_0.01_3,0.192145
als_bm25_T_250_0.01_2,0.191859
als_bm25_T_250_0.03_2,0.191798
als_bm25_T_250_0.01_3,0.191560
als_bm25_T_150_0.01_3,0.191182
...,...
als_50_0.05_1,0.166797
als_50_0.03_1,0.166589
als_bm25_T_50_0.05_1,0.162419
als_bm25_T_50_0.03_1,0.162154


In [17]:
# Best configuration(s) by the recall metric; ties, if any, are all shown
best_score = data['Recall@5'].max()
data[data['Recall@5'] == best_score]

Unnamed: 0,Recall@5
als_tfidf_150_0.01_3,0.192145
