# Домашнее задание

## Подбор оптимальных гиперпараметров для LightFM

- Постройте модели с помощью библиотеки LightFM, изменяя следующие параметры
  - функция потерь, регуляризация
  - количество компонент
  - отдельно постройте модели, используя только матрицу взаимодействий и матрицу взаимодействий + признаки (набор признаков может быть различным, например как на вебинаре)
  
- Посчитайте метрики (Precision@5, MAP@5) для разных наборов гиперпараметров и выберете лучший набор

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from lightfm import LightFM
# from lightfm.evaluation import precision_at_k, recall_at_k

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from best_rec_lib.metrics import precision_at_k, ap_k
from best_rec_lib.utils import prefilter_items

import warnings
warnings.filterwarnings("ignore")



In [2]:
data = pd.read_csv('../retail_train.csv')
item_features = pd.read_csv('../product.csv')
user_features = pd.read_csv('../hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [4]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [5]:
user_features['age_desc'].unique()

array(['65+', '45-54', '25-34', '35-44', '19-24', '55-64'], dtype=object)

In [6]:
user_features['marital_status_code'].unique()

array(['A', 'U', 'B'], dtype=object)

In [7]:
user_features['household_size_desc'].unique()

array(['2', '3', '4', '1', '5+'], dtype=object)

## 1. Filter items

In [8]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, 5000, item_features)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


## 2. Prepare data set

In [10]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(2)

item_id,244960,818981,819330,819400,819487,819590,819969,820011,820301,820341,...,15972565,15972790,16053142,16100266,16729296,16729299,16729363,16729415,16770156,16809649
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

test_user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

In [12]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

## 3. Prepare user and item features

In [13]:
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)

item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)

user_feat.head(2)

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2.0,None/Unknown
2,,,,,,,


In [14]:
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [15]:
user_feat_lightfm.head(2)

Unnamed: 0_level_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,marital_status_code_A,marital_status_code_B,marital_status_code_U,income_desc_100-124K,...,hh_comp_desc_Unknown,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
item_feat_lightfm.head(2)

Unnamed: 0_level_0,manufacturer_2.0,manufacturer_5.0,manufacturer_16.0,manufacturer_20.0,manufacturer_26.0,manufacturer_33.0,manufacturer_35.0,manufacturer_36.0,manufacturer_42.0,manufacturer_43.0,...,curr_size_of_product_L 12 OZ,curr_size_of_product_L 16 OZ,curr_size_of_product_L 24 OZ,curr_size_of_product_L16 5/8 OZ,curr_size_of_product_LB,curr_size_of_product_N 40 OZ,curr_size_of_product_PINT,curr_size_of_product_PK,curr_size_of_product_PT,curr_size_of_product_QT
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
244960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
818981,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Создадим датасет занесения рекомендаций и расчета метрик

In [17]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[856942, 951954, 971585, 1042083, 1131115, 883..."
1,3,[920626]


In [18]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))
new_test_users = list(set(data_test['user_id']) - set(data_train['user_id']))

В тестовом дата сете 1807 юзеров
В тестовом дата сете 3 новых юзеров


In [19]:
# уберем пользователей, которых нет в трейне
result = result[~result['user_id'].isin(new_test_users)]

### ALS + BM25 weighting

Получим тестовые метрики до применения LightFM. Используем ALS и взвешивание BM25 с лучшими гиперпараметрами, вычисленными ранее

In [20]:
def get_recommendations(user, model, sparse_user_item, N=5):
    res = [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item[userid_to_id[user]],   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]]
    return res

In [21]:
bm25_user_item_matrix = bm25_weight(user_item_matrix.T).T.tocsr()

model = AlternatingLeastSquares(factors=350, 
                regularization=0.05,
                iterations=1,
                calculate_training_loss=True, 
                num_threads=16,
                random_state=42)

model.fit(bm25_user_item_matrix,  # На вход item-user matrix
          show_progress=False)

In [22]:
%%time
result['als_bm25'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N=5))

CPU times: total: 1min 17s
Wall time: 5.76 s


In [23]:
# Словари метрик
prec_met = dict()
map_met = dict()

In [24]:
print('Precision@k: ', result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean())
print('Recall@k: ', result.apply(lambda row: ap_k(row['als_bm25'], row['actual']), axis=1).mean())

Precision@k:  0.1592017738359189
Recall@k:  0.11200295639319985


In [25]:
prec_met['als_bm25'] = result.apply(lambda row: precision_at_k(row['als_bm25'], row['actual']), axis=1).mean()
map_met['als_bm25'] = result.apply(lambda row: ap_k(row['als_bm25'], row['actual']), axis=1).mean()

Получили значение Precision@5 = 0.1592017738359189 и MAP@5 = 0.11200295639319985. Это существенно меньше, чем без предварительной фильтации посредством функции prefilter_items, но для сравнения это не принципиально.

### LightFM

In [26]:
def get_recommendations_lfm(x, test_item_ids, model, N=5):  
    # Получаем предсказания для пользователя по всем товарам   
    predictions = model.predict(user_ids=int(userid_to_id[x]), item_ids=test_item_ids,
                                user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                                item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                                num_threads=16)
        
    # Делаем из массива список
    predictions = list(predictions)
    
    # Находим id 6 товаров с максимальным значением. 6, т.к. может быть рекомендован фиктивный товар
    lst_num = list(enumerate(predictions, 0)) # кортежи значений с индексами
    prod_max_id = [] # список товаров для пользователя с максимальным значением
    for _ in range(N+1):
        spam = max(lst_num, key=lambda i : i[1])
        prod_max_id.append(spam) # добавляем в спсисок товар с максимальным значением
        lst_num.remove(spam) # удаляем этот товар из исходного списка кортежей
    
    prod_max_id = [el[0] for el in prod_max_id] # оставляем в списке только индексы
    
    # Получаем список из 5 id рекомендуемых товаров
    prod_max_id = [id_to_itemid[el] for el in prod_max_id] # переводим индексы в реальные id товаров
    if 999999 in prod_max_id:
        prod_max_id.remove(999999) # удаляем фиктивный, если он рекомендован

    return prod_max_id[:N]

In [27]:
# test_item_ids = np.arange(0, len(item_feat_lightfm.values))
test_item_ids = [itemid_to_id[el] for el in list(item_feat_lightfm.index)]

components = [10, 30, 50, 100]
loss_func = ['bpr', 'warp']
learning_rate = [0.001, 0.005, 0.01, 0.05]
item_alpha = [0.05, 0.001, 0.005, 0.0001]

In [28]:
def fitLightFM(prec_met, map_met, result, test_item_ids, components, learning_rate, loss_func, item_alpha, N=5):
    
    for comp in components:
        for loss in loss_func:
            for lr in learning_rate:
                for alpha in item_alpha:
                
                    model = LightFM(no_components=comp,
                                    loss=loss,
                                    learning_rate=lr, 
                                    item_alpha=alpha, 
                                    user_alpha=0.1, 
                                    random_state=42)

                    model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
                              sample_weight=coo_matrix(user_item_matrix), # вес можно не передавать
                              user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                              item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                              epochs=15, 
                              num_threads=16)

                    result[f'LightFM_{comp}_{loss}_{lr}_{alpha}'] = result['user_id'].map(lambda x: get_recommendations_lfm(x, test_item_ids, model, N))
                    prec_met[f'LightFM_{comp}_{loss}_{lr}_{alpha}'] = result.apply(lambda row: precision_at_k(row[f'LightFM_{comp}_{loss}_{lr}_{alpha}'], row['actual']), axis=1).mean()
                    map_met[f'LightFM_{comp}_{loss}_{lr}_{alpha}'] = result.apply(lambda row: ap_k(row[f'LightFM_{comp}_{loss}_{lr}_{alpha}'], row['actual']), axis=1).mean()
                    
    return result, prec_met, map_met

In [29]:
%%time
result, prec_met, map_met = fitLightFM(prec_met, map_met, result, test_item_ids, components, learning_rate, loss_func, item_alpha, N=5)

CPU times: total: 11h 35min 52s
Wall time: 11h 36min 12s


Обучим дополнительно модель без фичей. Без перебора параметров

In [30]:
%%time

# Обучаем модель без фичей
model = LightFM(no_components=150,
                loss='bpr', # 'warp'
                learning_rate=0.05, 
                item_alpha=0.0001, 
                user_alpha=0.1, 
                random_state=42)

model.fit((sparse_user_item > 0) * 1,  # user-item matrix из 0 и 1
          sample_weight=coo_matrix(user_item_matrix), # вес можно не передавать
#           user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
#           item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=16)

CPU times: total: 1min 51s
Wall time: 1min 51s


<lightfm.lightfm.LightFM at 0x203969fc0d0>

In [31]:
%%time

result['LightFM_no_features'] = result['user_id'].map(lambda x: get_recommendations_lfm(x, test_item_ids, model, N=5))

CPU times: total: 3min 44s
Wall time: 3min 44s


In [33]:
print('Precision@k: ', result.apply(lambda row: precision_at_k(row['LightFM_no_features'], row['actual']), axis=1).mean())
print('Recall@k: ', result.apply(lambda row: ap_k(row['LightFM_no_features'], row['actual']), axis=1).mean())

Precision@k:  0.0023281596452328166
Recall@k:  0.0010070214338507022


In [34]:
prec_met['LightFM_no_features'] = result.apply(lambda row: precision_at_k(row['LightFM_no_features'], row['actual']), axis=1).mean()
map_met['LightFM_no_features'] = result.apply(lambda row: ap_k(row['LightFM_no_features'], row['actual']), axis=1).mean()

Найдем лучшую модель

In [35]:
data = [prec_met, map_met]
data = pd.DataFrame(data, index =['Precision@5', 'MAP@5'])
data = data.T

In [36]:
data.shape

(130, 2)

In [41]:
data

Unnamed: 0,Precision@5,MAP@5
als_bm25,0.159202,0.112003
LightFM_10_bpr_0.001_0.05,0.003215,0.001587
LightFM_10_bpr_0.001_0.001,0.003215,0.001587
LightFM_10_bpr_0.001_0.005,0.003215,0.001587
LightFM_10_bpr_0.001_0.0001,0.003215,0.001587
...,...,...
LightFM_100_warp_0.05_0.05,0.004324,0.002382
LightFM_100_warp_0.05_0.001,0.004989,0.002615
LightFM_100_warp_0.05_0.005,0.005211,0.002459
LightFM_100_warp_0.05_0.0001,0.005765,0.002197


In [38]:
# Лучшие параметры модели по метрике Precision@5
data[data['Precision@5'] == data['Precision@5'].max()]

Unnamed: 0,Precision@5,MAP@5
als_bm25,0.159202,0.112003


In [39]:
# Лучшие параметры модели по метрике MAP@5
data[data['MAP@5'] == data['MAP@5'].max()]

Unnamed: 0,Precision@5,MAP@5
als_bm25,0.159202,0.112003


**Выводы:** модель LightFM при любых параметрах показала крайне низкие результаты, которые оказались значительно хуже, чем показала модель ALS со взвешиванием BM25