- подобрать параметры для LightFM
- сделать предсказание
- посчитать метрики

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import copy
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../raw_data/retail_train.csv')
item_features = pd.read_csv('../raw_data/product.csv')
user_features = pd.read_csv('../raw_data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


## 1. Filter items

In [3]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


## 2. Prepare data set

In [4]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробовать другие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(2)

item_id,397896,818980,819063,819255,819304,819308,819330,819518,819594,819765,...,15596515,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]
test_user_item_matrix = pd.pivot_table(data_test, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

test_user_item_matrix = user_item_matrix.astype(float)

In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 914190, 95804..."
1,3,"[851057, 872021, 878302, 879948, 909638, 91320..."


In [7]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

## 3. Prepare user and item features

In [8]:
user_feat = pd.DataFrame(user_item_matrix.index)
user_feat = user_feat.merge(user_features, on='user_id', how='left')
user_feat.set_index('user_id', inplace=True)  

item_feat = pd.DataFrame(user_item_matrix.columns)
item_feat = item_feat.merge(item_features, on='item_id', how='left')
item_feat.set_index('item_id', inplace=True)

user_feat.head(2)

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2.0,None/Unknown
2,,,,,,,


In [9]:
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [10]:
user_feat_lightfm.head(2)

Unnamed: 0_level_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,marital_status_code_A,marital_status_code_B,marital_status_code_U,income_desc_100-124K,...,hh_comp_desc_Unknown,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
model = LightFM(no_components=30,
                loss='bpr', # 'warp'
                learning_rate=0.05, 
                item_alpha=0.1, user_alpha=0.1, 
                random_state=42)

model.fit((sparse_user_item > 0) * 1,
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          sample_weight=coo_matrix(user_item_matrix),  
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4) 

<lightfm.lightfm.LightFM at 0x7f7b2010c8e0>

In [12]:
user_emb = model.get_user_representations(features=csr_matrix(user_feat_lightfm.values).tocsr())
item_emb = model.get_item_representations(features=csr_matrix(item_feat_lightfm.values).tocsr())

### - предсказание и метрика

In [13]:
# фя принимает 1 userid, for_items, или по всем items из матрицы
def get_recommendations_LFM(user, model, sparse_user_item, N=5, 
                            for_items=None, fake_id=999999):
    
    user = int(userid_to_id[1])  # user_ids требует int или np.int32, numpy.int32 не подходит
    if for_items:
        item_ids = np.array([itemid_to_id[i] for i in for_item])
    else:
        item_ids = np.arange(sparse_user_item.shape[1])  # или передавать снаружи?
    
    # убираем фейковый id
    if fake_id:
        item_ids = np.delete(item_ids, itemid_to_id[fake_id])
    
    predictions = model.predict(user_ids=user, item_ids=item_ids,
                            user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
                            item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
                            num_threads=4)
    mask = predictions.argsort()[::-1][:5]
    recs = item_ids[mask]
    recs = [id_to_itemid[x] for x in recs]
    return recs

Предсказание для одного user-а

In [14]:
get_recommendations_LFM(1, model, sparse_user_item)

[10356441, 1113780, 9835619, 909479, 1049788]

Тот же результат (за исключением 999999) - перемножением эмбеддингов

In [16]:
user = int(userid_to_id[1])
item_ids = np.arange(sparse_user_item.shape[1])
# item_ids = np.delete(item_ids, itemid_to_id[999999])

In [17]:
pred = (user_emb[1][0] @ item_emb[1].T + user_emb[0][0] + item_emb[0])
pred

array([-196.14073, -338.69025, -339.62585, ..., -338.69464, -338.50528,
       -232.13681], dtype=float32)

In [18]:
mask = pred.argsort()[::-1][:6]
recs = item_ids[mask]
recs = [id_to_itemid[x] for x in recs]
recs

[999999, 10356441, 1113780, 9835619, 909479, 1049788]

In [19]:
%%time
result['LFM'] = result['user_id'].apply(lambda x: get_recommendations_LFM(x, model, sparse_user_item))

CPU times: user 9min 52s, sys: 24.6 s, total: 10min 17s
Wall time: 2min 34s


Как видно, даёт всем одинаковые рекомендации

In [20]:
result.head(5)

Unnamed: 0,user_id,actual,LFM
0,1,"[821867, 834484, 856942, 865456, 914190, 95804...","[10356441, 1113780, 9835619, 909479, 1049788]"
1,3,"[851057, 872021, 878302, 879948, 909638, 91320...","[10356441, 1113780, 9835619, 909479, 1049788]"
2,6,"[920308, 926804, 1017061, 1078346, 1120741, 82...","[10356441, 1113780, 9835619, 909479, 1049788]"
3,7,"[840386, 889774, 898068, 909714, 953476, 97699...","[10356441, 1113780, 9835619, 909479, 1049788]"
4,8,"[835098, 872137, 910439, 924610, 1041259, 5569...","[10356441, 1113780, 9835619, 909479, 1049788]"


In [21]:
result.LFM.apply(lambda x: frozenset(x)).nunique()

1

In [22]:
# и метрика очень невелика
prec5 = result.apply(lambda row: precision_at_k(row['LFM'], row['actual']), axis=1).mean()
prec5

0.004805725971370143

In [23]:
# максимальное единичное предсказание
result.apply(lambda row: precision_at_k(row['LFM'], row['actual']), axis=1).max()

0.2

### - подбор гиперпараметров

In [24]:
def grid_search(base_model, fit_matrix, user_features, item_features, 
                result_df, param_grid):
    model_params = []
    metrics = []
    keys, values = zip(*param_grid.items())
    
    for i, v in enumerate(itertools.product(*values)):
        params = dict(zip(keys, v))
        model_params.append(params)
        
        this_model = copy.deepcopy(base_model)
        for k, v in params.items():
            setattr(this_model, k, v)

        this_model.fit((fit_matrix > 0) * 1,
          user_features=csr_matrix(user_features.values).tocsr(),
          sample_weight=coo_matrix(fit_matrix),  
          item_features=csr_matrix(item_features.values).tocsr(),
          epochs=15, 
          num_threads=4) 
        
        result_df[f'LFM_{i}'] = result_df['user_id'].apply(lambda x: get_recommendations_LFM(x, this_model, fit_matrix, 5))
        prec5 = result_df.apply(lambda row: precision_at_k(row[f'LFM_{i}'], row['actual']), axis=1).mean()
        metrics.append(prec5)
        
        print(f'params={params}, precision@k={prec5}')
    
    return model_params, metrics

In [25]:
# 4 модели для начала
param_grid = {'no_components': [20, 50],
              'loss': ['bpr', 'warp']}

In [168]:
model = LightFM(learning_rate=0.05, 
                item_alpha=0.6, user_alpha=0.6, 
                random_state=42)

In [169]:
%%time

par, metr = grid_search(base_model=model, 
                        fit_matrix=sparse_user_item, 
                        user_features=user_feat_lightfm,
                        item_features=item_feat_lightfm,
                        result_df=result, 
                        param_grid=param_grid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df[f'LFM_{i}'] = result_df['user_id'].apply(lambda x: get_recommendations_LFM(x, this_model, fit_matrix, 5))


params={'no_components': 20, 'loss': 'bpr'}, precision@k=0.054042988741044476


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df[f'LFM_{i}'] = result_df['user_id'].apply(lambda x: get_recommendations_LFM(x, this_model, fit_matrix, 5))


params={'no_components': 20, 'loss': 'warp'}, precision@k=0.010440122824974392


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df[f'LFM_{i}'] = result_df['user_id'].apply(lambda x: get_recommendations_LFM(x, this_model, fit_matrix, 5))


params={'no_components': 50, 'loss': 'bpr'}, precision@k=0.002251791197543501
params={'no_components': 50, 'loss': 'warp'}, precision@k=0.010440122824974392
CPU times: user 34min 28s, sys: 1min 6s, total: 35min 34s
Wall time: 8min 54s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df[f'LFM_{i}'] = result_df['user_id'].apply(lambda x: get_recommendations_LFM(x, this_model, fit_matrix, 5))


params={'no_components': 20, 'loss': 'bpr'}, precision@k=0.054042988741044476

Оставим bpr и число компонент 20:

In [175]:
model = LightFM(no_components=20,
                learning_rate=0.05,
                loss='bpr',
                random_state=42)

In [187]:
# 12 моделей
param_grid = {'item_alpha': [0.6, 0.8],
              'user_alpha': [0.6, 0.8, 1],
              'max_sampled': [10, 30]
             }

In [188]:
%%time

par, metr = grid_search(base_model=model, 
                        fit_matrix=sparse_user_item, 
                        user_features=user_feat_lightfm,
                        item_features=item_feat_lightfm,
                        result_df=result, 
                        param_grid=param_grid)

params={'item_alpha': 0.6, 'user_alpha': 0.6, 'max_sampled': 10}, precision@k=0.023643807574206813
params={'item_alpha': 0.6, 'user_alpha': 0.6, 'max_sampled': 30}, precision@k=0.002558853633572161
params={'item_alpha': 0.6, 'user_alpha': 0.8, 'max_sampled': 10}, precision@k=0.006038894575230292
params={'item_alpha': 0.6, 'user_alpha': 0.8, 'max_sampled': 30}, precision@k=0.0015353121801432957
params={'item_alpha': 0.6, 'user_alpha': 1, 'max_sampled': 10}, precision@k=0.02159672466734905
params={'item_alpha': 0.6, 'user_alpha': 1, 'max_sampled': 30}, precision@k=0.010542476970317276
params={'item_alpha': 0.8, 'user_alpha': 0.6, 'max_sampled': 10}, precision@k=0.0030706243602865928
params={'item_alpha': 0.8, 'user_alpha': 0.6, 'max_sampled': 30}, precision@k=0.005629477993858748
params={'item_alpha': 0.8, 'user_alpha': 0.8, 'max_sampled': 10}, precision@k=0.005731832139201634
params={'item_alpha': 0.8, 'user_alpha': 0.8, 'max_sampled': 30}, precision@k=0.0035823950870010257
params={'ite

Итоговые параметры после более тонкого подбора
- не смотря на random_state, даёт большой разброс в ранжировании (от примерно 0.001 до 0.83 при одинаковых параметрах)

In [39]:
model = LightFM(no_components=20,
                learning_rate=0.05,
                loss='bpr',
                item_alpha=0.6,
                user_alpha=0.8,
                random_state=42)

model.fit((sparse_user_item > 0) * 1,
          user_features=csr_matrix(user_feat_lightfm.values).tocsr(),
          sample_weight=coo_matrix(user_item_matrix),  
          item_features=csr_matrix(item_feat_lightfm.values).tocsr(),
          epochs=15, 
          num_threads=4) 

<lightfm.lightfm.LightFM at 0x7f7b2010ca90>

In [40]:
result['LFM_optim'] = result['user_id'].apply(lambda x: get_recommendations_LFM(x, model, sparse_user_item))
prec5 = result.apply(lambda row: precision_at_k(row['LFM_optim'], row['actual']), axis=1).mean()
prec5

0.053067484662577144