<a href="https://colab.research.google.com/github/PoluboyarinovSI/SF_DataScience/blob/main/project_8/lfm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lightfm
import numpy as np
import pandas as pd

from sklearn import preprocessing, metrics
from scipy import sparse
from scipy.sparse import coo_matrix, csr_matrix

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score

import warnings
warnings.filterwarnings("ignore")

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m184.3/316.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808333 sha256=0c435d1cc9b1cc11e0e75d77d1958ab6b12b34f988bd77afd1581240ea7d2d97
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [2]:
raw_data_events = pd.read_csv('events.csv')

# кодирование стобца события
# 0 - просмотр + добавление в корзину
# 1 - покупка
raw_data_events['event_grade'] = raw_data_events['event'].apply(lambda x: 1 if x == 'transaction' else 0)
raw_data_events.event_grade.sum()

# преобразование типа данных признаков
raw_data_events['event'] = raw_data_events['event'].astype('category')

# формирование временных признаков
raw_data_events['date'] = pd.to_datetime(raw_data_events['timestamp'], unit='ms', origin='unix')
raw_data_events['year'] = raw_data_events['date'].dt.year
raw_data_events['month'] = raw_data_events['date'].dt.month
raw_data_events['day_of_week'] = raw_data_events['date'].dt.day
raw_data_events['hour'] = raw_data_events['date'].dt.hour
display(raw_data_events.head())

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_grade,date,year,month,day_of_week,hour
0,1433221332117,257597,view,355908,,0,2015-06-02 05:02:12.117,2015,6,2,5
1,1433224214164,992329,view,248676,,0,2015-06-02 05:50:14.164,2015,6,2,5
2,1433221999827,111016,view,318965,,0,2015-06-02 05:13:19.827,2015,6,2,5
3,1433221955914,483717,view,253185,,0,2015-06-02 05:12:35.914,2015,6,2,5
4,1433221337106,951259,view,367447,,0,2015-06-02 05:02:17.106,2015,6,2,5


In [3]:
# формирование датасета для модели LightFM
lfm_data_events = raw_data_events

train_val_point = int(np.round(lfm_data_events.shape[0]*0.8))
lfm_train_data_events = lfm_data_events.iloc[0:train_val_point]
lfm_val_data_events = lfm_data_events.iloc[train_val_point::]
lfm_val_data_events = lfm_val_data_events[(lfm_val_data_events['visitorid'].isin(lfm_train_data_events['visitorid']))
                          & (lfm_val_data_events['itemid'].isin(lfm_train_data_events['itemid']))]

print(f'Количество событий в тренировочном датасете: {lfm_train_data_events.shape[0]}')
print(f'Количество событий в валидационном датасете: {lfm_val_data_events.shape[0]}')

Количество событий в тренировочном датасете: 245405
Количество событий в валидационном датасете: 8790


In [4]:
# кодирование признаков
cols_to_code = ['visitorid', 'itemid']
cat_lfm_train_data_events = dict()
cat_lfm_val_data_events = dict()

for col in cols_to_code:
    cat_encoder = preprocessing.LabelEncoder()
    cat_lfm_train_data_events[col] = cat_encoder.fit_transform(lfm_train_data_events[col].values)
    cat_lfm_val_data_events[col] = cat_encoder.transform(lfm_val_data_events[col].values)

# Кодирование признака события
event_scores = dict()
event_encoder = preprocessing.LabelEncoder()
event_scores['train'] = event_encoder.fit_transform(lfm_train_data_events['event'].values)
event_scores['val'] = event_encoder.transform(lfm_val_data_events['event'].values)

In [5]:
# item-user матрица
num_visitors = len(np.unique(cat_lfm_train_data_events['visitorid']))
num_items = len(np.unique(cat_lfm_train_data_events['itemid']))

sparse_matrix = dict()

sparse_matrix['train'] = csr_matrix((event_scores['train'],
                                  (cat_lfm_train_data_events['visitorid'],
                                  cat_lfm_train_data_events['itemid'])),
                                  shape=(num_visitors, num_items))

sparse_matrix['val'] = csr_matrix((event_scores['val'],
                                  (cat_lfm_val_data_events['visitorid'],
                                  cat_lfm_val_data_events['itemid'])),
                                  shape=(num_visitors, num_items))

In [6]:
lfm_model = LightFM(no_components=150, learning_rate=0.05, loss='warp', random_state=42)
lfm_model.fit(sparse_matrix['train'], epochs=30, num_threads=4, verbose=True)

# оценка качества модели
display(auc_score(lfm_model, sparse_matrix['val']).mean())

Epoch: 100%|██████████| 30/30 [00:52<00:00,  1.75s/it]


0.84556925

In [7]:
# расчет метрики Precision@3
map_at3 = precision_at_k(lfm_model, sparse_matrix['val'], k=3).mean()
print('Mean Average Precision at 3 (validation dataset): {:.5f}'.format(map_at3))

Mean Average Precision at 3 (validation dataset): 0.17493


In [8]:
# Реализация механизма выдачи рекомендаций
all_items = np.unique(cat_lfm_train_data_events['itemid'])
all_users = np.unique(cat_lfm_train_data_events['visitorid'])
item_id = np.arange(0, sparse_matrix['train'].shape[1])
user_id = 5

In [9]:
# Формирование предсказаний для конкретного пользователя
list_pred = lfm_model.predict(user_id, item_id)
recomendations_id = np.argsort(-list_pred)[:3]
rec_items = all_items[recomendations_id]
print('Рекомендация для пользователя {}: {}'.format(user_id, rec_items))

Рекомендация для пользователя 5: [8578 8958 6799]


In [10]:
import pickle

In [11]:
with open('lfm_model.pkl', 'wb') as output:
    pickle.dump(lfm_model, output)

In [12]:
with open('all_items.pkl', 'wb') as output:
    pickle.dump(all_items, output)

In [13]:
with open('all_users.pkl', 'wb') as output:
    pickle.dump(all_users, output)

In [14]:
with open('item_id.pkl', 'wb') as output:
    pickle.dump(item_id, output)