In [1]:
import os
import re
import sys
import time
import implicit
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

In [2]:
# Switch between the author's local checkout and the repo-relative layout.
IS_LOCAL = False
if IS_LOCAL:
    HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/'
else:
    HOME_DIR = '../../'
# Exported via the environment; presumably read by utils.prepare_data — TODO confirm.
os.environ['HOME_DIR'] = HOME_DIR

# All artefact paths below are built from WORKING_DIR, and the kernel is moved there.
WORKING_DIR = f'{HOME_DIR}/models/als/'
os.chdir(WORKING_DIR)

# The import must come after the path tweak: "../.." is resolved against the
# working directory set above.
sys.path.append("../..")
from utils.prepare_data import get_zen_data

# ALS

## Load data

In [3]:
# Load the item table and the train/test interaction frames via the project helper.
# Slow step: the recorded progress bars show roughly 3.5 minutes in total.
items_df, (train_df, test_df) = get_zen_data()

loading items: 328050it [00:32, 9981.21it/s] 
loading users: 42977it [02:53, 247.68it/s]


## CSR-matrix

In [4]:
# Disabled alternative: build the item-user matrix from the raw (un-centred) ratings.
# user_ids = np.concatenate([[v]*v_len for v, v_len in zip(
#     train_df['userId'].values, train_df['userItems'].apply(len).values)])
# items_ids = np.concatenate(train_df['userItems'].values)
# ratings = np.concatenate(train_df['userRatings'].values)
# item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [5]:
# Flatten the per-user interaction lists into three parallel arrays
# (row index, column index, value) and assemble the item x user matrix.

# One user id per interaction: repeat each userId by the length of its item list.
items_per_user = train_df['userItems'].apply(len).values
user_ids = np.repeat(train_df['userId'].values, items_per_user)
items_ids = np.concatenate(train_df['userItems'].values)

# Centre every user's ratings on that user's own mean. The per-user arrays have
# different lengths, so they are concatenated straight from a list: wrapping the
# ragged list in np.array() first builds an object array, which newer NumPy
# versions reject with a ValueError.
ratings = np.concatenate(
    [user_ratings - user_ratings.mean() for user_ratings in train_df['userRatings'].values]
)

# Rows are items and columns are users, i.e. shape (n_items, n_users).
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [6]:
# Per-user mean rating, in train_df row order; added back to the model's
# predictions at evaluation time because the training matrix is mean-centred.
mean_ratings = np.array([np.mean(user_ratings) for user_ratings in train_df['userRatings'].values])

### Save matrix

In [7]:
# sparse.save_npz(f'{WORKING_DIR}item_user_data.npz', item_user_data)

In [8]:
# item_user_data = sparse.load_npz(f'{WORKING_DIR}item_user_data.npz')

## Train model

In [9]:
# Sanity check: the matrix was built with (items_ids, user_ids) as (row, col),
# so the shape reads (n_items, n_users).
item_user_data.shape

(328050, 42977)

In [10]:
# Number of training users (one row per user in train_df).
users_n = len(train_df.index)
# NOTE(review): this is the largest item id seen in training, not a count.
# With 0-based ids the number of items would be this value + 1.
items_n = train_df['userItems'].map(max).max()
print(f'users_n: {users_n}\titems_n: {items_n}')

users_n: 42977	items_n: 328049


In [11]:
# Baseline ALS configuration; the grid search further down varies
# `factors` and `regularization` around these values.
als_params = dict(
    factors=64,
    regularization=0.001,
    iterations=20,
    calculate_training_loss=True,
)
model = implicit.als.AlternatingLeastSquares(**als_params)
# Fit on the item x user matrix built above.
model.fit(item_user_data)

100%|██████████| 20.0/20 [02:18<00:00,  6.86s/it, loss=0.00721]


In [24]:
# Title of the item at position 1, used as the query item for the similarity check below.
text = items_df['title'].iloc[1]
text

'История улицы Ирининской в  Гомеле'

In [23]:
# Closest items to item 1 according to the fitted model; the recorded output
# is a list of (item_id, score) pairs with the query item itself first.
model.similar_items(1)

[(1, 0.013250079),
 (192494, 0.0132500585),
 (308446, 0.013250051),
 (113800, 0.013250028),
 (114111, 0.013249887),
 (318089, 0.011219429),
 (72474, 0.010409303),
 (42722, 0.009123073),
 (309233, 0.0090502305),
 (7028, 0.008998316)]

In [25]:
# Title of the highest-scoring neighbour (id 192494) from the similar_items output.
items_df['title'].iloc[192494]

'Где можно дешево покушать в центре Праги'

In [26]:
# Full text of another close neighbour (id 308446) from the similar_items output.
items_df['content'].iloc[308446]

'Бредовский двор - еще одно разрекламированное заведение с  большим количеством отзывов и  посетителей. Находится недалеко от Вацлавской площади. Обстановка стилизованная под старину.   Всегда свежее танковое пиво. Наливают чешскую классику: светлое - Pilsner Urquell, тёмное - Kozel   Заказали рульку. Подача оригинальная  Стоит 329 крон . На вкус - у Фердинанда рулька нам понравилась больше. Здесь показалась более пресная что ли... Свичкова - тушеная говядина  Она идет со сливками и сладковатым брусничным соусом. Не впечатлило. Кнедлики - вареный хлеб - тоже. Оставили лебедям) Стоимость - 209 крон. Драники и чесночный хлеб - на четверочку. Пиво - 42 крон за 0,5  Вывод: еще раз посещать не станем; бывает более вкусная кухня. Возможно - кому-то повезло больше нашего)    '

## Test model

In [12]:
from sklearn.metrics import log_loss

In [13]:
# Mean per-user log loss of the baseline model on the test split.
loss = 0
eps = 1e-04
# Transposed (user x item) view; not used by the loop below.
user_items_data = item_user_data.T
# iterrows() has no length, so pass the total explicitly to get a real progress bar.
for _, (idd, items, ratings) in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    # Score only this user's test items instead of every item in the catalogue,
    # then add back the user's mean that was subtracted before training.
    # NOTE(review): mean_ratings is indexed by userId here, which assumes
    # train_df row i belongs to user i — confirm.
    predictied_ratings = model.item_factors[items].dot(model.user_factors[idd]) + mean_ratings[idd]
#     predictied_ratings = (predictied_ratings - predictied_ratings.min()) / (predictied_ratings.max() - predictied_ratings.min())
    # log_loss needs probabilities strictly inside (0, 1).
    predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
    loss += log_loss(ratings.astype(np.float64), predictied_ratings)
loss /= test_df.shape[0]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [14]:
# Log loss averaged over the rows of test_df (computed in the previous cell).
loss

0.36588847761163773

## Find best params

In [15]:
# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel process. Set it on the kernel's own environment instead.
# NOTE(review): BLAS libraries may read this only when first loaded; if the
# thread count does not change, set it before numpy is imported — confirm.
os.environ['OPENBLAS_NUM_THREADS'] = '1'

In [20]:
def train_als(params):
    """Fit an ALS model on the notebook-level ``item_user_data`` and time it.

    Args:
        params: keyword arguments forwarded unchanged to
            ``implicit.als.AlternatingLeastSquares``.

    Returns:
        A ``(model, elapsed_time)`` tuple: the fitted model and the wall-clock
        seconds spent constructing and fitting it.
    """
    started_at = time.time()
    als = implicit.als.AlternatingLeastSquares(**params)
    als.fit(item_user_data)
    return als, time.time() - started_at


def test_als(model, test_df, eps=1e-04, user_means=None):
    """Return the mean per-user log loss of ``model`` on ``test_df``.

    Args:
        model: fitted model exposing ``item_factors`` and ``user_factors``.
        test_df: frame whose rows unpack to ``(user id, item ids, ratings)``.
        eps: predictions are clipped to ``[eps, 1 - eps]`` before scoring.
        user_means: per-user offsets added to the raw scores, indexed by user
            id. Defaults to the notebook-level ``mean_ratings``.

    Returns:
        The sum of per-user ``log_loss`` values divided by the number of rows
        in ``test_df``.
    """
    if user_means is None:
        # Fall back to the notebook global so existing calls keep working.
        user_means = mean_ratings

    total_loss = 0
    # iterrows() has no length, so pass the total to get a real progress bar.
    for _, (idd, items, ratings) in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        # Score only this user's test items rather than the whole catalogue,
        # then add back the per-user mean removed before training.
        predicted_ratings = model.item_factors[items].dot(model.user_factors[idd]) + user_means[idd]
        # log_loss needs probabilities strictly inside (0, 1).
        predicted_ratings = np.clip(predicted_ratings, eps, 1 - eps)
        total_loss += log_loss(ratings.astype(np.float64), predicted_ratings)
    return total_loss / test_df.shape[0]
    
    
# Grid search over factor count and regularisation strength.
# One line per configuration is appended to als_params.txt, so interrupted
# runs keep the results gathered so far.
calculate_training_loss = True
iterations = 20
# The standalone `regularization = 0.001` assignment was removed: the loop
# variable of the same name always overwrote it before it was read.
FACTORS_GRID = [32, 64, 96, 128]
REGULARIZATION_GRID = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]

with open(f'{WORKING_DIR}als_params.txt', 'a+', encoding='utf-8') as log_file:
    for factors in tqdm(FACTORS_GRID):
        for regularization in REGULARIZATION_GRID:
            params = {
                'factors': factors,
                'regularization': regularization,
                'iterations': iterations,
                'calculate_training_loss': calculate_training_loss,
#                 'use_gpu': False,
            }
            model, elapsed_time = train_als(params)
            loss = test_als(model, test_df)
            # Results are added only after training, so the model never sees these keys.
            params['elapsed_time'] = elapsed_time
            params['loss'] = loss
            # flush=True writes each result to disk as soon as it is available.
            print(params, file=log_file, flush=True)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.59s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.74s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.36s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.36s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.36s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.35s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.35s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.34s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.11s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.14s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.12s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.12s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.09s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.11s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

## Save model

## Load model