In [1]:
import os
import re
import sys
import time
import implicit
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

In [2]:
IS_LOCAL = False
HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/' if IS_LOCAL else '../../'
os.environ['HOME_DIR'] = HOME_DIR
WORKING_DIR = f'{HOME_DIR}/models/als/'
os.chdir(WORKING_DIR)

sys.path.append("../..")
from utils.prepare_data import zen
from utils.evaluate import test

100%|██████████| 4349/4349 [00:18<00:00, 236.99it/s]


# ALS

## Load data

In [3]:
items_df, users_df = zen.items_df(), zen.users_df()

loading items: 328050it [00:35, 9310.19it/s] 
loading users: 42977it [01:07, 637.55it/s]


## CSR-matrix

In [4]:
# user_ids = np.concatenate([[v]*v_len for v, v_len in zip(
#     users_df['userId'].values, users_df['userItems'].apply(len).values)])
# items_ids = np.concatenate(users_df['userItems'].values)
# ratings = np.concatenate(users_df['userRatings'].values)
# item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [5]:
user_ids = np.concatenate([[v]*v_len for v, v_len in zip(
    users_df['userId'].values, users_df['userItems'].apply(len).values)])
items_ids = np.concatenate(users_df['userItems'].values)
ratings = np.concatenate(np.array(list(map(lambda x: x - x.mean(), users_df['userRatings'].values))))
item_user_data_mean = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [6]:
mean_ratings = np.array(list(map(np.mean, users_df['userRatings'].values)))

### Save matrix

In [7]:
# sparse.save_npz(f'{WORKING_DIR}item_user_data.npz', item_user_data)

In [8]:
# item_user_data = scipy.sparse.load_npz(f'{WORKING_DIR}item_user_data.npz')

## Train model

In [11]:
item_user_data_mean.shape

(328050, 42977)

In [12]:
users_n = users_df.shape[0]
items_n = users_df['userItems'].apply(max).max()
print(f'users_n: {users_n}\titems_n: {items_n}')

users_n: 42977	items_n: 328049


In [13]:
model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.001,
    iterations=20,
    calculate_training_loss=True
)
model.fit(item_user_data_mean)

GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.
100%|██████████| 20.0/20 [01:32<00:00,  4.50s/it, loss=0.00917]


In [14]:
USERS_N = users_df.shape[0]

In [28]:
model.item_factors.dot(model.user_factors[user_id])

array([ 5.2560715e-04, -1.6561240e-05, -3.6799705e-05, ...,
        0.0000000e+00,  1.2052821e-03, -8.2327501e-04], dtype=float32)

In [26]:
v = model.item_factors.dot(model.user_factors[user_id]) - model.item_factors.dot(model.user_factors[user_id])

In [31]:
np.sqrt(((v - mean_ratings[user_id])**2).sum())

59.160545

0.10329089598846986

In [16]:
rmse = 0
for user_id in tqdm(np.arange(USERS_N)):
    rmse += ((item_user_data_mean[:, user_id].toarray() - model.item_factors.dot(model.user_factors[user_id]) - mean_ratings[user_id])**2).sum().sqrt()
rmse /= USERS_N
rmse

HBox(children=(IntProgress(value=0, max=42977), HTML(value='')))




MemoryError: 

In [52]:
text = items_df['title'].iloc[1]
text

'История улицы Ирининской в  Гомеле'

In [53]:
model.similar_items(1)

[(1, 0.021383498),
 (192494, 0.021383176),
 (308446, 0.021382984),
 (65413, 0.021382859),
 (114111, 0.0213828),
 (196468, 0.021382585),
 (279468, 0.02138248),
 (113800, 0.021382136),
 (195986, 0.018813154),
 (318089, 0.0146360025)]

In [54]:
items_df['title'].iloc[192494]

'Где можно дешево покушать в центре Праги'

In [55]:
items_df['content'].iloc[308446]

'Бредовский двор - еще одно разрекламированное заведение с  большим количеством отзывов и  посетителей. Находится недалеко от Вацлавской площади. Обстановка стилизованная под старину.   Всегда свежее танковое пиво. Наливают чешскую классику: светлое - Pilsner Urquell, тёмное - Kozel   Заказали рульку. Подача оригинальная  Стоит 329 крон . На вкус - у Фердинанда рулька нам понравилась больше. Здесь показалась более пресная что ли... Свичкова - тушеная говядина  Она идет со сливками и сладковатым брусничным соусом. Не впечатлило. Кнедлики - вареный хлеб - тоже. Оставили лебедям) Стоимость - 209 крон. Драники и чесночный хлеб - на четверочку. Пиво - 42 крон за 0,5  Вывод: еще раз посещать не станем; бывает более вкусная кухня. Возможно - кому-то повезло больше нашего)    '

## Test model

In [56]:
def als_predict(user_id, items):
    return model.item_factors.dot(model.user_factors[user_id])[items]+mean_ratings[user_id]

In [57]:
test.ndcg(als_predict)

100%|██████████| 4349/4349 [00:40<00:00, 108.67it/s]


0.21468566339566103

In [20]:
from sklearn.metrics import log_loss

In [22]:
loss = 0
eps = 1e-06
user_items_data = item_user_data.T
for _, (idd, items, ratings) in tqdm(users_df.iterrows()):
    predictied_ratings = model.item_factors.dot(model.user_factors[idd])[items]+mean_ratings[idd]
#     predictied_ratings = (predictied_ratings - predictied_ratings.min()) / (predictied_ratings.max() - predictied_ratings.min())
    predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
    loss += log_loss(ratings.astype(np.float64), predictied_ratings)
loss /= users_df.shape[0]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [14]:
loss

0.36588847761163773

## Find best params

In [15]:
!export OPENBLAS_NUM_THREADS=1

In [20]:
def train_als(params):
    start_time = time.time()
    model = implicit.als.AlternatingLeastSquares(**params)
    model.fit(item_user_data)
    elapsed_time = time.time() - start_time
    return model, elapsed_time


def test_als(model, users_df, eps=1e-04):
    loss = 0
    for _, (idd, items, ratings) in tqdm(users_df.iterrows()):
        predictied_ratings = model.item_factors.dot(model.user_factors[idd])[items]+mean_ratings[idd]
        predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
        loss += log_loss(ratings.astype(np.float64), predictied_ratings)
    return loss / users_df.shape[0]
    
    
calculate_training_loss = True
iterations = 20
regularization = 0.001
with open(f'{WORKING_DIR}als_params.txt', 'a+', encoding='utf-8') as log_file:
    for factors in tqdm([32, 64, 96, 128]):
        for regularization in [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]:
            params = {
                'factors': factors,
                'regularization': regularization,
                'iterations': iterations,
                'calculate_training_loss': calculate_training_loss,
#                 'use_gpu': False,
            }
            model, elapsed_time = train_als(params)
            loss = test_als(model, users_df)
            params['elapsed_time'] = elapsed_time
            params['loss'] = loss
            print(params, file=log_file, flush=True)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.59s/it, loss=0.00727]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.74s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:37<00:00,  4.75s/it, loss=0.00721]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.36s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.36s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.36s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.35s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.35s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:49<00:00,  5.34s/it, loss=0.00716]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.11s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.14s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.12s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.12s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.09s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [02:04<00:00,  6.11s/it, loss=0.00712]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

## Save model

## Load model