In [1]:
import os
import re
import sys
import time
import implicit
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

In [2]:
IS_LOCAL = False
HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/' if IS_LOCAL else '../../'
os.environ['HOME_DIR'] = HOME_DIR
WORKING_DIR = f'{HOME_DIR}/models/als/'
os.chdir(WORKING_DIR)

sys.path.append("../..")
from utils.prepare_data import get_zen_data

# ALS

## Load data

In [3]:
items_df, (train_df, test_df) = get_zen_data()

loading items: 328050it [00:27, 12066.72it/s]
loading users: 42977it [02:43, 262.63it/s]


## CSR-matrix

In [4]:
user_ids = np.concatenate([[v]*v_len for v, v_len in zip(
    train_df['userId'].values, train_df['userItems'].apply(len).values)])
items_ids = np.concatenate(train_df['userItems'].values)
ratings = np.concatenate(train_df['userRatings'].values)
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [10]:
user_ids = np.concatenate([[v]*v_len for v, v_len in zip(
    train_df['userId'].values, train_df['userItems'].apply(len).values)])
items_ids = np.concatenate(train_df['userItems'].values)
ratings = np.concatenate(np.array(list(map(lambda x: x - x.mean(), train_df['userRatings'].values))))
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [17]:
mean_ratings = np.array(list(map(np.mean, train_df['userRatings'].values)))

### Save matrix

In [11]:
# sparse.save_npz(f'{WORKING_DIR}item_user_data.npz', item_user_data)

In [12]:
# item_user_data = scipy.sparse.load_npz(f'{WORKING_DIR}item_user_data.npz')

## Train model

In [13]:
item_user_data.shape

(328050, 42977)

In [14]:
users_n = train_df.shape[0]
items_n = train_df['userItems'].apply(max).max()
print(f'users_n: {users_n}\titems_n: {items_n}')

users_n: 42977	items_n: 328049


In [15]:
model = implicit.als.AlternatingLeastSquares(
    factors=32,
    regularization=0.001,
    iterations=20,
    calculate_training_loss=True
)
model.fit(item_user_data)

100%|██████████| 20.0/20 [01:13<00:00,  3.58s/it, loss=0.00727]


## Test model

In [19]:
from sklearn.metrics import log_loss

In [23]:
loss = 0
eps = 1e-04
user_items_data = item_user_data.T
for _, (idd, items, ratings) in tqdm(test_df.iterrows()):
    predictied_ratings = model.item_factors.dot(model.user_factors[idd])[items]+mean_ratings[idd]
#     predictied_ratings = (predictied_ratings - predictied_ratings.min()) / (predictied_ratings.max() - predictied_ratings.min())
    predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
    loss += log_loss(ratings.astype(np.float64), predictied_ratings)
loss /= test_df.shape[0]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [21]:
loss

0.36633216712295347

## Find best params

In [10]:
!export OPENBLAS_NUM_THREADS=1

In [11]:
def train_als(params):
    start_time = time.time()
    model = implicit.als.AlternatingLeastSquares(**params)
    model.fit(item_user_data)
    elapsed_time = time.time() - start_time
    return model, elapsed_time


def test_als(model, test_df, eps=1e-04):
    loss = 0
    for _, (idd, items, ratings) in tqdm(test_df.iterrows()):
        predictied_ratings = model.item_factors.dot(model.user_factors[idd])[items]
        predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
        loss += log_loss(ratings.astype(np.float64), predictied_ratings)
    return loss / test_df.shape[0]
    
    
calculate_training_loss = True
iterations = 20
regularization = 0.001
with open(f'{WORKING_DIR}als_params.txt', 'a+', encoding='utf-8') as log_file:
    for factors in tqdm([32, 96, 192, 288, 384]):
#         for regularization in [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]:
        params = {
            'factors': factors,
            'regularization': regularization,
            'iterations': iterations,
            'calculate_training_loss': calculate_training_loss,
        }
        model, elapsed_time = train_als(params)
        loss = test_als(model, test_df)
        params['elapsed_time'] = elapsed_time
        params['loss'] = loss
        print(params, file=log_file, flush=True)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

100%|██████████| 20.0/20 [01:13<00:00,  3.57s/it, loss=0.00409]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [01:51<00:00,  5.45s/it, loss=0.00393]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [03:21<00:00,  9.95s/it, loss=0.00377]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [05:08<00:00, 15.31s/it, loss=0.00364]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

100%|██████████| 20.0/20 [07:09<00:00, 21.41s/it, loss=0.00352]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Save model

## Load model