In [1]:
import os
import sys
import re
import implicit
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

sys.path.append("../..")
from prepare_data import get_zen_data

In [2]:
# Absolute root of the project checkout on the author's machine.
HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/'
# Directory holding the ALS artifacts. Built with os.path.join so the trailing
# slash of HOME_DIR does not produce a doubled separator; the empty last
# component keeps the trailing slash that f'{WORKING_DIR}<file>' paths rely on.
WORKING_DIR = os.path.join(HOME_DIR, 'models', 'als', '')
# Make relative paths used later in the notebook resolve inside WORKING_DIR.
os.chdir(WORKING_DIR)

# ALS

## Load data

In [None]:
# Load the dataset through the project helper: an items frame plus a
# (train, test) pair of interaction frames.
# NOTE(review): the frame schemas are defined in prepare_data.get_zen_data,
# which is not visible here; later cells read the 'userId', 'userItems' and
# 'userRatings' columns of train_df.
items_df, (train_df, test_df) = get_zen_data()

## CSR-matrix

In [4]:
# Flatten the per-row lists in train_df into parallel 1-D arrays with one
# entry per (user, item) interaction.
items_per_user = train_df['userItems'].apply(len).values
# Repeat each user id once per item in that row's list; np.repeat does this in
# a single vectorised call instead of building a Python list per user.
user_ids = np.repeat(train_df['userId'].values, items_per_user)
items_ids = np.concatenate(train_df['userItems'].values)
ratings = np.concatenate(train_df['userRatings'].values)
# Rows are indexed by item id and columns by user id; no explicit shape is
# given, so it is inferred from the largest id present on each axis.
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

### Save CSR-matrix

In [5]:
# sparse.save_npz(f'{WORKING_DIR}item_user_data.npz', item_user_data)

In [None]:
# item_user_data = sparse.load_npz(f'{WORKING_DIR}item_user_data.npz')

## Train model

In [6]:
# Sanity check of the matrix dimensions: (item axis, user axis).
item_user_data.shape

(328050, 42977)

In [7]:
# Row count of the training frame and the largest item id found in any row's
# item list.
# NOTE(review): items_n is a maximum id rather than a count — confirm which
# one is intended before using it as a dimension.
users_n = len(train_df)
items_n = train_df['userItems'].map(max).max()
print(f'users_n: {users_n}\titems_n: {items_n}')

users_n: 42977	items_n: 328049


## Test model

In [None]:
from sklearn.metrics import log_loss

## Find best params

In [None]:
def train_als(params):
    """Fit an implicit ALS model on ``item_user_data`` with the given hyper-parameters.

    Args:
        params: Mapping of keyword arguments forwarded to
            ``implicit.als.AlternatingLeastSquares`` (for example ``factors``,
            ``regularization``, ``iterations``, ``calculate_training_loss``).
            Keys that are missing fall back to the defaults below; ``None`` or
            an empty mapping trains with the defaults only.

    Returns:
        The fitted ``AlternatingLeastSquares`` model.
    """
    # These values used to be hard-coded in the constructor call, which made
    # every run of a parameter sweep train the same model. Caller-supplied
    # values now take precedence over them.
    als_kwargs = {
        'factors': 100,
        'regularization': 0.01,
        'iterations': 15,
    }
    als_kwargs.update(params or {})
    model = implicit.als.AlternatingLeastSquares(**als_kwargs)
    # item_user_data is the module-level sparse interaction matrix.
    model.fit(item_user_data)
    return model


def test_als(model, test_df, eps=1e-04):
    loss = 0
    for _, (idd, items, ratings) in tqdm(test_df.iterrows()):
        predictied_ratings = model.item_factors.dot(model.user_factors[idd])[items]
        predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
        loss += log_loss(ratings.astype(np.float64), predictied_ratings)
    loss /= test_df.shape[0]
    
    
# Grid search over ALS hyper-parameters: every (factors, regularization) pair
# is trained and evaluated, and one line per run is appended to the log file.
calculate_training_loss = True
iterations = 20
with open(f'{WORKING_DIR}als_params.txt', 'a+', encoding='utf-8') as log_file:
    for factors in [10, 40, 80, 120, 250, 500]:
        for regularization in [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]:
            params = {
                'factors': factors,
                'regularization': regularization,
                'iterations': iterations,
                'calculate_training_loss': calculate_training_loss,
            }
            model = train_als(params)
            loss = test_als(model, test_df)
            # Flush after every run so finished results reach the file even if
            # a later training run is interrupted.
            print(f'{params}\tloss: {loss}', file=log_file, flush=True)

## Save model

## Load model