In [9]:
import random
import numpy as np
import scipy.sparse
from scipy.spatial.distance import cosine as cos_distance

In [2]:
input_filepath = "user_item_matrix_normalized.npz"

M = scipy.sparse.load_npz(input_filepath)
M = M.todense() # Not good but easy to use

In [3]:
def cos_similarity(u, v):
    return 1 - cos_distance(u, v)

def predict(user_id, item_id):
    item_mask = np.arange(M.shape[1]) != item_id
    user_mask = np.arange(M.shape[0]) != user_id
    M_excluded = M[user_mask, :][:, item_mask]
    user_vect = M[user_id, item_mask]
    
    total_similarity = 0
    total_score = 0
    for counter, u in enumerate(M_excluded):
        similarity = cos_similarity(u, user_vect)
        total_score += M[counter, item_id] * similarity
        total_similarity += similarity
    return total_score / total_similarity

In [18]:
# MAE is not a good evaluation technique for this model
# Since the matrix is very sparse and it is unlikely for similar users to watch the same movies,
# the model cannot give strong scores
# The model is good to recommend a movie (max of prediction vector) but it is not very clever to predict
# original scores. That's why MAE is around 0.5.

sample_size = 200

row_idx, col_idx = M.nonzero()
total_error = 0
sample_set = list(zip(row_idx, col_idx))
random.shuffle(sample_set)
sample_set = sample_set[:sample_size]
for row, col in sample_set:
    true_score = M[row, col]
    estimated_score = predict(row, col)
    total_error += abs(true_score - estimated_score)
MAE = total_error / sample_size
print("MAE : %s" % MAE)

MAE : 0.5122371276096263
