# Setting up Colab environment

In [None]:
import os
username = 'Personalization-Technologies-Lab'
repo = 'Sber-RecSys-w2024'

# remove local directory if it already exists
if os.path.isdir(repo):
    !rm -rf {repo}

!git clone https://github.com/{username}/{repo}.git

In [None]:
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [1]:
import numpy as np
from scipy.sparse import coo_matrix, diags, csr_matrix
from scipy.sparse.linalg import norm as spnorm

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

# navigating to cloned repo directory in Colab
%cd {repo}
from dataprep import transform_indices
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items
# restoring original location
%cd -

# Prepraring data

In [2]:
data = get_movielens_data(include_time=True)

In [3]:
training_, holdout_ = leave_one_out(data, target='timestamp', sample_top=True, random_state=0)

In [4]:
training, data_index = transform_indices(training_, 'userid', 'movieid')
holdout = reindex(holdout_, data_index.values(), filter_invalid=True)
holdout = holdout.sort_values('userid')

Filtered 2 invalid observations.


In [5]:
data_description = dict(
    users = data_index['users'].name,
    items = data_index['items'].name,
    feedback = 'rating',
    n_users = len(data_index['users']),
    n_items = len(data_index['items']),
    test_users = holdout[data_index['users'].name].drop_duplicates().values
)
data_description

{'users': 'userid',
 'items': 'movieid',
 'feedback': 'rating',
 'n_users': 6040,
 'n_items': 3704,
 'test_users': array([   0,    1,    2, ..., 6037, 6038, 6039], dtype=int64)}

In [6]:
userid = data_description['users']
seen_data = training.loc[lambda x: x[userid].isin(data_description["test_users"])]

# User-based KNN

In [11]:
def cosine_similarity(matrix):
    row_norm = spnorm(matrix, axis=1).squeeze()
    inv_norm = np.divide(1., row_norm, where=row_norm>0)
    matrix_normed = diags(inv_norm).dot(matrix)
    similarity = matrix_normed.dot(matrix_normed.T)
    similarity.setdiag(0)
    similarity.eliminate_zeros()
    return similarity.tocsr()

In [None]:
def build_naive_uknn_model(config, data, data_description):
    # get indices of observed data
    user_idx = data[data_description['users']].values
    item_idx = data[data_description['items']].values
    feedback = data[data_description['feedback']].values
    # construct rating matrix
    shape = (data_description['n_users'], data_description['n_items'])
    user_item_mtx = coo_matrix((feedback, (user_idx, item_idx)), shape=shape)
    # compute similarity matrix
    user_similarity = cosine_similarity(user_item_mtx)
    return user_item_mtx.tocsr(), user_similarity


def naive_uknn_model_scoring(params, testset, testset_description, weighting_scheme=False):
    user_item_mtx, user_similarity = params
    test_users = testset_description['test_users']
    # compute normalization coefficients and scores
    if weighting_scheme is None: # scheme: KA / KB
        ... # <- your code here
    elif isinstance(weighting_scheme, str):
        if weighting_scheme.startswith('col'): # column-wise scheme: KDA
        ... # <- your code here
        else: # ignore other shemes
            raise ValueError('Unrecognized weighting scheme')
    else:
        raise ValueError('Unrecognized weighting scheme')
    return scores.A # return dense scores array


In [68]:
uknn_params = build_naive_uknn_model({}, training, data_description, elementwise=True)

In [69]:
uknn_scores = naive_uknn_model_scoring(uknn_params, None, data_description, elementwise=True)

 ## Evaluation

In [70]:
downvote_seen_items(uknn_scores, seen_data, data_description)

In [71]:
uknn_recs = topn_recommendations(uknn_scores)

In [None]:
print('HR={:.3}, MRR={:.3}, COV={:.3}'.format(*model_evaluate(uknn_recs, holdout, data_description)))

HR=0.0502, MRR=0.0186, COV=0.0508


# Item-based KNN

In [22]:
def build_naive_iknn_model(config, data, data_description):
    # get indices of observed data
    user_idx = data[data_description['users']].values
    item_idx = data[data_description['items']].values
    relscore = data[data_description['feedback']].values
    # construct rating matrix
    shape = (data_description['n_users'], data_description['n_items'])
    user_item_mtx = coo_matrix((relscore, (user_idx, item_idx)), shape=shape)
    # compute similarity matrix and normalization coefficients
    item_similarity_ = cosine_similarity(user_item_mtx.T)
    item_similarity = truncate_similarity(item_similarity_, config['K'])
    return user_item_mtx.tocsr(), item_similarity


def truncate_similarity(similarity, k):
    similarity = similarity.tocsr()
    inds = similarity.indices
    ptrs = similarity.indptr
    data = similarity.data
    new_ptrs = [0]
    new_inds = []
    new_data = []
    for i in range(len(ptrs)-1):
        start, stop = ptrs[i], ptrs[i+1]
        if start < stop:
            data_chunk = data[start:stop]
            topk = min(len(data_chunk), k)
            idx = np.argpartition(data_chunk, -topk)[-topk:]
            new_data.append(data_chunk[idx])
            new_inds.append(inds[idx+start])
            new_ptrs.append(new_ptrs[-1]+len(idx))
        else:
            new_ptrs.append(new_ptrs[-1])
    new_data = np.concatenate(new_data)
    new_inds = np.concatenate(new_inds)
    truncated = csr_matrix(
        (new_data, new_inds, new_ptrs),
        shape=similarity.shape
    )
    return truncated


def naive_iknn_model_scoring(params, testset, testset_description, weighting_scheme=None):
    user_item_mtx, item_similarity = params
    test_users = testset_description['test_users']
    if weighting_scheme is None: # no weighting
        scores = user_item_mtx[test_users].dot(item_similarity.T)
    elif isinstance(weighting_scheme, str):
        if weighting_scheme.startswith('el'):  # element-wise, ASt / BSt
            scores_unweighted = user_item_mtx[test_users].dot(item_similarity.T)
            weights_data = (user_item_mtx[test_users] != 0).dot(item_similarity.T)
            weights_data.eliminate_zeros()
            weights = weights_data._with_data(1. / weights_data.data)
            scores = scores_unweighted.multiply(weights)
        elif weighting_scheme.startswith('row'):# row-wise, AStD
            weights_data = item_similarity.sum(axis=1).A.squeeze()
            weights = diags(np.divide(1., weights_data, where=weights_data!=0))
            scores = user_item_mtx[test_users].dot(item_similarity.T.dot(weights))
        elif weighting_scheme.startswith('col'): # column-wise, ADSt
            weights_data = item_similarity.sum(axis=1).A.squeeze()
            weights = diags(np.divide(1., weights_data, where=weights_data!=0))
            scores = user_item_mtx[test_users].dot(weights.dot(item_similarity.T))
        else:
            raise ValueError('Unrecognized weighting scheme')
    else:
        raise ValueError('Unrecognized weighting scheme')
    return scores.A


Let's build item-KNN models with additional truncation of similarity weights to sparsify similarity matrix.

In [23]:
iknn_params = build_naive_iknn_model({'K': 100}, training, data_description)

In [13]:
ewn_iknn_scores = naive_iknn_model_scoring(iknn_params, None, data_description, weighting_scheme='element')
rwn_iknn_scores = naive_iknn_model_scoring(iknn_params, None, data_description, weighting_scheme='row')
cwn_iknn_scores = naive_iknn_model_scoring(iknn_params, None, data_description, weighting_scheme='col')

 ## Evaluation

In [14]:
downvote_seen_items(ewn_iknn_scores, seen_data, data_description)
downvote_seen_items(rwn_iknn_scores, seen_data, data_description)
downvote_seen_items(cwn_iknn_scores, seen_data, data_description)

In [15]:
ewn_iknn_recs = topn_recommendations(ewn_iknn_scores)
rwn_iknn_recs = topn_recommendations(rwn_iknn_scores)
cwn_iknn_recs = topn_recommendations(cwn_iknn_scores)

## True elementwise normalization

In [16]:
print('HR={:.3f}, MRR={:.3f}, COV={:.3f}'.format(*model_evaluate(ewn_iknn_recs, holdout, data_description)))

HR=0.001, MRR=0.000, COV=0.994


In [17]:
predicted_rating = ewn_iknn_scores[np.arange(holdout.shape[0]), holdout['movieid'].values]
rmse = np.mean(np.abs(predicted_rating-holdout['rating'].values)**2)
print(f'RMSE={rmse:.3f}')

rmse=1.214


## Row-wise normalization

In [18]:
print('HR={:.3f}, MRR={:.3f}, COV={:.3f}'.format(*model_evaluate(rwn_iknn_recs, holdout, data_description)))

HR=0.066, MRR=0.021, COV=0.568


In [19]:
predicted_rating = rwn_iknn_scores[np.arange(holdout.shape[0]), holdout['movieid'].values]
rmse = np.mean(np.abs(predicted_rating-holdout['rating'].values)**2)
print(f'RMSE={rmse:.3f}')

rmse=9.113


## Column-wise normalization

In [20]:
print('HR={:.3f}, MRR={:.3f}, COV={:.3f}'.format(*model_evaluate(cwn_iknn_recs, holdout, data_description)))

HR=0.077, MRR=0.027, COV=0.437


In [21]:
predicted_rating = cwn_iknn_scores[np.arange(holdout.shape[0]), holdout['movieid'].values]
rmse = np.mean(np.abs(predicted_rating-holdout['rating'].values)**2)
print(f'RMSE={rmse:.3f}')

rmse=9.874


## Without normalization

In [27]:
iknn_scores = naive_iknn_model_scoring(iknn_params, None, data_description, weighting_scheme=None)
downvote_seen_items(iknn_scores, seen_data, data_description)
iknn_recs = topn_recommendations(iknn_scores)

In [28]:
print('HR={:.3f}, MRR={:.3f}, COV={:.3f}'.format(*model_evaluate(iknn_recs, holdout, data_description)))

HR=0.076, MRR=0.027, COV=0.349


In [29]:
predicted_rating = iknn_scores[np.arange(holdout.shape[0]), holdout['movieid'].values]
rmse = np.mean(np.abs(predicted_rating-holdout['rating'].values)**2)
print(f'RMSE={rmse:.3f}')

rmse=1446.419
