# Setting up Colab environment

In [None]:
import os
# GitHub account and repository that this cell clones; the notebook later
# imports the `dataprep` and `evaluation` modules from inside that directory
username = 'recspert'
repo = 'Recommender-Systems-Intro-Sber-2023'

# remove local directory if it already exists,
# so that the clone below starts from a clean directory on every re-run
if os.path.isdir(repo):
    !rm -rf {repo}

# fetch a fresh copy of the repository into the current working directory
!git clone https://github.com/{username}/{repo}.git

In [None]:
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [2]:
import numpy as np
from scipy.sparse import diags, csr_matrix
from scipy.sparse.linalg import norm as spnorm
from scipy.sparse.linalg import svds

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

# navigating to cloned repo directory in Colab
%cd {repo} 
from dataprep import transform_indices
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items
# restoring original location
%cd - 

# Experiment description

You'll split data into 3 parts - training, validation and test.
- You will first use the training and validation parts to tune your models and find the optimal configuration.
- Once a set of optimal hyper-parameters is found, you'll need to recompute your models with it on the joint training+validation dataset and report final quality on the test data.

For the test data, you simply hold out the last item of each user; the remaining part forms the joint training+validation data. You then split that remaining part once more in the same way to obtain the training and validation data used for tuning.

So the scheme is as follows:
1. Tune on the training and evaluate on the validation data. Find optimal config.
2. Retrain once on the training+validation data with the optimal config. Report final quality using the test (holdout) data.

# Preparing data

In [3]:
# Load the MovieLens ratings through polara. `include_time=True` presumably adds the
# `timestamp` column that the time-based splits below use as their target — confirm
# against the polara documentation.
data = get_movielens_data(include_time=True)

## data splits

In [4]:
# one seeded random state object is shared by both splits below
# NOTE(review): if leave_one_out draws from it, the order of the two calls affects the result — confirm
rst = np.random.RandomState(111)
# per the experiment description, each split holds out the last item of every user;
# presumably `sample_top=True` on `timestamp` selects the most recent interaction — confirm
sampling_args = dict(target='timestamp', sample_top=True, random_state=rst)
# final test data
training_validation_, holdout_ = leave_one_out(data, **sampling_args)
# validation data: the same procedure applied to what is left after the test split
training_, validation_ = leave_one_out(training_validation_, **sampling_args)

## reindexing

In [5]:
# build the user/item index from the training part only
training, data_index = transform_indices(training_, 'userid', 'movieid')

# express the validation data in terms of the training index
# (observations that cannot be mapped are filtered out), ordered by user
validation = reindex(
    validation_, data_index.values(), filter_invalid=True
).sort_values('userid')

# same treatment for the final test data
holdout = reindex(
    holdout_, data_index.values(), filter_invalid=True
).sort_values('userid')

Filtered 3 invalid observations.


In [6]:
# everything the models and the evaluation code need to know about the data layout
data_description = {
    # column names, taken from the index built on the training data
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'feedback': 'rating',
    # sizes of the user and item indices
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
    # distinct users present in the validation split
    'test_users': validation[data_index['users'].name].drop_duplicates().values,
}
data_description

{'users': 'userid',
 'items': 'movieid',
 'feedback': 'rating',
 'n_users': 6040,
 'n_items': 3703,
 'test_users': array([   0,    1,    2, ..., 6037, 6038, 6039], dtype=int64)}

# PureSVD

In [9]:
def matrix_from_observations(data, data_description):
    """Assemble a sparse user-item matrix from a table of observations.

    Parameters
    ----------
    data : mapping of column name -> sequence (e.g. a pandas DataFrame)
        Must provide the columns named by `data_description['users']`,
        `data_description['items']` and `data_description['feedback']`.
        User and item columns are used directly as row and column indices.
    data_description : dict
        Column names under the keys 'users', 'items', 'feedback'. If both
        'n_users' and 'n_items' are present, they fix the matrix shape.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of dtype float64 with feedback values at (user, item) positions.

    Raises
    ------
    ValueError
        Raised by scipy if an index in `data` does not fit the declared shape.
    """
    useridx = data[data_description['users']]
    itemidx = data[data_description['items']]
    values = data[data_description['feedback']]
    # Without an explicit shape scipy infers it from the largest indices present
    # in `data`. A subset of observations would then yield a smaller matrix that
    # can neither be indexed by arbitrary test users nor multiplied with item
    # factors computed on the full item set.
    n_users = data_description.get('n_users')
    n_items = data_description.get('n_items')
    if n_users is None or n_items is None:
        shape = None  # fall back to inferring the shape from the data
    else:
        shape = (n_users, n_items)
    return csr_matrix((values, (useridx, itemidx)), shape=shape, dtype='f8')

def build_svd_model(config, data, data_description):
    """Compute a PureSVD model: a truncated SVD of the user-item matrix.

    Parameters
    ----------
    config : dict
        Model configuration; `config['rank']` is the number of singular
        triplets to compute.
    data : observations table accepted by `matrix_from_observations`.
    data_description : dict
        Description of the data layout, passed on to `matrix_from_observations`.

    Returns
    -------
    item_factors : numpy.ndarray, shape (n_items, rank)
        Right singular vectors as columns, ordered by descending singular value,
        so a lower-rank model is obtained by keeping the leading columns.
    singular_values : numpy.ndarray, shape (rank,)
        Singular values in descending order.
    """
    # see PureSVD_analysis.ipynb for the background of this model
    source_matrix = matrix_from_observations(data, data_description)
    # left singular vectors are not needed: scoring only uses the item space
    _, singular_values, vt = svds(source_matrix, k=config['rank'])
    # do not rely on the order in which svds reports the components;
    # sort explicitly so that the leading components come first
    order = np.argsort(singular_values)[::-1]
    singular_values = singular_values[order]
    item_factors = np.ascontiguousarray(vt[order, :].T)
    return item_factors, singular_values

def svd_model_scoring(params, data, data_description):
    """Score all items for the test users with a PureSVD model.

    The rows of the test users are projected onto the item factors and mapped
    back to the item space, i.e. `scores = A_test @ V @ V.T`.

    Parameters
    ----------
    params : tuple
        `(item_factors, singular_values)`; only the item factors are used here.
    data : observations of the test users, accepted by `matrix_from_observations`.
    data_description : dict
        Data layout; `data_description['test_users']` selects the matrix rows to score.

    Returns
    -------
    Array of scores with one row per test user and one column per item.
    """
    interactions = matrix_from_observations(data, data_description)
    item_factors, _ = params  # singular values play no role in the scores
    user_rows = interactions[data_description['test_users']]
    latent_profiles = user_rows.dot(item_factors)
    return latent_profiles @ item_factors.T

In [19]:
# model configuration: number of latent components
svd_config = {'rank': 40}
userid = data_description['users']

# training interactions of the test users; used both to compute their scores
# and as the record of items they have already seen
from_test_users = training[userid].isin(data_description['test_users'])
seen_data = training.loc[from_test_users]

# fit on the whole training set, score the test users only
svd_params = build_svd_model(svd_config, training, data_description)
svd_scores = svd_model_scoring(svd_params, seen_data, data_description)

In [20]:
# Exclude items the test users have already interacted with from being recommended.
# The return value is not used, so `svd_scores` is presumably modified in place —
# confirm against `evaluation.downvote_seen_items`.
downvote_seen_items(svd_scores, seen_data, data_description)

In [21]:
# select 10 recommendations per test user from the score matrix
svd_recs = topn_recommendations(svd_scores, topn=10)
# compare the recommendations with the validation holdout;
# presumably returns (HR, MRR, coverage), the metrics named in the task — confirm in evaluation.py
model_evaluate(svd_recs, validation, data_description)

(0.09221854304635761, 0.03232668453694944, 0.25492843640291657)

# Scaled SVD


- Implement data normalization with scaling factor that reduces the effects of item popularity.
- Perform comparison of the two models, which consists of:
  - tuning both models using validation data with the same range of rank values (use HR as target metric),
  - recomputing the models with optimal configuration on the joint train+validation data,
  - reporting final quality on the holdout data using all 3 metrics: HR, MRR, Coverage.

Note that for each fixed scaling factor in Scaled SVD you don't need to recompute the model. You can still use simple rank truncation for faster tuning.  
Also don't forget to update test users in `data_description` before the final evaluation.