# Preamble

Various global variables, parameters

In [None]:
# When True, later cells print verbose diagnostics (matrix shapes/contents and
# per-sample predictions and losses during training).
DEBUG = False
# Upper bound on the number of triplets loaded from the dataset file; parsing
# stops as soon as this many rows have been collected.
DATASET_MAX_SIZE = 100_000

# Data preparation

This work uses the Million Song Dataset, specifically its Taste Profile subset, which contains per-user listening counts.
It's available at http://millionsongdataset.com/tasteprofile/.
> Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, & Paul Lamere (2011). The Million Song Dataset. In Proceedings of the 12th International Conference on Music Information Retrieval (ISMIR 2011).

It consists of a huge list (48M+ entries) of triplets `(user id, song id, listening count)`.

## Dataset parsing

Read the text file, parse each line, map the string user and song ids to consecutive integer indices, and convert the result to a NumPy array.

In [None]:
from typing import Literal
import numpy as np

# Map the dataset's string identifiers to dense, zero-based integer indices
# (assigned in order of first appearance) so they can address matrix rows.
USER_MAPPING: dict[str, int] = {}
SONG_MAPPING: dict[str, int] = {}

# One (user index, song index, listening count) tuple per parsed line.
# Plain tuples avoid allocating a throw-away ndarray for every row.
dataset_raw: list[tuple[int, int, int]] = []
with open("../../train_triplets.txt", "r", encoding="utf-8") as dataset_file:
    for line in dataset_file:
        # Check the cap before parsing so at most DATASET_MAX_SIZE rows are kept
        # (including the degenerate DATASET_MAX_SIZE = 0 case).
        if len(dataset_raw) >= DATASET_MAX_SIZE:
            break

        if not line.strip():
            continue  # Tolerate blank lines, e.g. a trailing empty line

        user_id, song_id, listenings = line.split("\t")
        dataset_raw.append(
            (
                USER_MAPPING.setdefault(user_id, len(USER_MAPPING)),
                SONG_MAPPING.setdefault(song_id, len(SONG_MAPPING)),
                int(listenings),  # int() ignores the trailing newline
            )
        )

# Shape (n_rows, 3); the reshape keeps it 2-D even when no row was read.
# Stored as float64 because the count column is standardized afterwards, so the
# two index columns must be cast back to int before being used as indices.
dataset = np.array(dataset_raw, dtype=np.float64).reshape(-1, 3)

## Dataset training/validation preparation

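Standardize the listening counts (zero mean, unit variance), then shuffle and split into subsets.

We take 66% (about 2/3) for training and the remaining 34% (about 1/3) for validation.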

In [None]:
# Standardize the listening counts (column 2) in place: zero mean, unit variance.
col_means = np.mean(dataset[:, 2])
col_std = np.std(dataset[:, 2])
if DEBUG:
    print(f"Listenings before scaling: mean={col_means}, std={col_std}")
    print(dataset[:, 2])

# A constant column has zero spread: only center it in that case instead of
# dividing by zero and silently filling the column with NaN.
col_scale = col_std if col_std > 0 else 1.0
# Broadcasting subtracts the scalar mean from every row; no helper array needed.
dataset[:, 2] = (dataset[:, 2] - col_means) / col_scale
if DEBUG:
    print(dataset.dtype)
    print(dataset[:, 2])


# Shuffle the rows, then cut the shuffled array into training / validation parts.
dataset_perm = np.random.permutation(len(dataset))
dataset_shuffled = dataset[dataset_perm]
training_set_size = int(len(dataset_shuffled) * 0.66)
training_set = dataset_shuffled[:training_set_size]
validation_set = dataset_shuffled[training_set_size:]

# Learning

The method used is Stochastic Gradient Descent (SGD), with the L2-regularized squared error as the loss function.
It corresponds to
$$
\min_{q^*,p^*} \sum_{(u,i) \in \mathcal{K}} \left(r_{ui} - q_i^Tp_u\right)^2 + \lambda\left(||q_i||^2 + ||p_u||^2\right)
$$

The overall method is taken from [Matrix Factorization Techniques for Recommender Systems
](https://ieeexplore.ieee.org/document/5197422).
> Y. Koren, R. Bell and C. Volinsky, "Matrix Factorization Techniques for Recommender Systems," in Computer, vol. 42, no. 8, pp. 30-37, Aug. 2009, doi: 10.1109/MC.2009.263. keywords: {Recommender systems;Motion pictures;Filtering;Collaboration;Sea measurements;Predictive models;Genomics;Bioinformatics;Nearest neighbor searches;Computational intelligence;Netflix Prize;Matrix factorization},



## Prepare learning

Prepare learning sets $q$ and $p$, which are random matrices of shapes $(|\mathrm{songs}|, l)$ and $(|\mathrm{users}|, l)$.

Set parameters:
- Size `l` of the latent space in which users & songs are embedded
- Learning rate $\gamma$
- Regularization $\lambda$
- Number of epochs (rounds)

In [None]:
# (Hyperparameter) Size of latent space to make the embeddings
l = 1000

# Initial (random) values.
# Entries are drawn uniformly from [0, 1/sqrt(l)). With unscaled [0, 1) entries
# the initial dot products would be about l/4 (250 for l = 1000) and the squared
# row norms about l/3, so gamma * ||p_u||^2 would far exceed 1 and each SGD step
# would overshoot. Scaling by 1/sqrt(l) gives initial predictions around 0.25,
# the same order of magnitude as the standardized targets.
init_scale = 1.0 / np.sqrt(l)

# Song factors, shape: (#SONGS, l)
q = np.random.random_sample((len(SONG_MAPPING), l)) * init_scale
if DEBUG:
    print(q.shape, q.dtype, q)
# User factors, shape: (#USERS, l)
p = np.random.random_sample((len(USER_MAPPING), l)) * init_scale
if DEBUG:
    print(p.shape, p.dtype, p)


# Training parameters
lbd = 0.01  # Regularization strength (lambda)
gamma = 0.01  # Learning rate
n_epochs = 20  # Number of passes over the training set

## Actual learning

Run the SGD, recording the mean training and validation loss of each epoch so they can be analyzed afterwards.

In [None]:
# Mean per-sample loss of every epoch (NaN until the epoch has been run).
losses = [np.nan] * n_epochs
losses_validation = [np.nan] * n_epochs

for epoch in range(n_epochs):
    print(f"Epoch {epoch+1}")
    loss_sum: float = 0

    np.random.shuffle(training_set)  # Reorder each epoch
    # user \in [0, #USERS - 1]
    # song \in [0, #SONGS - 1]
    # listenings: standardized listening count (r_ui, "true" value)
    for i, (user, song, listenings) in enumerate(training_set):
        # Rows are float64, but NumPy only accepts integers as indices.
        user, song = int(user), int(song)
        if DEBUG:
            print(
                f"Training value {i}/{len(training_set)}: ({user},{song},{listenings})"
            )

        # Snapshot both factor vectors: each update below must use the values
        # from before this step, not the freshly updated counterpart.
        p_u = p[user].copy()
        q_i = q[song].copy()
        if DEBUG:
            print(p_u)
            print(q_i)

        # Predicted value
        listenings_hat = p_u @ q_i
        if DEBUG:
            print(f"Prediction: {listenings_hat}")

        # Prediction error
        e_ui = listenings - listenings_hat

        # This is the learning part: one gradient step on both factor vectors
        q[song] += gamma * (e_ui * p_u - lbd * q_i)
        p[user] += gamma * (e_ui * q_i - lbd * p_u)

        # Loss (evaluated with the pre-update factors)
        loss = e_ui**2 + lbd * (np.linalg.norm(q_i) ** 2 + np.linalg.norm(p_u) ** 2)
        if DEBUG:
            print(f"Loss: {loss}")
        loss_sum += loss

    losses[epoch] = loss_sum / len(training_set)

    # Now evaluating on validation data (no parameter update here)
    loss_validation_sum: float = 0
    for user, song, listenings in validation_set:
        user, song = int(user), int(song)
        p_u = p[user]
        q_i = q[song]

        e_ui = listenings - p_u @ q_i

        # Loss
        loss_validation_sum += e_ui**2 + lbd * (
            np.linalg.norm(q_i) ** 2 + np.linalg.norm(p_u) ** 2
        )

    # Average over the validation samples so both curves are per-sample means.
    losses_validation[epoch] = loss_validation_sum / len(validation_set)

    print(f"Loss: {losses[epoch]}, validation loss: {losses_validation[epoch]}")

# Analysis

## Learning results

We first analyze the learning raw results: training and validation losses.

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, on a log scale.
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(losses, label="Train loss")
ax.plot(losses_validation, label="Validation loss")
ax.set_yscale("log")
ax.set_xlabel("epoch")
ax.set_ylabel("mean loss (log scale)")
ax.set_title("Losses during learning")
fig.legend()
# plt.show() works with every backend; Figure.show() needs a GUI event loop
# and may warn or display nothing under a non-GUI notebook backend.
plt.show()

## ... more analysis

Evaluation of the model?