In [202]:
import json
from functools import partial
from pathlib import Path

import numpy as np

In [203]:
with Path("goodreads_reviews_young_adult_train.json").open() as f:
    train = [json.loads(line) for line in f]

with Path("goodreads_reviews_young_adult_val.json").open() as f:
    valid = [json.loads(line) for line in f]

with Path("goodreads_reviews_young_adult_test.json").open() as f:
    test = [json.loads(line) for line in f]

### Encode all data

The data comes in a format that is not easily manipulatable in matrix format. As a result we encode all users and items to unique index values starting at 0 and increasing, using a process called one hot encoding.
It is a reasonable assumption that the encodings for all known users will be known at the time of training, and that any unseen customers will be encoded to have an id that is not currently taken by another user, so creating one set of encodings should not impact our results using any unknown information. Since the weights of items and users will not be updated unless a rating is seen and loss gradient calculations will not be affected, the weights calculated will be safe from unknown data despite pre encoding.


In [204]:
def encode_data(
    data: list[dict], encodings: tuple[dict, dict] | None = None
) -> tuple[list[int], list[int], list[int], tuple[dict, dict]]:
    """One hot encode a list of dictionaries containing User ids & Item ids and their corresponding ratings.

    :param data: List of dictionaries containing a 'user_id', 'item_id' and 'rating' attribute
    :param encodings: A tuple of one hot encoded user & item id's to update, defaults to None
    :return: A tuple of updated one hot encoded user & item id's
    """
    users, items, ratings = zip(
        *[(line["user_id"], line["item_id"], line["rating"]) for line in data],
        strict=False,
    )

    unique_users = set(users)
    unique_items = set(items)

    if encodings is None:
        print("Creating encoding dicts")
        # Create one hot encoding keys
        user_encoding = {user: i for i, user in enumerate(unique_users)}
        item_encoding = {item: i for i, item in enumerate(unique_items)}
    else:
        user_encoding, item_encoding = encodings
        new_users = unique_users - set(user_encoding.keys())
        new_items = unique_items - set(item_encoding.keys())
        max_user = max(user_encoding.values())
        max_item = max(item_encoding.values())
        for i, user in enumerate(new_users):
            user_encoding[user] = max_user + i + 1
        for i, item in enumerate(new_items):
            item_encoding[item] = max_item + i + 1

    # Apply one hot encoding
    encoded_users = [user_encoding[user] for user in users]
    encoded_items = [item_encoding[item] for item in items]
    return encoded_users, encoded_items, ratings, (user_encoding, item_encoding)

In [205]:
train_users, train_items, train_ratings, encodings = encode_data(train)
valid_users, valid_items, valid_ratings, encodings = encode_data(valid, encodings)
test_users, test_items, test_ratings, encodings = encode_data(test, encodings)
user_encoding, item_encoding = encodings
reverse_user_encoding = {val: key for key, val in user_encoding.items()}
reverse_item_encoding = {val: key for key, val in item_encoding.items()}

Creating encoding dicts


# Question 1


## 1A


In [206]:
def global_bias(ratings: list[int]) -> float:
    """Return global bias value for the provided dataset.

    :param ratings: List of user ratings
    :return: global bias value
    """
    return sum(ratings) / len(ratings)


global_bias_val = global_bias(train_ratings)
print(f"Global bias value is: {global_bias_val:.6f}")

Global bias value is: 3.763456


We find the global bias for the dataset by finding the average rating across all users and items. We find the average rating across all items is roughly 3.76.


## 1B


In [215]:
def user_biases(
    ratings: list[int], encoded_users: list[int], global_bias_val: float
) -> dict[int, float]:
    """Calculate the bias values observed for a list of users and their corresponding ratings.

    :param ratings: A list of rating values in the same order as the input encoded_users
    :param encoded_users: A list of one hot encoded user id's
    :param global_bias_val: The precalculated global bias
    :return: A dictionary of one hot encoded user ids and their corresponding biases
    """

    user_ratings = {user: [] for user in set(encoded_users)}
    for user, rating in zip(encoded_users, ratings, strict=True):
        user_ratings[user].append(rating)

    return {
        user: (sum(rating_vals) / len(rating_vals)) - global_bias_val
        if rating_vals
        else 0
        for user, rating_vals in user_ratings.items()
    }


train_user_bias = user_biases(train_ratings, train_users, global_bias_val)
example_user = "91ceb82d91493506532feb02ce751ce7"
print(
    f"User bias value for user '{example_user}' is: {train_user_bias[user_encoding[example_user]]:.6f}"
)

User bias value for user '91ceb82d91493506532feb02ce751ce7' is: -0.997498


Similarly to the global bias we calculate the average rating for the bias, however for the user bias we calculate the average for each individual user across all items they rated.


## 1C


In [216]:
def item_biases(
    ratings: list[int], encoded_items: list[int], global_bias_val: float
) -> dict[int, float]:
    """Calculate the bias values observed for a list of items and their corresponding ratings.

    :param ratings: A list of rating values in the same order as the input encoded_items
    :param encoded_items: A list of one hot encoded item id's
    :param global_bias_val: The precalculated global bias
    :return: A dictionary of one hot encoded item ids and their corresponding biases
    """

    item_ratings = {item: [] for item in set(encoded_items)}
    for item, rating in zip(encoded_items, ratings, strict=True):
        item_ratings[item].append(rating)

    return {
        item: (sum(rating_vals) / len(rating_vals)) - global_bias_val
        if rating_vals
        else 0
        for item, rating_vals in item_ratings.items()
    }


train_item_bias = item_biases(train_ratings, train_items, global_bias_val)
example_item = "6931234"
print(
    f"item bias value for item '{example_item}' is: {train_item_bias[item_encoding[example_item]]:.6f}"
)


item bias value for item '6931234' is: -0.247327


Again for item bias we calculate the average rating for each item regardless of the users who rated them.


# Question 2


## 2A


#### Encode all item_ids and user_ids


In [218]:
def loss(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    lambda1: float,
    lambda2: float,
) -> tuple[float, float]:
    """Loss function & RMSE function for stochastic gradient descent algorithm.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on item[i]
    :param q: Latent vector representation of user profiles
    :param p: Latent vector representation of item profiles
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :return: Loss value and RMSE value
    """
    # ---------------------------------------------------------------------
    # BELOW ARE TWO VERSIONS OF THE SAME METHOD TO CALCULATE THE DIFFERENCE
    # SUM, BUT I LEFT THE FIRST HERE COMMENTED SINCE IT IS MUCH EASIER TO
    # READ BUT SLIGHTLY SLOWER
    # ---------------------------------------------------------------------

    # differences = []
    # for encoded_user, encoded_item, rating in zip(
    #     encoded_users, encoded_items, ratings, strict=False
    # ):
    #     r_hat_ij = q[encoded_user, :] @ p[encoded_item, :].T
    #     diff_sq = (rating - r_hat_ij) ** 2
    #     differences.append(diff_sq)
    # difference_sum = sum(differences)

    difference_sum = sum(
        [
            (rating - q[encoded_user, :] @ p[encoded_item, :].T) ** 2
            for encoded_user, encoded_item, rating in zip(
                encoded_users, encoded_items, ratings, strict=False
            )
        ]
    )
    q_norm_sum = sum(
        np.apply_along_axis(lambda x: np.linalg.norm(x) ** 2, axis=1, arr=q)
    )
    p_norm_sum = sum(
        np.apply_along_axis(lambda x: np.linalg.norm(x) ** 2, axis=1, arr=p)
    )
    loss_val = difference_sum + lambda1 * q_norm_sum + lambda2 * p_norm_sum
    rmse = (difference_sum / len(ratings)) ** 0.5
    return loss_val, rmse


def update_factors(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    lambda1: float,
    lambda2: float,
    lr: float,
) -> None:
    """Update the given q & p weights matrices using stochastic gradient descent.

    :param encoded_users: A list of one hot encoded user ids
    :param encoded_items: A list of one hot encoded item ids
    :param ratings: A list of ratings in corresponding order to the encoded user and item lists
    :param q: Weights matrix with (# users x k factors) size
    :param p: Weights matrix with (# items x k factors) size
    :param lambda1: Weighting given to the squared l2 norm of the q matrix
    :param lambda2: Weighting given to the squared l2 norm of the p matrix
    :param lr: Learning rate for weight updates
    """
    for encoded_user, encoded_item, rating in zip(
        encoded_users, encoded_items, ratings, strict=False
    ):
        r_hat_ij = np.dot(q[encoded_user, :].T, p[encoded_item, :])
        diff_deriv = (rating - r_hat_ij) * -2
        if np.isnan(diff_deriv):
            print(r_hat_ij)
        for f in range(q.shape[1]):  # For factor in k
            q[encoded_user, f] -= ((lr * diff_deriv) * p[encoded_item, f]) + (
                lr * 2 * lambda1 * q[encoded_user, f]
            )
            p[encoded_item, f] -= ((lr * diff_deriv) * q[encoded_user, f]) + (
                lr * 2 * lambda2 * p[encoded_item, f]
            )


def SGD(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    lambda1: float,
    lambda2: float,
    q: np.ndarray,
    p: np.ndarray,
    epochs: int = 10,
    lr: float = 0.01,
) -> tuple[np.ndarray, np.ndarray]:
    """Stochastic Gradient Descent Latent Factor Recommender System.

    :param encoded_users: A list of one hot encoded user ids
    :param encoded_items: A list of one hot encoded item ids
    :param ratings: A list of ratings in corresponding order to the encoded user and item lists
    :param lambda1: Weighting given to the squared l2 norm of the q matrix
    :param lambda2: Weighting given to the squared l2 norm of the p matrix
    :param q: Weights matrix with (# users x k factors) size
    :param p: Weights matrix with (# items x k factors) size
    :param epochs: Number of iterations, defaults to 10
    :param lr: Learning rate for weight updates, defaults to 0.01
    :return: Final q & p matrices
    """
    # Many of the variables in the loss function won't change so for
    # Readability we will set them as constant.
    loss_func = partial(
        loss,
        encoded_users=encoded_users,
        encoded_items=encoded_items,
        ratings=ratings,
        lambda1=lambda1,
        lambda2=lambda2,
    )

    update = partial(
        update_factors,
        encoded_users=encoded_users,
        encoded_items=encoded_items,
        ratings=ratings,
        lambda1=lambda1,
        lambda2=lambda2,
        lr=lr,
    )

    for i in range(epochs):
        print(f"Epoch {i+1}:", end=" ")
        update(q=q, p=p)
        loss_val, rmse_val = loss_func(q=q, p=p)
        print(f"Loss: {loss_val:.6f}, RMSE: {rmse_val:.6f}")
    return q, p

In order to make the next two steps easier, I implemented SGD as a standalone function as defined above.


In [224]:
k = 8

unique_users = max(user_encoding.values()) + 1
unique_items = max(item_encoding.values()) + 1

np.random.seed(121017)  # NOQA: NPY002
q = np.random.normal(1, scale=0.1, size=unique_users * k).reshape(unique_users, k)
p = np.random.normal(1, scale=0.1, size=unique_items * k).reshape(unique_items, k)
q, p = SGD(
    encoded_users=train_users,
    encoded_items=train_items,
    ratings=train_ratings,
    lambda1=0.3,
    lambda2=0.3,
    q=q,
    p=p,
    epochs=10,
    lr=0.01,
)

Epoch 1: Loss: 2984864.076879, RMSE: 1.284621
Epoch 2: Loss: 2632810.959652, RMSE: 1.195504
Epoch 3: Loss: 2459863.449682, RMSE: 1.151260
Epoch 4: Loss: 2347784.007002, RMSE: 1.122609
Epoch 5: Loss: 2266290.048069, RMSE: 1.101856
Epoch 6: Loss: 2203292.137349, RMSE: 1.085896
Epoch 7: Loss: 2152741.475028, RMSE: 1.073169
Epoch 8: Loss: 2111159.677850, RMSE: 1.062774
Epoch 9: Loss: 2076348.943256, RMSE: 1.054138
Epoch 10: Loss: 2046823.587465, RMSE: 1.046875


## 2B


In [211]:
lambda1 = lambda2 = 0.3

# Set all unchanging variables as static for future simplicity
validation_loss = partial(
    loss,
    encoded_users=valid_users,
    encoded_items=valid_items,
    ratings=valid_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
)

# Set all unchanging variables as static for future simplicity
test_loss = partial(
    loss,
    encoded_users=test_users,
    encoded_items=test_items,
    ratings=test_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
)

models = []

for k in (4, 8, 16):
    np.random.seed(121017)  # NOQA: NPY002
    q = np.random.normal(1, 0.1, size=unique_users * k).reshape(unique_users, k)  # NOQA: NPY002
    p = np.random.normal(1, 0.1, size=unique_items * k).reshape(unique_items, k)  # NOQA: NPY002
    q, p = SGD(
        encoded_users=train_users,
        encoded_items=train_items,
        ratings=train_ratings,
        lambda1=0.3,
        lambda2=0.3,
        q=q,
        p=p,
        epochs=10,
        lr=0.01,
    )

    # Store all the model weights and the validation RMSE so we can assess the best model
    _, rmse_val = validation_loss(q=q, p=p)
    models.append({"k": k, "q": np.copy(q), "p": np.copy(p), "RMSE": rmse_val})

    print("=" * 40)
    print(f"RMSE with k = {k} factors: {rmse_val:.6f}")
    print("=" * 40)

# Sort the models based on the validation RMSE and take the model with the lowest error
best_model = sorted(models, key=lambda x: x["RMSE"])[0]

_, rmse_test = test_loss(q=best_model["q"], p=best_model["p"])

result_str = f"Best model had k = {best_model['k']} factors, Validation RMSE: {best_model['RMSE']}, Test RMSE: {rmse_test}"
print()
print("*" * len(result_str))
print(result_str)
print("*" * len(result_str))

Epoch 1: Loss: 2093227.624406, RMSE: 1.101478
Epoch 2: Loss: 1998061.856147, RMSE: 1.072471
Epoch 3: Loss: 1945491.030523, RMSE: 1.056290
Epoch 4: Loss: 1910613.962578, RMSE: 1.045501
Epoch 5: Loss: 1885460.403815, RMSE: 1.037697
Epoch 6: Loss: 1866426.755245, RMSE: 1.031781
Epoch 7: Loss: 1851565.837149, RMSE: 1.027158
Epoch 8: Loss: 1839703.175067, RMSE: 1.023465
Epoch 9: Loss: 1830075.425107, RMSE: 1.020466
Epoch 10: Loss: 1822158.879828, RMSE: 1.018000
RMSE with k = 4 factors: 1.152184
Epoch 1: Loss: 2984864.076879, RMSE: 1.284621
Epoch 2: Loss: 2632810.959652, RMSE: 1.195504
Epoch 3: Loss: 2459863.449682, RMSE: 1.151260
Epoch 4: Loss: 2347784.007002, RMSE: 1.122609
Epoch 5: Loss: 2266290.048069, RMSE: 1.101856
Epoch 6: Loss: 2203292.137349, RMSE: 1.085896
Epoch 7: Loss: 2152741.475028, RMSE: 1.073169
Epoch 8: Loss: 2111159.677850, RMSE: 1.062774
Epoch 9: Loss: 2076348.943256, RMSE: 1.054138
Epoch 10: Loss: 2046823.587465, RMSE: 1.046875
RMSE with k = 8 factors: 1.237186
Epoch 1: L

# Task 3


## 3A


In [222]:
def bias_loss(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    b_user: np.ndarray,
    b_item: np.ndarray,
    b_global: float,
    lambda1: float,
    lambda2: float,
    lambda3: float,
    lambda4: float,
) -> tuple[float, float]:
    """Loss function & RMSE function for stochastic gradient descent algorithm with bias.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on item[i]
    :param q: Latent vector representation of user profiles
    :param p: Latent vector representation of item profiles
    :param b_user: Array of user biases s.t. each index i corresponds to one hot encoded user i
    :param b_item: Array of item biases s.t. each index i corresponds to one hot encoded item i
    :param b_global: Global bias for the dataset
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :param lambda3: Weighting given to sum of user bias values
    :param lambda4: Weighting given to sum of item bias values
    :return: tuple containing total loss & RMSE
    """

    # ---------------------------------------------------------------------
    # BELOW ARE TWO VERSIONS OF THE SAME METHOD TO CALCULATE THE DIFFERENCE
    # SUM, BUT I LEFT THE FIRST HERE COMMENTED SINCE IT IS MUCH EASIER TO
    # READ BUT SLIGHTLY SLOWER
    # ---------------------------------------------------------------------

    # differences = []
    # for encoded_user, encoded_item, rating in zip(
    #     encoded_users, encoded_items, ratings, strict=False
    # ):
    #     r_hat_ij = q[encoded_user, :] @ p[encoded_item, :].T
    #     diff_sq = (rating - r_hat_ij - b_global - b_user[encoded_user] - b_item[encoded_item]) ** 2
    #     differences.append(diff_sq)
    # difference_sum = sum(differences)

    difference_sum = sum(
        [
            (
                rating
                - q[encoded_user, :] @ p[encoded_item, :].T
                - b_global
                - b_user[encoded_user]
                - b_item[encoded_item]
            )
            ** 2
            for encoded_user, encoded_item, rating in zip(
                encoded_users, encoded_items, ratings, strict=False
            )
        ]
    )
    q_norm_sum = sum(
        np.apply_along_axis(lambda x: np.linalg.norm(x) ** 2, axis=1, arr=q)
    )
    p_norm_sum = sum(
        np.apply_along_axis(lambda x: np.linalg.norm(x) ** 2, axis=1, arr=p)
    )

    # user_bias_sum = sum(
    #     [bias**2 for i, bias in enumerate(b_user) if i in encoded_users]
    # )
    # item_bias_sum = sum(
    #     [bias**2 for i, bias in enumerate(b_item) if i in encoded_items]
    # )

    # I decided to ignore the user and item bias sums to speed up calculations
    # Since the loss value is just an interesting extra.
    user_bias_sum = item_bias_sum = 0

    loss_val = (
        difference_sum
        + lambda1 * q_norm_sum
        + lambda2 * p_norm_sum
        + lambda3 * user_bias_sum
        + lambda4 * item_bias_sum
    )
    rmse = (difference_sum / len(ratings)) ** 0.5
    return loss_val, rmse


def bias_update_factors(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    b_user: np.ndarray,
    b_item: np.ndarray,
    b_global: float,
    lambda1: float,
    lambda2: float,
    lambda3: float,
    lambda4: float,
    lr: float,
) -> None:
    """Update the given q, p, user bias & item bias weights matrices using stochastic gradient descent.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on item[i]
    :param q: Latent vector representation of user profiles
    :param p: Latent vector representation of item profiles
    :param b_user: Array of user biases s.t. each index i corresponds to one hot encoded user i
    :param b_item: Array of item biases s.t. each index i corresponds to one hot encoded item i
    :param b_global: Global bias for the dataset
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :param lambda3: Weighting given to sum of user bias values
    :param lambda4: Weighting given to sum of item bias values
    :param lr: Learning rate for weight updates
    """
    for encoded_user, encoded_item, rating in zip(
        encoded_users, encoded_items, ratings, strict=False
    ):
        r_hat_ij = (
            (q[encoded_user, :] @ p[encoded_item, :].T)
            + b_global
            + b_user[encoded_user]
            + b_item[encoded_item]
        )
        diff_deriv = (rating - r_hat_ij) * -2
        for f in range(q.shape[1]):  # For factor in k
            q[encoded_user, f] -= lr * (
                diff_deriv * p[encoded_item, f] + 2 * lambda1 * q[encoded_user, f]
            )
            p[encoded_item, f] -= lr * (
                diff_deriv * q[encoded_user, f] + 2 * lambda2 * p[encoded_item, f]
            )

        b_user[encoded_user] -= lr * (diff_deriv + 2 * lambda3 * b_user[encoded_user])
        b_item[encoded_item] -= lr * (diff_deriv + 2 * lambda4 * b_item[encoded_item])


def bias_SGD(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    b_user: np.ndarray,
    b_item: np.ndarray,
    b_global: float,
    lambda1: float,
    lambda2: float,
    lambda3: float,
    lambda4: float,
    q: np.ndarray,
    p: np.ndarray,
    epochs: int = 10,
    lr: float = 0.01,
) -> tuple[np.ndarray, np.ndarray]:
    """Stochastic Gradient Descent Latent Factor Recommender System accounting for biases.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on item[i]
    :param b_user: Array of user biases s.t. each index i corresponds to one hot encoded user i
    :param b_item: Array of item biases s.t. each index i corresponds to one hot encoded item i
    :param b_global: Global bias for the dataset
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :param lambda3: Weighting given to sum of user bias values
    :param lambda4: Weighting given to sum of item bias values
    :param q: Latent vector representation of user profiles
    :param p: Latent vector representation of item profiles
    :param epochs: Number of iterations, defaults to 10
    :param lr: Learning rate for weight updates, defaults to 0.01
    """
    # Many of the variables in the loss function won't change so for
    # Readability we will set them as constant.
    loss_func = partial(
        bias_loss,
        encoded_users=encoded_users,
        encoded_items=encoded_items,
        ratings=ratings,
        lambda1=lambda1,
        lambda2=lambda2,
        lambda3=lambda3,
        lambda4=lambda4,
        b_global=b_global,
    )

    update = partial(
        bias_update_factors,
        encoded_users=encoded_users,
        encoded_items=encoded_items,
        ratings=ratings,
        lambda1=lambda1,
        lambda2=lambda2,
        lambda3=lambda3,
        lambda4=lambda4,
        b_global=b_global,
        lr=lr,
    )

    for i in range(epochs):
        print(f"Epoch {i+1}:", end=" ")
        update(q=q, p=p, b_user=b_user, b_item=b_item)
        loss_val, rmse_val = loss_func(q=q, p=p, b_user=b_user, b_item=b_item)
        print(f"Loss: {loss_val:.6f}, RMSE: {rmse_val:.6f}")
    return q, p

In [223]:
k = 8
lambda1 = lambda2 = lambda3 = lambda4 = 0.3

unique_users = max(user_encoding.values()) + 1
unique_items = max(item_encoding.values()) + 1

np.random.seed(121017)  # NOQA: NPY002
q = np.random.normal(1, 0.1, size=unique_users * k).reshape(unique_users, k)  # NOQA: NPY002
p = np.random.normal(1, 0.1, size=unique_items * k).reshape(unique_items, k)  # NOQA: NPY002
b_global = global_bias_val
b_user = np.zeros(shape=(unique_users,))
b_item = np.zeros(shape=(unique_items,))
for user, bias in train_user_bias.items():
    b_user[user] = bias
for item, bias in train_item_bias.items():
    b_item[item] = bias

q, p = bias_SGD(
    encoded_users=train_users,
    encoded_items=train_items,
    ratings=train_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
    lambda3=lambda3,
    lambda4=lambda4,
    b_user=b_user,
    b_item=b_item,
    b_global=b_global,
    q=q,
    p=p,
    epochs=10,
    lr=0.01,
)

print()
print(
    f"The learned user bias value for user '{example_user}' is: {b_user[user_encoding[example_user]]:.6f}"
)

print(
    f"The learned item bias value for item '{example_item}' is: {b_item[item_encoding[example_item]]:.6f}"
)

Epoch 1: Loss: 2922948.562885, RMSE: 1.279505
Epoch 2: Loss: 2262923.909632, RMSE: 1.096707
Epoch 3: Loss: 2042770.140403, RMSE: 1.032366
Epoch 4: Loss: 1936271.372365, RMSE: 1.002041
Epoch 5: Loss: 1873952.516700, RMSE: 0.985393
Epoch 6: Loss: 1832737.140797, RMSE: 0.975301
Epoch 7: Loss: 1803014.207417, RMSE: 0.968731
Epoch 8: Loss: 1780162.974455, RMSE: 0.964213
Epoch 9: Loss: 1761724.198071, RMSE: 0.960968
Epoch 10: Loss: 1746284.288342, RMSE: 0.958551

The learned user bias value for user '91ceb82d91493506532feb02ce751ce7' is: -0.549056
The learned item bias value for item '6931234' is: -0.157415


## 3B


In [214]:
lambda1 = lambda2 = lambda3 = lambda4 = 0.3

validation_loss = partial(
    bias_loss,
    encoded_users=valid_users,
    encoded_items=valid_items,
    ratings=valid_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
    lambda3=lambda3,
    lambda4=lambda4,
    b_global=b_global,
)

test_loss = partial(
    bias_loss,
    encoded_users=test_users,
    encoded_items=test_items,
    ratings=test_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
    lambda3=lambda3,
    lambda4=lambda4,
    b_global=b_global,
)

models = []

for k in (4, 8, 16):
    np.random.seed(121017)  # NOQA: NPY002
    q = np.random.normal(1, 0.1, size=unique_users * k).reshape(unique_users, k)  # NOQA: NPY002
    p = np.random.normal(1, 0.1, size=unique_items * k).reshape(unique_items, k)  # NOQA: NPY002
    b_global = global_bias_val
    b_user = np.zeros(shape=(unique_users,))
    b_item = np.zeros(shape=(unique_items,))
    for user, bias in train_user_bias.items():
        b_user[user] = bias
    for item, bias in train_item_bias.items():
        b_item[item] = bias
    q, p = bias_SGD(
        encoded_users=train_users,
        encoded_items=train_items,
        ratings=train_ratings,
        lambda1=lambda1,
        lambda2=lambda2,
        lambda3=lambda3,
        lambda4=lambda4,
        b_user=b_user,
        b_item=b_item,
        b_global=b_global,
        q=q,
        p=p,
        epochs=10,
        lr=0.01,
    )

    _, rmse_val = validation_loss(q=q, p=p, b_item=b_item, b_user=b_user)
    models.append(
        {
            "k": k,
            "q": np.copy(q),
            "p": np.copy(p),
            "user_bias": np.copy(b_user),
            "item_bias": np.copy(b_item),
            "RMSE": rmse_val,
        }
    )

    print("=" * 40)
    print(f"RMSE with k = {k} factors: {rmse_val:.6f}")
    print("=" * 40)

best_model = sorted(models, key=lambda x: x["RMSE"])[0]

_, rmse_test = test_loss(
    q=best_model["q"],
    p=best_model["p"],
    b_user=best_model["user_bias"],
    b_item=best_model["item_bias"],
)

result_str = f"Best model had k = {best_model['k']} factors, Validation RMSE: {best_model['RMSE']}, Test RMSE: {rmse_test}"
print()
print("*" * len(result_str))
print(result_str)
print("*" * len(result_str))

Epoch 1: Loss: 2119001.580241, RMSE: 1.125568
Epoch 2: Loss: 1815408.532947, RMSE: 1.033979
Epoch 3: Loss: 1703198.257289, RMSE: 1.000072
Epoch 4: Loss: 1644770.695395, RMSE: 0.983126
Epoch 5: Loss: 1608825.318478, RMSE: 0.973319
Epoch 6: Loss: 1584294.301591, RMSE: 0.967104
Epoch 7: Loss: 1566297.871030, RMSE: 0.962908
Epoch 8: Loss: 1552368.432465, RMSE: 0.959936
Epoch 9: Loss: 1541133.345162, RMSE: 0.957749
Epoch 10: Loss: 1531772.774137, RMSE: 0.956090
RMSE with k = 4 factors: 1.176205
Epoch 1: Loss: 2922948.562885, RMSE: 1.279505
Epoch 2: Loss: 2262923.909632, RMSE: 1.096707
Epoch 3: Loss: 2042770.140403, RMSE: 1.032366
Epoch 4: Loss: 1936271.372365, RMSE: 1.002041
Epoch 5: Loss: 1873952.516700, RMSE: 0.985393
Epoch 6: Loss: 1832737.140797, RMSE: 0.975301
Epoch 7: Loss: 1803014.207417, RMSE: 0.968731
Epoch 8: Loss: 1780162.974455, RMSE: 0.964213
Epoch 9: Loss: 1761724.198071, RMSE: 0.960968
Epoch 10: Loss: 1746284.288342, RMSE: 0.958551
RMSE with k = 8 factors: 1.278609
Epoch 1: L

Similar to the results seen in task 2b, we find the best model is the one with k = 4 factors. We found that the validation RMSE was increasing as the K parameter increased, suggesting that the system is quite simple and is best explained by fewer variables. Because of this the addition of more factors just adds to the size of the q and p matrices, thus increasing the loss. Interestingly, the test RMSE for the bias included model is around 0.025 greater than the one we created in task 2b with the same K parameter. This again indicates the simplicity of the system, being better represented by less parameters.
