In [19]:
import json
from functools import partial
from pathlib import Path

import numpy as np

In [3]:
with Path("goodreads_reviews_young_adult_train.json").open() as f:
    train = [json.loads(line) for line in f]

with Path("goodreads_reviews_young_adult_val.json").open() as f:
    valid = [json.loads(line) for line in f]

with Path("goodreads_reviews_young_adult_test.json").open() as f:
    test = [json.loads(line) for line in f]

# Question 1


## 1A


In [4]:
def global_bias(data: list[dict[str, str | int]]) -> float:
    """Return global bias value for the provided dataset.

    :param data: dataset of user reviews
    :return: global bias value
    """
    return sum(line["rating"] for line in data) / len(data)


print(f"Global bias value is: {global_bias(train):.6f}")

Global bias value is: 3.763456


## 1B


In [5]:
def user_bias(
    data: list[dict[str, str | int]], user_id: str, global_bias_val: float | None = None
) -> float:
    """Return user bias for the specified user_id.

    :param data: dataset of user reviews
    :param user_id: user id of a user within the provided dataset
    :param global_bias_val: global bias value for the provided dataset, defaults to None
    :return: user specific bias value
    """
    if global_bias_val is None:
        global_bias_val = global_bias(data)

    user_ratings = [line["rating"] for line in data if line["user_id"] == user_id]
    if not user_ratings:
        raise KeyError(
            f"There is no user with the ID '{user_id}' in the dataset provided"  # NOQA: EM102
        )

    return sum(user_ratings) / len(user_ratings) - global_bias_val


user = "91ceb82d91493506532feb02ce751ce7"
print(f"User bias value for user '{user}' is: {user_bias(train, user):.6f}")

User bias value for user '91ceb82d91493506532feb02ce751ce7' is: -0.997498


## 1C


In [6]:
def item_bias(
    data: list[dict[str, str | int]], item_id: str, global_bias_val: float | None = None
) -> float:
    """Return item bias for the specified item_id.

    :param data: dataset of user reviews
    :param item_id: item id of a item within the provided dataset
    :param global_bias_val: global bias value for the provided dataset, defaults to None
    :return: item specific bias value
    """
    if global_bias_val is None:
        global_bias_val = global_bias(data)

    item_ratings = [line["rating"] for line in data if line["item_id"] == item_id]
    if not item_ratings:
        raise KeyError(
            f"There is no item with the ID '{item_id}' in the dataset provided"  # NOQA: EM102
        )

    return sum(item_ratings) / len(item_ratings) - global_bias_val


item = "6931234"
print(f"Item bias value for item '{item}' is: {item_bias(train, item):.6f}")

Item bias value for item '6931234' is: -0.247327


# Question 2


## 2A


#### Encode all item_ids and user_ids


In [None]:
def encode_data(data: list[dict]) -> tuple[list[int], list[int]]:
    

##### Train


In [37]:
train_users, train_items, train_ratings = zip(
    *[(line["user_id"], line["item_id"], line["rating"]) for line in train],
    strict=False,
)
train_unique_users = tuple(set(train_users))
train_unique_items = tuple(set(train_items))

# Create one hot encoding keys
user_encoding = {user: i for i, user in enumerate(unique_users)}
item_encoding = {item: i for i, item in enumerate(unique_items)}

# Apply one hot encoding
encoded_users = [user_encoding[user] for user in users]
encoded_items = [item_encoding[item] for item in items]

In [25]:
def loss(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    lambda1: float,
    lambda2: float,
) -> tuple[float, float]:
    """Loss function & RMSE function for stochastic gradient descent algorithm.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on item[i]
    :param q: Latent vector representation of user profiles
    :param p: Latent vector representation of item profiles
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :return: Loss value and RMSE value
    """
    # ---------------------------------------------------------------------
    # BELOW ARE TWO VERSIONS OF THE SAME METHOD TO CALCULATE THE DIFFERENCE
    # SUM, BUT I LEFT THE FIRST HERE COMMENTED SINCE IT IS MUCH EASIER TO
    # READ BUT SLIGHTLY SLOWER
    # ---------------------------------------------------------------------

    # differences = []
    # for encoded_user, encoded_item, rating in zip(
    #     encoded_users, encoded_items, ratings, strict=False
    # ):
    #     r_hat_ij = q[encoded_user, :] @ p[encoded_item, :].T
    #     print(r_hat_ij)
    #     diff_sq = (rating - r_hat_ij) ** 2
    #     differences.append(diff_sq)
    # difference_sum = sum(differences)

    difference_sum = sum(
        [
            (rating - q[encoded_user, :] @ p[encoded_item, :].T) ** 2
            for encoded_user, encoded_item, rating in zip(
                encoded_users, encoded_items, ratings, strict=False
            )
        ]
    )
    q_norm_sum = sum(
        np.apply_along_axis(lambda x: np.linalg.norm(x) ** 2, axis=1, arr=q)
    )
    p_norm_sum = sum(
        np.apply_along_axis(lambda x: np.linalg.norm(x) ** 2, axis=1, arr=p)
    )
    loss_val = difference_sum + lambda1 * q_norm_sum + lambda2 * p_norm_sum
    rmse = (difference_sum / len(ratings)) ** 0.5
    return loss_val, rmse


def update_factors(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    lambda1: float,
    lambda2: float,
    lr: float,
) -> float:
    for encoded_user, encoded_item, rating in zip(
        encoded_users, encoded_items, ratings, strict=False
    ):
        r_hat_ij = q[encoded_user, :] @ p[encoded_item, :].T
        diff_deriv = (rating - r_hat_ij) * -2
        for f in range(q.shape[1]):  # For factor in k
            q[encoded_user, f] -= lr * (
                diff_deriv * p[encoded_item, f] + 2 * lambda1 * q[encoded_user, f]
            )
            p[encoded_item, f] -= lr * (
                diff_deriv * q[encoded_user, f] + 2 * lambda2 * p[encoded_item, f]
            )


def SGD(  # NOQA
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    lambda1: float,
    lambda2: float,
    k: int = 8,
    epochs: int = 10,
    lr: int = 0.01,
) -> tuple[np.ndarray, np.ndarray]:
    # Many of the variables in the loss function won't change so for
    # Readability we will set them as constant.
    loss_func = partial(
        loss,
        encoded_users=encoded_users,
        encoded_items=encoded_items,
        ratings=ratings,
        lambda1=lambda1,
        lambda2=lambda2,
    )

    update = partial(
        update_factors,
        encoded_users=encoded_users,
        encoded_items=encoded_items,
        ratings=ratings,
        lambda1=lambda1,
        lambda2=lambda2,
        lr=lr,
    )
    unique_users = max(encoded_users) + 1
    unique_items = max(encoded_items) + 1

    np.random.seed(121017)  # NOQA: NPY002
    q = np.random.rand(unique_users, k)  # NOQA: NPY002
    p = np.random.rand(unique_items, k)  # NOQA: NPY002

    for _ in range(epochs):
        update(q=q, p=p)
        loss_val, rmse_val = loss_func(q=q, p=p)
        print(f"Loss: {loss_val:.6f}, RMSE: {rmse_val:.6f}")
    return q, p

In [26]:
q, p = SGD(
    encoded_users=encoded_users,
    encoded_items=encoded_items,
    ratings=ratings,
    lambda1=0.3,
    lambda2=0.3,
    k=8,
    epochs=10,
    lr=0.01,
)

Loss: 2327037.907780, RMSE: 1.210368
Loss: 2063437.201417, RMSE: 1.128759
Loss: 1947233.502302, RMSE: 1.090019
Loss: 1880497.811120, RMSE: 1.066684
Loss: 1837136.633370, RMSE: 1.050959
Loss: 1806820.497128, RMSE: 1.039626
Loss: 1784548.632534, RMSE: 1.031077
Loss: 1767577.485113, RMSE: 1.024408
Loss: 1754267.121117, RMSE: 1.019066
Loss: 1743574.250589, RMSE: 1.014693


## 2B


In [27]:
lambda1 = lambda2 = 0.3

val_users, val_items, val_ratings = zip(
    *[(line["user_id"], line["item_id"], line["rating"]) for line in valid],
    strict=False,
)
val_unique_users = tuple(set(val_users))
val_unique_items = tuple(set(val_items))

# Create one hot encoding keys
val_user_encoding = {user: i for i, user in enumerate(val_unique_users)}
val_item_encoding = {item: i for i, item in enumerate(val_unique_items)}

# Apply one hot encoding
val_encoded_users = [val_user_encoding[user] for user in val_users]
val_encoded_items = [val_item_encoding[item] for item in val_items]

validation_loss = partial(
    loss,
    encoded_users=val_encoded_users,
    encoded_items=val_encoded_items,
    ratings=val_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
)

for k in (4, 8, 16):
    q, p = SGD(
        encoded_users=encoded_users,
        encoded_items=encoded_items,
        ratings=ratings,
        lambda1=lambda1,
        lambda2=lambda2,
        k=k,
        epochs=10,
        lr=0.01,
    )

    _, rmse_val = validation_loss(q=q, p=p)

    print("=" * 40)
    print(f"RMSE with k = {k} factors: {rmse_val:.6f}")
    print("=" * 40)

Loss: 2959345.483452, RMSE: 1.401605
Loss: 2397135.970727, RMSE: 1.248147
Loss: 2159832.992520, RMSE: 1.175582
Loss: 2027676.180905, RMSE: 1.132135
Loss: 1943963.289290, RMSE: 1.103059
Loss: 1886811.866674, RMSE: 1.082280


KeyboardInterrupt: 