In [1]:
import json
from functools import partial
from pathlib import Path

import numpy as np

In [2]:
def _load_json_lines(path: str | Path) -> list[dict]:
    """Read a JSON-lines file into a list of decoded records.

    :param path: location of a file holding one JSON document per line
    :return: decoded records, in file order
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding; blank lines are skipped instead of failing to parse.
    with Path(path).open(encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


train = _load_json_lines("goodreads_reviews_young_adult_train.json")
valid = _load_json_lines("goodreads_reviews_young_adult_val.json")
test = _load_json_lines("goodreads_reviews_young_adult_test.json")

# Question 1


## 1A


In [3]:
def global_bias(data: list[dict[str, str | int]]) -> float:
    """Return global bias value for the provided dataset.

    :param data: dataset of user reviews
    :return: global bias value
    """
    return sum(line["rating"] for line in data) / len(data)


# Report the mean rating of the training split, i.e. the global bias.
print(f"Global bias value is: {global_bias(train):.6f}")

Global bias value is: 3.763456


## 1B


In [4]:
def user_bias(
    data: list[dict[str, str | int]], user_id: str, global_bias_val: float | None = None
) -> float:
    """Return user bias for the specified user_id.

    :param data: dataset of user reviews
    :param user_id: user id of a user within the provided dataset
    :param global_bias_val: global bias value for the provided dataset, defaults to None
    :return: user specific bias value
    """
    if global_bias_val is None:
        global_bias_val = global_bias(data)

    user_ratings = [line["rating"] for line in data if line["user_id"] == user_id]
    if not user_ratings:
        raise KeyError(
            f"There is no user with the ID '{user_id}' in the dataset provided"  # NOQA: EM102
        )

    return sum(user_ratings) / len(user_ratings) - global_bias_val


# User id whose bias is reported; the global bias is derived from `train`
# inside user_bias because no precomputed value is passed.
user = "91ceb82d91493506532feb02ce751ce7"
print(f"User bias value for user '{user}' is: {user_bias(train, user):.6f}")

User bias value for user '91ceb82d91493506532feb02ce751ce7' is: -0.997498


## 1C


In [5]:
def item_bias(
    data: list[dict[str, str | int]], item_id: str, global_bias_val: float | None = None
) -> float:
    """Return item bias for the specified item_id.

    :param data: dataset of user reviews
    :param item_id: item id of a item within the provided dataset
    :param global_bias_val: global bias value for the provided dataset, defaults to None
    :return: item specific bias value
    """
    if global_bias_val is None:
        global_bias_val = global_bias(data)

    item_ratings = [line["rating"] for line in data if line["item_id"] == item_id]
    if not item_ratings:
        raise KeyError(
            f"There is no item with the ID '{item_id}' in the dataset provided"  # NOQA: EM102
        )

    return sum(item_ratings) / len(item_ratings) - global_bias_val


# Item id whose bias is reported; the global bias is derived from `train`
# inside item_bias because no precomputed value is passed.
item = "6931234"
print(f"Item bias value for item '{item}' is: {item_bias(train, item):.6f}")

Item bias value for item '6931234' is: -0.247327


# Question 2


## 2A


#### Encode all item_ids and user_ids


In [18]:
def encode_data(data: list[dict], encodings:tuple[dict,dict] | None = None) -> tuple[list[int], list[int], list[int], tuple[dict, dict]]:
    users, items, ratings = zip(
        *[(line["user_id"], line["item_id"], line["rating"]) for line in data],
        strict=False,
    )

    unique_users = set(users)
    unique_items = set(items)

    if encodings is None:
        
        # Create one hot encoding keys
        user_encoding = {user: i for i, user in enumerate(unique_users)}
        item_encoding = {item: i for i, item in enumerate(unique_items)}
    else:
        user_encoding, item_encoding = encodings
        new_users = unique_users - set(user_encoding.keys())
        new_items = unique_items - set(item_encoding.keys())
        max_user = max(user_encoding.values())
        max_item = max(item_encoding.values())
        for i, user in enumerate(new_users):
            user_encoding[user] = max_user + i + 1
        for i, item in enumerate(new_items):
            item_encoding[item] = max_item + i + 1

    # Apply one hot encoding
    encoded_users = [user_encoding[user] for user in users]
    encoded_items = [item_encoding[item] for item in items]
    return encoded_users, encoded_items, ratings, (user_encoding, item_encoding)

In [21]:
# Encode the training split first, then pass the resulting encodings on so
# that ids first seen in validation/test are appended after the training ids.
train_users, train_items, train_ratings, encodings = encode_data(train)
valid_users, valid_items, valid_ratings, encodings = encode_data(valid, encodings)
test_users, test_items, test_ratings, encodings = encode_data(test, encodings)

# Final mappings cover every id seen across the three splits.
user_encoding, item_encoding = encodings

In [23]:
def loss(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    lambda1: float,
    lambda2: float,
) -> tuple[float, float]:
    """Loss function & RMSE function for stochastic gradient descent algorithm.

    The loss is the sum of squared rating errors plus l2 penalties on every
    row of ``q`` and ``p``; the RMSE is computed from the squared errors only.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on
        item[i]; must have the same length as the two index lists
    :param q: Latent vector representation of user profiles
    :param p: Latent vector representation of item profiles
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :raises ZeroDivisionError: if ``ratings`` is empty
    :return: Loss value and RMSE value
    """
    # Explicit integer dtype so empty inputs still produce valid index arrays.
    user_idx = np.asarray(encoded_users, dtype=np.intp)
    item_idx = np.asarray(encoded_items, dtype=np.intp)
    observed = np.asarray(ratings, dtype=float)

    # Row-wise dot products q[u] . p[i] for every observed (user, item) pair.
    predicted = np.einsum("ij,ij->i", q[user_idx], p[item_idx])
    difference_sum = float(np.sum((observed - predicted) ** 2))

    # Sum of squared row-wise l2 norms equals the sum of all squared entries.
    q_norm_sum = float(np.sum(q**2))
    p_norm_sum = float(np.sum(p**2))

    loss_val = difference_sum + lambda1 * q_norm_sum + lambda2 * p_norm_sum
    rmse = (difference_sum / len(ratings)) ** 0.5
    return loss_val, rmse


def update_factors(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    lambda1: float,
    lambda2: float,
    lr: float,
) -> float:
    for encoded_user, encoded_item, rating in zip(
        encoded_users, encoded_items, ratings, strict=False
    ):
        r_hat_ij = q[encoded_user, :] @ p[encoded_item, :].T
        diff_deriv = (rating - r_hat_ij) * -2
        for f in range(q.shape[1]):  # For factor in k
            q[encoded_user, f] -= lr * (
                diff_deriv * p[encoded_item, f] + 2 * lambda1 * q[encoded_user, f]
            )
            p[encoded_item, f] -= lr * (
                diff_deriv * q[encoded_user, f] + 2 * lambda2 * p[encoded_item, f]
            )


def SGD(  # NOQA
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    lambda1: float,
    lambda2: float,
    q: np.ndarray,
    p: np.ndarray,
    epochs: int = 10,
    lr: float = 0.01,
    verbose: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """Train the latent factor model with stochastic gradient descent.

    Each epoch performs one full pass of ``update_factors`` over the ratings.
    ``q`` and ``p`` are updated in place and also returned.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on item[i]
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :param q: Initial latent vector representation of user profiles
    :param p: Initial latent vector representation of item profiles
    :param epochs: Number of passes over the ratings, defaults to 10
    :param lr: Learning rate (step size), defaults to 0.01
    :param verbose: When True (the default) the loss and RMSE on the supplied
        ratings are computed and printed after every epoch; when False that
        extra evaluation pass is skipped
    :return: The trained ``q`` and ``p`` (the same array objects passed in)
    """
    # Arguments shared verbatim by the update step and the loss evaluation.
    shared = {
        "encoded_users": encoded_users,
        "encoded_items": encoded_items,
        "ratings": ratings,
        "q": q,
        "p": p,
        "lambda1": lambda1,
        "lambda2": lambda2,
    }

    for _ in range(epochs):
        update_factors(lr=lr, **shared)
        if verbose:
            loss_val, rmse_val = loss(**shared)
            print(f"Loss: {loss_val:.6f}, RMSE: {rmse_val:.6f}")
    return q, p

In [25]:
# Number of latent factors per user / item.
k = 8

# The encodings are zero-based and contiguous, so max index + 1 is the number
# of rows needed. They cover ids from the train, validation and test splits.
unique_users = max(user_encoding.values()) + 1
unique_items = max(item_encoding.values()) + 1

# Seed the legacy global RNG so the initial factors are repeatable; q is drawn
# before p, so the order of these two lines affects the values.
np.random.seed(121017)  # NOQA: NPY002
q = np.random.normal(size=unique_users*k).reshape(unique_users, k)  # NOQA: NPY002
p = np.random.normal(size=unique_items*k).reshape(unique_items, k)  # NOQA: NPY002

# Train on the training split only; SGD prints loss and RMSE after each epoch.
q, p = SGD(
    encoded_users=train_users,
    encoded_items=train_items,
    ratings=train_ratings,
    lambda1=0.3,
    lambda2=0.3,
    q=q,
    p=p,
    epochs=10,
    lr=0.01,
)

Loss: 2360828.150837, RMSE: 1.208986
Loss: 2098238.810055, RMSE: 1.127593
Loss: 1982786.722295, RMSE: 1.089051
Loss: 1916613.562958, RMSE: 1.065878
Loss: 1873682.752792, RMSE: 1.050282
Loss: 1843707.680242, RMSE: 1.039054
Loss: 1821714.382455, RMSE: 1.030593
Loss: 1804976.122554, RMSE: 1.023999
Loss: 1791864.203873, RMSE: 1.018722
Loss: 1781343.350403, RMSE: 1.014405


## 2B


In [31]:
# Regularisation weights, used both for training and for the evaluation
# partials below so the two can never drift apart.
lambda1 = lambda2 = 0.3

validation_loss = partial(
    loss,
    encoded_users=valid_users,
    encoded_items=valid_items,
    ratings=valid_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
)

test_loss = partial(
    loss,
    encoded_users=test_users,
    encoded_items=test_items,
    ratings=test_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
)

models = []

# NOTE(review): rows of q / p for ids that occur only in the validation or
# test split are never visited by the training updates, so they keep their
# random initial values when the RMSE below is computed -- confirm this
# cold-start handling is intended.
for k in (4, 8, 16):
    # Re-seed inside the loop so every k starts from the same random stream.
    np.random.seed(121017)  # NOQA: NPY002
    q = np.random.normal(size=unique_users*k).reshape(unique_users, k)  # NOQA: NPY002
    p = np.random.normal(size=unique_items*k).reshape(unique_items, k)  # NOQA: NPY002
    q, p = SGD(
        encoded_users=train_users,
        encoded_items=train_items,
        ratings=train_ratings,
        lambda1=lambda1,
        lambda2=lambda2,
        q=q,
        p=p,
        epochs=10,
        lr=0.01,
    )

    _, rmse_val = validation_loss(q=q, p=p)
    models.append({"k": k, "q": np.copy(q), "p": np.copy(p), "RMSE": rmse_val})

    print("=" * 40)
    print(f"RMSE with k = {k} factors: {rmse_val:.6f}")
    print("=" * 40)

# Model selection uses the validation RMSE only; ties keep the earliest k.
best_model = min(models, key=lambda x: x["RMSE"])

# The test split is evaluated once, for the selected model.
_, rmse_test = test_loss(q=best_model["q"],p=best_model["p"])

result_str = f"Best model had k = {best_model['k']} factors, Validation RMSE: {best_model['RMSE']}, Test RMSE: {rmse_test}"
print()
print("*"*len(result_str))
print(result_str)
print("*"*len(result_str))

Loss: 2979907.660673, RMSE: 1.401940
Loss: 2418737.452257, RMSE: 1.248822
Loss: 2180889.557103, RMSE: 1.176139
Loss: 2048251.818305, RMSE: 1.132562
Loss: 1964234.719100, RMSE: 1.103397
Loss: 1906898.031309, RMSE: 1.082562
Loss: 1865800.471702, RMSE: 1.067014
Loss: 1835296.618059, RMSE: 1.055043
Loss: 1812052.892008, RMSE: 1.045601
Loss: 1793969.321085, RMSE: 1.038010
RMSE with k = 4 factors: 1.271235
Loss: 2360828.150837, RMSE: 1.208986
Loss: 2098238.810055, RMSE: 1.127593
Loss: 1982786.722295, RMSE: 1.089051
Loss: 1916613.562958, RMSE: 1.065878
Loss: 1873682.752792, RMSE: 1.050282
Loss: 1843707.680242, RMSE: 1.039054
Loss: 1821714.382455, RMSE: 1.030593
Loss: 1804976.122554, RMSE: 1.023999
Loss: 1791864.203873, RMSE: 1.018722
Loss: 1781343.350403, RMSE: 1.014405
RMSE with k = 8 factors: 1.199577
Loss: 2266297.274448, RMSE: 1.122188
Loss: 2105780.043836, RMSE: 1.074445
Loss: 2024408.493054, RMSE: 1.050121
Loss: 1972794.160549, RMSE: 1.034777
Loss: 1936426.733155, RMSE: 1.024069
Loss: 1

*************************************************************************************************
Best model had k = 16 factors, Validation RMSE: 1.1644991118551289, Test RMSE: 1.1628731158585603
*************************************************************************************************


# Question 3

## 3A

In [37]:
def bias_loss(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    b_user: np.ndarray,
    b_item: np.ndarray,
    b_global: float,
    lambda1: float,
    lambda2: float,
    lambda3: float,
    lambda4: float
) -> tuple[float, float]:
    """Loss function & RMSE function for the latent factor model with biases.

    A rating is predicted as ``q[u] . p[i] + b_global + b_user[u] + b_item[i]``.
    The loss is the sum of squared errors plus l2 penalties on ``q``, ``p``,
    ``b_user`` and ``b_item``; the RMSE is computed from the squared errors only.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on
        item[i]; must have the same length as the two index lists
    :param q: Latent vector representation of user profiles
    :param p: Latent vector representation of item profiles
    :param b_user: Per-user bias terms, indexed by encoded user
    :param b_item: Per-item bias terms, indexed by encoded item
    :param b_global: Global bias term added to every prediction
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :param lambda3: Weighting given to sum of squared user biases
    :param lambda4: Weighting given to sum of squared item biases
    :raises ZeroDivisionError: if ``ratings`` is empty
    :return: Loss value and RMSE value
    """
    # Explicit integer dtype so empty inputs still produce valid index arrays.
    user_idx = np.asarray(encoded_users, dtype=np.intp)
    item_idx = np.asarray(encoded_items, dtype=np.intp)
    observed = np.asarray(ratings, dtype=float)

    # Row-wise dot products q[u] . p[i] plus the three bias contributions.
    interaction = np.einsum("ij,ij->i", q[user_idx], p[item_idx])
    predicted = interaction + b_global + b_user[user_idx] + b_item[item_idx]
    difference_sum = float(np.sum((observed - predicted) ** 2))

    # Sum of squared row-wise l2 norms equals the sum of all squared entries.
    q_norm_sum = float(np.sum(q**2))
    p_norm_sum = float(np.sum(p**2))
    user_bias_sum = float(np.sum(b_user**2))
    item_bias_sum = float(np.sum(b_item**2))

    loss_val = (
        difference_sum
        + lambda1 * q_norm_sum
        + lambda2 * p_norm_sum
        + lambda3 * user_bias_sum
        + lambda4 * item_bias_sum
    )
    rmse = (difference_sum / len(ratings)) ** 0.5
    return loss_val, rmse


def bias_update_factors(
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    q: np.ndarray,
    p: np.ndarray,
    b_user: np.ndarray,
    b_item: np.ndarray,
    b_global: float,
    lambda1: float,
    lambda2: float,
    lambda3: float,
    lambda4: float,
    lr: float,
) -> float:
    for encoded_user, encoded_item, rating in zip(
        encoded_users, encoded_items, ratings, strict=False
    ):
        r_hat_ij = q[encoded_user, :] @ p[encoded_item, :].T - b_global - b_user[encoded_user] - b_item[encoded_item]
        diff_deriv = (rating - r_hat_ij) * -2
        for f in range(q.shape[1]):  # For factor in k
            q[encoded_user, f] -= lr * (
                diff_deriv * p[encoded_item, f] + 2 * lambda1 * q[encoded_user, f]
            )
            p[encoded_item, f] -= lr * (
                diff_deriv * q[encoded_user, f] + 2 * lambda2 * p[encoded_item, f]
            )
        
        b_user[encoded_user] -= lr*(diff_deriv + 2*lambda3 * b_user[encoded_user])
        b_item[encoded_item] -= lr*(diff_deriv + 2*lambda4 * b_item[encoded_item])


def bias_SGD(  # NOQA
    encoded_users: list[int],
    encoded_items: list[int],
    ratings: list[int],
    b_user: np.ndarray,
    b_item: np.ndarray,
    b_global: float,
    lambda1: float,
    lambda2: float,
    lambda3: float,
    lambda4: float,
    q: np.ndarray,
    p: np.ndarray,
    epochs: int = 10,
    lr: float = 0.01,
    verbose: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """Train the latent factor model with biases using stochastic gradient descent.

    Each epoch performs one full pass of ``bias_update_factors`` over the
    ratings. ``q``, ``p``, ``b_user`` and ``b_item`` are updated in place; only
    ``q`` and ``p`` are returned, so the trained biases must be read from the
    arrays that were passed in.

    :param encoded_users: List of user ids encoded to be integer indices
    :param encoded_items: List of item ids encoded to be integer indices
    :param ratings: List of ratings s.t. rating[i] belongs to encoded_user[i] on item[i]
    :param b_user: Initial per-user bias terms (modified in place)
    :param b_item: Initial per-item bias terms (modified in place)
    :param b_global: Global bias term, held fixed during training
    :param lambda1: Weighting given to sum of squared q l2 norms
    :param lambda2: Weighting given to sum of squared p l2 norms
    :param lambda3: Weighting given to sum of squared user biases
    :param lambda4: Weighting given to sum of squared item biases
    :param q: Initial latent vector representation of user profiles
    :param p: Initial latent vector representation of item profiles
    :param epochs: Number of passes over the ratings, defaults to 10
    :param lr: Learning rate (step size), defaults to 0.01
    :param verbose: When True (the default) the loss and RMSE on the supplied
        ratings are computed and printed after every epoch; when False that
        extra evaluation pass is skipped
    :return: The trained ``q`` and ``p`` (the same array objects passed in)
    """
    # Arguments shared verbatim by the update step and the loss evaluation.
    shared = {
        "encoded_users": encoded_users,
        "encoded_items": encoded_items,
        "ratings": ratings,
        "q": q,
        "p": p,
        "b_user": b_user,
        "b_item": b_item,
        "b_global": b_global,
        "lambda1": lambda1,
        "lambda2": lambda2,
        "lambda3": lambda3,
        "lambda4": lambda4,
    }

    for _ in range(epochs):
        bias_update_factors(lr=lr, **shared)
        if verbose:
            loss_val, rmse_val = bias_loss(**shared)
            print(f"Loss: {loss_val:.6f}, RMSE: {rmse_val:.6f}")
    return q, p

In [39]:
# Number of latent factors and a single shared weight for all four l2 penalties.
k = 8
lambda1=lambda2=lambda3=lambda4=0.3

# The encodings are zero-based and contiguous, so max index + 1 is the number
# of rows / bias entries needed.
unique_users = max(user_encoding.values()) + 1
unique_items = max(item_encoding.values()) + 1

# Seed the legacy global RNG so the initial values are repeatable; the draws
# happen in the order q, p, b_user, b_item, so reordering these lines changes
# the values.
np.random.seed(121017)  # NOQA: NPY002
q = np.random.normal(size=unique_users*k).reshape(unique_users, k)  # NOQA: NPY002
p = np.random.normal(size=unique_items*k).reshape(unique_items, k)  # NOQA: NPY002
b_user = np.random.normal(size=unique_users)  # NOQA: NPY002
b_item = np.random.normal(size=unique_items)  # NOQA: NPY002
# The global bias is the mean training rating.
b_global = global_bias(train)

# bias_SGD returns only q and p; b_user and b_item are updated in place.
q, p = bias_SGD(
    encoded_users=train_users,
    encoded_items=train_items,
    ratings=train_ratings,
    lambda1=lambda1,
    lambda2=lambda2,
    lambda3=lambda3,
    lambda4=lambda4,
    b_user=b_user,
    b_item=b_item,
    b_global=b_global,
    q=q,
    p=p,
    epochs=10,
    lr=0.01,
)

  diff_deriv * q[encoded_user, f] + 2 * lambda2 * p[encoded_item, f]
  r_hat_ij = q[encoded_user, :] @ p[encoded_item, :].T - b_global - b_user[encoded_user] - b_item[encoded_item]
  diff_deriv * p[encoded_item, f] + 2 * lambda1 * q[encoded_user, f]
  diff_deriv * q[encoded_user, f] + 2 * lambda2 * p[encoded_item, f]


KeyboardInterrupt: 