In [1]:
import numpy as np
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [3]:
df = pd.read_parquet('cleaned/netflix_parquet')
df2 = pd.read_parquet('cleaned/movielens_parquet')

### Netflix data stats:

In [4]:
unique_users = set()  # Using a set to store unique user IDs

# Iterate over each row
for index, row in df.iterrows():
    # Iterate over each dictionary in the 'review_data' column of the current row
    for review_entry in row['review_data']:
        user_id = review_entry.get('userId')  # Extracting 'userId' from the dictionary
        if user_id:  # Check if 'userId' exists in the dictionary
            unique_users.add(user_id)  # Add 'userId' to the set of unique user IDs

amount_of_reviews = list()  # Using a set to store unique user IDs

# Iterate over each row
for index, row in df.iterrows():
    # Iterate over each dictionary in the 'review_data' column of the current row
    for review_entry in row['review_data']:
        user_id = review_entry.get('userId')  # Extracting 'userId' from the dictionary
        if user_id:  # Check if 'userId' exists in the dictionary
            amount_of_reviews.append(user_id)  # Add 'userId' to the set of unique user IDs

print("There are {} reviews in the NETFLIX dataframe.".format(len(amount_of_reviews)))
print("There are {} unique users who have reviewed a movie.".format(len(unique_users)))
print("There are {} movieIds in the NETFLIX dataset.".format(len(df)))
avg_reviews_per_uniqueUser = len(amount_of_reviews) / len(unique_users)
print("A unique user places {} reviews on average in the NETFLIX dataset.".format(round(avg_reviews_per_uniqueUser)))
avg_reviews_per_movieId = len(amount_of_reviews) / len(df)
print("A movieId receives {} reviews on average in the NETFLIX dataset.".format(round(avg_reviews_per_movieId)))

There are 9998038 reviews in the NETFLIX dataframe.
There are 447835 unique users who have reviewed a movie.
There are 1962 movieIds in the NETFLIX dataset.
A unique user places 22 reviews on average in the NETFLIX dataset.
A movieId receives 5096 reviews on average in the NETFLIX dataset.


### Movielens data stats:

In [6]:
unique_users = set()  # Using a set to store unique user IDs

# Iterate over each row
for index, row in df2.iterrows():
    # Iterate over each dictionary in the 'review_data' column of the current row
    for review_entry in row['review_data']:
        user_id = review_entry.get('userId')  # Extracting 'userId' from the dictionary
        if user_id:  # Check if 'userId' exists in the dictionary
            unique_users.add(user_id)  # Add 'userId' to the set of unique user IDs

amount_of_reviews = list()  # Using a set to store unique user IDs

# Iterate over each row
for index, row in df2.iterrows():
    # Iterate over each dictionary in the 'review_data' column of the current row
    for review_entry in row['review_data']:
        user_id = review_entry.get('userId')  # Extracting 'userId' from the dictionary
        if user_id:  # Check if 'userId' exists in the dictionary
            amount_of_reviews.append(user_id)  # Add 'userId' to the set of unique user IDs

print("There are {} reviews in the MOVIELENS dataframe.".format(len(amount_of_reviews)))
print("There are {} unique users who have reviewed a movie.".format(len(unique_users)))
print("There are {} movieIds in the MOVIELENS dataset.".format(len(df)))
avg_reviews_per_uniqueUser = len(amount_of_reviews) / len(unique_users)
print("A unique user places {} reviews on average in the MOVIELENS dataset.".format(round(avg_reviews_per_uniqueUser)))
avg_reviews_per_movieId = len(amount_of_reviews) / len(df)
print("A movieId receives {} reviews on average in the MOVIELENS dataset.".format(round(avg_reviews_per_movieId)))

TypeError: 'NoneType' object is not iterable

**Conclusion:** the movielens can be considered as more sparse (meaning more null values) than Netflix as the amount of movieIds is much higher, but the avg. review per userId and per movieId is much lower. We will see in performance if this makes a difference.

Define dataframe without date item in review_data dictionary to start with, later date features may be added for both Netflix and movielens:

In [None]:
netflix_df = df[df.columns]
netflix_df['review_data'] = netflix_df['review_data'].apply(lambda x: None if x is None else [{'userId': review['userId'], 'rating': review['rating']} for review in x if 'userId' in review and 'rating' in review])

In [None]:
movielens_df = df2[df2.columns]
movielens_df['review_data'] = movielens_df['review_data'].apply(lambda x: None if x is None else [{'userId': review['userId'], 'rating': review['rating']} for review in x if 'userId' in review and 'rating' in review])

To start with, I am going to make multiple random samples using random sample with replacement:

*only samples from EDA which were able to catch differences during ANOVA are selected, and which showed similar distribution to the complete dataset*

In [None]:
# make NETFLIX samples using random sampling:
sample_third = netflix_df.sample(frac=1/3, random_state=42)
sample_quarter = netflix_df.sample(frac=1/4, random_state=42)
sample_sixth = netflix_df.sample(frac=1/6, random_state=42)
sample_tenth = netflix_df.sample(frac=1/10, random_state=42)

In [None]:
# make MOVIELENS samples using random sampling:
sample_third_ml = movielens_df.sample(frac=1/3, random_state=42)
sample_quarter_ml = movielens_df.sample(frac=1/4, random_state=42)
sample_sixth_ml = movielens_df.sample(frac=1/6, random_state=42)
sample_tenth_ml = movielens_df.sample(frac=1/10, random_state=42)

### Feature engineering and pre processing:

Year and title will be dropped:

In [None]:
sample_tenth = sample_tenth.drop(['year','title'],axis=1)
sample_third = sample_third.drop(['year','title'],axis=1)
sample_third_ml = sample_third_ml[['movieId','review_data']]

Then, let's split our data into train, validation and test sets where we ensure that no training data flows into test and validation sets:

In [None]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits the data into training, validation, and test sets, simultaneously ensuring no training data flows into validation or test data.

    Parameters:
    - data: pandas DataFrame containing the data to be split.
    - train_ratio: float, ratio of the training set size to the total data size (default: 0.8).
    - val_ratio: float, ratio of the validation set size to the total data size (default: 0.1).
    - test_ratio: float, ratio of the test set size to the total data size (default: 0.1).

    Returns:
    - train_data: pandas DataFrame, training set.
    - val_data: pandas DataFrame, validation set.
    - test_data: pandas DataFrame, test set.
    """
    # Shuffle the data
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the sizes of each set
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    num_test = num_samples - num_train - num_val

    # Split the data into train, validation, and test sets
    train_data = data_shuffled[:num_train]
    # Below is ensured the validation data and the test data starts after the indices which are already in the training data, ensuring that no training data will flow into validation of test data.
    val_data = data_shuffled[num_train:num_train+num_val]
    test_data = data_shuffled[num_train+num_val:]

    # Reset index for each set
    train_data.reset_index(drop=True, inplace=True)
    val_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return train_data, val_data, test_data

Let's split the data accordingly and take two differenct sample sizes to see what effect it has on model performance:

In [None]:
# netflix dataset splitting
train_data, val_data, test_data = train_val_test_split(sample_tenth)
train_data2, val_data2, test_data2 = train_val_test_split(sample_third)
# movielens dataset splitting
train_data3, val_data3, test_data3 = train_val_test_split(sample_third_ml)

Subsequently, let's define some function to make our life easer for the compatibility of more datasets. We gather unique item and user ids, create user-item matrix which will be centered, followed by performing SVD en making recommendations using the dot product between the decomposed matrices resulting from SVD:

In [None]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a user-item matrix from the provided dataset containing review data.

    Parameters:
    train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
                                    which is a list of dictionaries with keys 'userId', 'rating',
                                    and 'movieId'.

    Returns:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies), the matrix is an NumPy array which contains lists of user-item interactions, meaning a user and their corresponding ratings to the movieIds.    
    
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    
    user_ids (numpy.ndarray): Array containing user IDs corresponding to each rating in the matrix.
    
    movie_ids (numpy.ndarray): Array containing movie IDs corresponding to each rating in the matrix.

    """
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.zeros((user_count, movie_count))

    # populate the user-item matrix with ratings from netflix dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

# # I will center the data in the function below, to make the matrix more robust to handle variations in user ratings
# def center_data(user_item_matrix):
#     """
#     Creates a centered matrix of the previously created user-item matrix

#     Parameters:
#     User-item matrix which is made a Numpy array with appended lists with ratings of each users of each item. Each position in each list corresponds to the same movieId. Datatype within the matrix is float64.

#     Return:
#     A centered user item matrix, where the row mean of each user is substracted from the initial ratings, to account for variations in ratings
    
#     """
#     user_means = np.mean(user_item_matrix, axis=1)
#     centered_user_item_matrix = user_item_matrix - user_means[:, np.newaxis]
#     return centered_user_item_matrix, user_means

def center_data(user_item_matrix):
    """
    Creates a centered matrix of the previously created user-item matrix

    Parameters:
    User-item matrix which is made a Numpy array with appended lists with ratings of each users of each item. Each position in each list corresponds to the same movieId. Datatype within the matrix is float64. Each NaN value is converted to 0. In other words, for the time being the implicit feedback is converted to 0.

    Return:
    A centered user item matrix, where the row mean of each user is subtracted from the initial ratings, to account for variations in ratings
    
    """
    # Check for NaN values and replace them with 0
    user_item_matrix[np.isnan(user_item_matrix)] = 0
    
    # Compute user means
    user_means = np.mean(user_item_matrix, axis=1)
    
    # Center the data
    centered_user_item_matrix = user_item_matrix - user_means[:, np.newaxis]
    
    return centered_user_item_matrix, user_means

# I will decompose the user item matrix in this function using numpy
def apply_svd(centered_user_item_matrix, num_latent_factors):
    """
    Applies Singular Value Decomposition (SVD) to decompose the centered user-item matrix into three matrices:
    U, Sigma, and Vt.

    U: user matrix with values which represent the relation between the chosen latent factors, Users are the rows, matrix is orthonormal to Vt
    Sigma: diagonal matrix where the chosen latent factors are in the diagonal line, ordered descendingly. 
    Vt: Item matrix with values which represent the relation between the chosen latent factors, Items are the columns, matrix is orthonormal to U

    Parameters:
    centered_user_item_matrix (numpy.ndarray): Centered user-item matrix to be decomposed.
    num_latent_factors (int): Number of latent factors to retain in the decomposition.

    Returns:
    U (numpy.ndarray): Matrix representing the relationship between users and latent factors.
    Sigma (numpy.ndarray): Diagonal matrix containing the singular values, representing the importance of each latent factor.
    Vt (numpy.ndarray): Transpose of the matrix representing the relationship between items and latent factors.

    """
    # U, sigma and Vt are created using the svd function from numpy
    U, Sigma, Vt = np.linalg.svd(centered_user_item_matrix, full_matrices=False)
    # set up sigma, which is the diagonal matrix from the decomposition
    Sigma = np.diag(Sigma[:num_latent_factors])
    # set up U and Vt which have to orthonormal to each other to ensure U represents each user and Vt represents each item, otherwise the total matrix would not add up.
    U = U[:, :num_latent_factors]
    Vt = Vt[:num_latent_factors, :]
    return U, Sigma, Vt

# I will compute recommendations by the dotproduct of the decomposed matrices from svd in this function
def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations, user_item_matrix):
    """
    Computes recommendations for all users based on the decomposed matrices from Singular Value Decomposition (SVD).

    Parameters:
    U: user matrix with values which represent the relation between the chosen latent factors, Users are the rows, matrix is orthonormal to Vt
    Sigma: diagonal matrix where the chosen latent factors are in the diagonal line, ordered descendingly. 
    Vt: Item matrix with values which represent the relation between the chosen latent factors, Items are the columns, matrix is orthonormal to U

    user_means (numpy.ndarray): Array containing mean ratings for each user.
    user_ids (numpy.ndarray): Array containing user IDs.
    num_recommendations (int): Number of recommendations to generate for each user.
    user_item_matrix (numpy.ndarray): Matrix representing user-item interactions, where rows correspond to users and columns correspond to items.

    Returns:
    all_recommendations (dict): Dictionary mapping user IDs to lists of top recommended item IDs.
    all_predicted_centered_ratings (numpy.ndarray): Array of predicted centered ratings for all users and items.
                                                    Predicted ratings are centered by adding the mean rating for each user.
                                                    Each row corresponds to a user, and each column corresponds to an item.
    """
    all_recommendations = {}
    all_predicted_centered_ratings = np.zeros_like(user_item_matrix)  # Initialize array for predicted ratings

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}

    for user_id in user_ids:
        user_index = user_id_to_index[user_id]
        # the matrix multiplication below gives me the new ratings based on the dot product of corresponding row(user_index and therefore userId) with the recomposed item matrix with less features. Essentially the ratings are computed again with less features.
        user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
        # masks out items greater than 0, meaning an item a user already interacted with, the rest is filled with infinite values to show that a user did not interact with that item yet
        user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
        # sort values descendingly, to identify top rated item per user
        top_indices = np.argsort(user_ratings)[::-1]
        # store the top predicted ratings for the current user
        all_predicted_centered_ratings[user_index, :] = user_ratings
        # select the top 'num_recommendations' items as recommendations
        top_items = top_indices[:num_recommendations] + 1
        # store the top recommended items for the current user
        all_recommendations[user_id] = top_items

    return all_recommendations, all_predicted_centered_ratings

**I can be sure user_ids across functions are the same, because:** in the function compute_recommendations_for_all_users, the user IDs are used to retrieve the corresponding user indices within the centered matrix. Here's how:

User IDs are used to retrieve the corresponding user indices using the user_id_to_index dictionary.
The predicted ratings for each user are computed based on their index within the centered matrix.
After computation, the recommendations and predicted ratings are stored and returned in a manner that preserves the correspondence between user IDs and their respective predictions.

**Therefore**, when accessing the predictions or recommendations for a specific user ID from the returned results, you can be confident that they correspond to the same user ID in the original centered matrix.

***********************

Before parameter tuning, I will run the recommender system for the train and validation set and record some baseline performance. Root Mean Squared Error (RMSE) will be used as performance metric. 

- Reason behind this is the corresponding original and predicted centered ratings from the train_data and val_data will be used for measuring performance. A form of squared mean error is appropriate for such cases. Recall and precision revolve around ratings which are relevant to the user or not, which is difficult and subjective to identify within this model. 
  
- Furthermore, RMSE will is expressed in the same units as the input data, making it easy to interpret for a user but a stakeholder as well.
- RMSE tend to highlight differences more on smaller sample sizes than MSE would do.

# Netflix Prize dataset

In [None]:
# here I make a baseline selection of latent factors
num_latent_factors = 1

# here I make a baseline selection of recommendations per user
num_recommendations = 4

Baseline train_data sample tenth

In [None]:
user_item_matrix_train, user_id_dict_train, movie_id_dict_train, user_ids_train, movie_ids_train = create_user_item_matrix(train_data)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids_train = list(set(user_ids_train))
item_ids_train = list(set(movie_ids_train))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix_train, user_means_train = center_data(user_item_matrix_train)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U_TRAIN, Sigma_train, Vt_train = apply_svd(centered_user_item_matrix_train, num_latent_factors)

# compute the recommendations
all_recommendations_train, all_predicted_centered_ratings_train = compute_recommendations_for_all_users(U_TRAIN, Sigma_train, Vt_train, user_means_train, user_ids_train, num_recommendations, user_item_matrix_train)

Baseline val_data smaple tenth

In [None]:
user_item_matrix_val, user_id_dict_val, movie_id_dict_val, user_ids_val, movie_ids_val = create_user_item_matrix(val_data)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids_val = list(set(user_ids_val))
item_ids_val = list(set(movie_ids_val))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix_val, user_means_val = center_data(user_item_matrix_val)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U_VAL, Sigma_val, Vt_val = apply_svd(centered_user_item_matrix_val, num_latent_factors)

# compute the recommendations
all_recommendations_val, all_predicted_centered_ratings_val = compute_recommendations_for_all_users(U_VAL, Sigma_val, Vt_val, user_means_val, user_ids_val, num_recommendations, user_item_matrix_val)

Baseline train_data sample third

In [None]:
user_item_matrix_train2, user_id_dict_train2, movie_id_dict_train2, user_ids_train2, movie_ids_train2 = create_user_item_matrix(train_data2)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids_train2 = list(set(user_ids_train2))
item_ids_train2 = list(set(movie_ids_train2))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix_train2, user_means_train2 = center_data(user_item_matrix_train2)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U_TRAIN2, Sigma_train2, Vt_train2 = apply_svd(centered_user_item_matrix_train2, num_latent_factors)

# compute the recommendations
all_recommendations_train2, all_predicted_centered_ratings_train2 = compute_recommendations_for_all_users(U_TRAIN2, Sigma_train2, Vt_train2, user_means_train2, user_ids_train2, num_recommendations, user_item_matrix_train2)

Baseline val_data sample third

In [None]:
user_item_matrix_val2, user_id_dict_val2, movie_id_dict_val2, user_ids_val2, movie_ids_val2 = create_user_item_matrix(val_data2)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids_val2 = list(set(user_ids_val2))
item_ids_val2 = list(set(movie_ids_val2))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix_val2, user_means_val2 = center_data(user_item_matrix_val2)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U_val2, Sigma_val2, Vt_val2 = apply_svd(centered_user_item_matrix_val2, num_latent_factors)

# compute the recommendations
all_recommendations_val2, all_predicted_centered_ratings_val2 = compute_recommendations_for_all_users(U_val2, Sigma_val2, Vt_val2, user_means_val2, user_ids_val2, num_recommendations, user_item_matrix_val2)

In [None]:
def compute_mse_rmse(original_ratings, predicted_ratings):
    """
    Computes the Root Mean Square Error (RMSE) between the original ratings and the predicted ratings. MovieIds a user has not interacted with is turned into 0 for now.

    Parameters:
    original_ratings (numpy.ndarray): Array containing the original ratings.
    predicted_ratings (numpy.ndarray): Array containing the predicted ratings.

    Returns:
    float: The RMSE value.
    
    """
    # handle implicit ratings with 0s for now
    original_ratings = np.nan_to_num(original_ratings, nan=0, posinf=0, neginf=0)
    predicted_ratings = np.nan_to_num(predicted_ratings, nan=0, posinf=0, neginf=0)

    # make 1d arrays by flattening them to be able to make masks
    original_ratings_flat = original_ratings.flatten()
    predicted_ratings_flat = predicted_ratings.flatten()
    
    # remove entries with no original rating (unrated items)
    mask = original_ratings_flat != 0
    original_ratings_flat = original_ratings_flat[mask]
    predicted_ratings_flat = predicted_ratings_flat[mask]
    
    # Compute the squared differences
    squared_diff = np.square(original_ratings_flat - predicted_ratings_flat)
    
    # Compute the mean squared error
    mse = np.mean(squared_diff)
    
    # Compute the square root of the mean squared error to get RMSE
    rmse = np.sqrt(mse)
    
    return mse,rmse

In [None]:
user_item_matrix_train.shape
centered_user_item_matrix_train.shape
all_predicted_centered_ratings_train.shape

(9086, 8)

(9086, 8)

(9086, 8)

In [None]:
# Evaluate performance on the FIRST training set
train_mse,train_rmse = compute_mse_rmse(centered_user_item_matrix_train, all_predicted_centered_ratings_train)
# Evaluate performance on the FIRST validation set
val_mse,val_rmse = compute_mse_rmse(centered_user_item_matrix_val, all_predicted_centered_ratings_val)

# Evaluate performance on the SECOND training set
train_mse2,train_rmse2 = compute_mse_rmse(centered_user_item_matrix_train2, all_predicted_centered_ratings_train2)
# Evaluate performance on the SECOND validation set
val_mse2,val_rmse2 = compute_mse_rmse(centered_user_item_matrix_val2, all_predicted_centered_ratings_val2)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


FIRST sample result (sample_tenth)

In [None]:
print("RMSE on training set:", train_rmse)
print("RMSE on validation set:", val_rmse)

RMSE on training set: 1.3380707207291844
RMSE on validation set: nan


**Baseline findings (sample_tenth):** MSE is lower on validation set than training set, indicating the model overfits to data it has already seen. Then again, it could also have something to do with the sample size. It choose the smaller one first. Having a smaller amount of reviews to rely on, could cause the model to perform worse on the validation data as the model has less reviews to work with.

As in a SVD model the number of latent factors is the only tunable hyperparameter, I will perform hyperparameter tuning below on it below by looping throug different amounts of latent factors and computing their corresponding RMSE with relation to the orignal centered ratings.

</BR>
SECOND sample result (sample_third)

In [None]:
print("RMSE on training set:", train_rmse2)
print("RMSE on validation set:", val_rmse2)

RMSE on training set: 0.8220297034898152
RMSE on validation set: 1.9386260127843542


**Baseline performance (increased sample size to a third of the dataset (sample_third)):** 

we can see two things:
- the error stays more or less the same on the training and the validation sets 
- the difference between the error on the training and the validation set is substantially smaller

**Conclusion:** the SVD recommender system is overfitting less to the training data when the sample size increases. This is a logic thing to happen, as the validation set becomes bigger, leading the RecSys to have more unseen reviews to work with. 

### Hyper parameter tuning (sample_tenth)

In [None]:
latent_factors_range = [1, 10, 25, 50, 250]
rmse_values = []

for num_latent_factors in latent_factors_range:
    # perform only SVD and generationg of recommendations again while doing the loop, as the matrix and centered matrix will not change depending on the amount of latent factors    
    # apply Singular Value Decomposition (SVD)
    U_TRAIN, Sigma_train, Vt_train = apply_svd(centered_user_item_matrix_train, num_latent_factors)
    
    # compute recommendations for all users
    all_recommendations_train, all_predicted_centered_ratings_train = compute_recommendations_for_all_users(U_TRAIN, Sigma_train, Vt_train, user_means_train, user_ids_train, num_recommendations, user_item_matrix_train)
    
    # compute Mean Squared Error (MSE) or Root Mean Square Error (RMSE) with the function I have written for it
    mse, rmse = compute_mse_rmse(centered_user_item_matrix_train, all_predicted_centered_ratings_train)
    
    # append the RMSE value to the list
    rmse_values.append(rmse)

# print the result descendingly
for i, num_latent_factors in enumerate(latent_factors_range):
    print(f"Num Latent Factors: {num_latent_factors} | RMSE: {rmse_values[i]}")


Num Latent Factors: 1 | RMSE: 1.3380707207291844
Num Latent Factors: 10 | RMSE: 1.1615958074236314
Num Latent Factors: 25 | RMSE: 1.1615958074236314
Num Latent Factors: 50 | RMSE: 1.1615958074236314
Num Latent Factors: 250 | RMSE: 1.1615958074236314


### Hyper parameter tuning (sample_third)

In [None]:
latent_factors_range2 = [1, 10, 25, 50, 250]
rmse_values2 = []

for num_latent_factors in latent_factors_range2:
    # perform only SVD and generationg of recommendations again while doing the loop, as the matrix and centered matrix will not change depending on the amount of latent factors    
    # apply Singular Value Decomposition (SVD)
    U_TRAIN2, Sigma_train2, Vt_train2 = apply_svd(centered_user_item_matrix_train2, num_latent_factors)
    
    # compute recommendations for all users
    all_recommendations_train2, all_predicted_centered_ratings_train2 = compute_recommendations_for_all_users(U_TRAIN2, Sigma_train2, Vt_train2, user_means_train2, user_ids_train2, num_recommendations, user_item_matrix_train2)
    
    # compute Mean Squared Error (MSE) or Root Mean Square Error (RMSE) with the function I have written for it
    mse, rmse = compute_mse_rmse(centered_user_item_matrix_train2, all_predicted_centered_ratings_train2)
    
    # append the RMSE value to the list
    rmse_values2.append(rmse)

# print the result descendingly
for i, num_latent_factors in enumerate(latent_factors_range2):
    print(f"Num Latent Factors: {num_latent_factors} | RMSE: {rmse_values2[i]}")


Num Latent Factors: 1 | RMSE: 0.8220297034898152
Num Latent Factors: 10 | RMSE: 0.7797149702343218
Num Latent Factors: 25 | RMSE: 0.7686107981337318
Num Latent Factors: 50 | RMSE: 0.7686107981337318
Num Latent Factors: 250 | RMSE: 0.7686107981337318


Best amount of latent factors is 250

### Final predictions on test set:

Increased sample size (sample_third) and the best amount of latent factors are used to predict.

In [None]:
num_latent_factors_test2 = 250

user_item_matrix_test2, user_id_dict_test2, movie_id_dict_test2, user_ids_test2, movie_ids_test2 = create_user_item_matrix(test_data2)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids_test2 = list(set(user_ids_test2))
item_ids_test2 = list(set(movie_ids_test2))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix_test2, user_means_test2 = center_data(user_item_matrix_test2)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U_test2, Sigma_test2, Vt_test2 = apply_svd(centered_user_item_matrix_test2, num_latent_factors_test2)

# compute the recommendations
all_recommendations_test2, all_predicted_centered_ratings_test2 = compute_recommendations_for_all_users(U_test2, Sigma_test2, Vt_test2, user_means_test2, user_ids_test2, num_recommendations, user_item_matrix_test2)

In [None]:
test_mse2,test_rmse2 = compute_mse_rmse(centered_user_item_matrix_test2, all_predicted_centered_ratings_test2)
print("RMSE on test set:", test_rmse2)

RMSE on test set: 1.5934194788493532


### Overall conclusion:

Hyperparametertuning and increasing the sample size did eventually not lead to a lower error on the test set. Maybe adding more features will help?

# Movielens dataset:

Fit the RecSys on the training data

In [None]:
num_latent_factors_train3 = 250

user_item_matrix_train3, user_id_dict_train3, movie_id_dict_train3, user_ids_train3, movie_ids_train3 = create_user_item_matrix(train_data3)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids_train3 = list(set(user_ids_train3))
item_ids_train3 = list(set(movie_ids_train3))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix_train3, user_means_train3 = center_data(user_item_matrix_train3)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U_train3, Sigma_train3, Vt_train3 = apply_svd(centered_user_item_matrix_train3, num_latent_factors_train3)

# compute the recommendations
all_recommendations_train3, all_predicted_centered_ratings_train3 = compute_recommendations_for_all_users(U_train3, Sigma_train3, Vt_train3, user_means_train3, user_ids_train3, num_recommendations, user_item_matrix_train3)

Fit the RecSys on the validation data

In [None]:
num_latent_factors_val3 = 250

user_item_matrix_val3, user_id_dict_val3, movie_id_dict_val3, user_ids_val3, movie_ids_val3 = create_user_item_matrix(val_data3)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids_val3 = list(set(user_ids_val3))
item_ids_val3 = list(set(movie_ids_val3))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix_val3, user_means_val3 = center_data(user_item_matrix_val3)

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U_val3, Sigma_val3, Vt_val3 = apply_svd(centered_user_item_matrix_val3, num_latent_factors_val3)

# compute the recommendations
all_recommendations_val3, all_predicted_centered_ratings_val3 = compute_recommendations_for_all_users(U_val3, Sigma_val3, Vt_val3, user_means_val3, user_ids_val3, num_recommendations, user_item_matrix_val3)

See baseline performance:

In [None]:
train_mse3,train_rmse3 = compute_mse_rmse(centered_user_item_matrix_train3, all_predicted_centered_ratings_train3)
print("RMSE on training set:", train_rmse3)
val_mse3,val_rmse3 = compute_mse_rmse(centered_user_item_matrix_val3, all_predicted_centered_ratings_val3)
print("RMSE on validation set:", val_rmse3)

RMSE on training set: 0.7959795423781132
RMSE on validation set: 1.566462504410829


The model is performing slightly better, and the error is smaller as well. Adding a filter to the movieLens dataset helped to turn down the error.
<p>
</p>

</br>Perform hyperparameter tuning on num_latent_factors:

In [None]:
latent_factors_range3 = [1, 10, 25, 50, 250]
rmse_values3 = []

for num_latent_factors in latent_factors_range3:
    # perform only SVD and generationg of recommendations again while doing the loop, as the matrix and centered matrix will not change depending on the amount of latent factors    
    # apply Singular Value Decomposition (SVD)
    U_TRAIN3, Sigma_train3, Vt_train3 = apply_svd(centered_user_item_matrix_train3, num_latent_factors)
    
    # compute recommendations for all users
    all_recommendations_train3, all_predicted_centered_ratings_train3 = compute_recommendations_for_all_users(U_TRAIN3, Sigma_train3, Vt_train3, user_means_train3, user_ids_train3, num_recommendations, user_item_matrix_train3)
    
    # compute Mean Squared Error (MSE) or Root Mean Square Error (RMSE) with the function I have written for it
    mse, rmse = compute_mse_rmse(centered_user_item_matrix_train3, all_predicted_centered_ratings_train3)
    
    # append the RMSE value to the list
    rmse_values3.append(rmse)

# print the result descendingly
for i, num_latent_factors in enumerate(latent_factors_range3):
    print(f"Num Latent Factors: {num_latent_factors} | RMSE: {rmse_values3[i]}")

Num Latent Factors: 1 | RMSE: 0.8419723262061493
Num Latent Factors: 10 | RMSE: 0.8038566651519746
Num Latent Factors: 25 | RMSE: 0.7959795423781132
Num Latent Factors: 50 | RMSE: 0.7959795423781132
Num Latent Factors: 250 | RMSE: 0.7959795423781132


No matter which of the iterated number of latent factors I put in, the RecSys model is not able to have a lower error term, meaning the model operates bad.

### Overall conclusion:

The current amount of features is not able to capture the complexity of the data, meaning it is not able to catch each preference of the user. Adding more features will maybe lead to a better result.


**BY ADDING FILTER MATRIX IS LESS SPARSE, IMPLICIT FEEDBACK IS NOT TAKEN INTO ACCOUNT. BOTH ARE CASES WHICH COULD LEAD THE MODEL TO PERFORM BETTER. BUT THIS DOES NOT MEAN THE MODEL PERFORM WELL, AS IT ALSO NEEDS TO GENERATE RECOMMENDATIONS FOR NEW USERS (COLD START PROBLEM)**

In [None]:
# write code

### Include time feature in matrix:

In [None]:
# train_data2, val_data2, test_data2 = train_val_test_split(df)

In [None]:
# # Extract titles, user IDs, ratings, and dates
# review_data2 = train_data2['review_data'].values
# user_ids2 = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data2])
# ratings2 = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data2])
# dates = np.concatenate([np.array([entry['date'] for entry in row], dtype='datetime64') for row in review_data2])
# movieIds2 = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data2)])

In [None]:
# train_data

In [None]:
# movieIds2
# user_ids2
# ratings
# dates

In [None]:
# # define function to convert datetime64[D] to months to normalize the dates
# def get_month(date):
#     month = (date.astype('datetime64[M]').astype(int) % 12) + 1
#     return month

# # Convert datetime64[D] dates to months
# months = np.array([get_month(date) for date in dates])

In [None]:
# # Create dictionaries to map user IDs and movie IDs to unique indices
# user_id_dict2 = {user_id: index for index, user_id in enumerate(np.unique(user_ids2))}
# movie_id_dict2 = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds2))}

# # Initialize the user-item-time matrix
# user_count2 = len(user_id_dict2)
# movie_count2 = len(movie_id_dict2)
# matrix_3d = np.zeros((user_count2, movie_count2, 2))

# # Populate the matrix with ratings and normalized timestamps
# for user_id, movie_id, rating, month in zip(user_ids2, movieIds2, ratings2, months):
#     user_index = user_id_dict2[user_id]
#     movie_index = movie_id_dict2[movie_id]
#     matrix_3d[user_index, movie_index] = [rating, month]

In [None]:
# unique_values = np.unique(matrix_3d)
# print("Unique values in the user-item-time matrix:", unique_values)
# matrix_3d

In [None]:
# # set is used because it does not allow for duplicates
# user_ids2 = set()

# # iterate over each row
# for index, row in train_data2.iterrows():
#     # iterate over each dictionary in the 'review_data' column of the current row
#     for review_dict in row['review_data']:
#         user_id = review_dict.get('userId')  # Extract userId from the dictionary
#         if user_id:  # Check if userId exists
#             user_ids2.add(int(user_id))  # Add user ID to the set

# user_ids2 = list(user_ids2)

In [None]:
# # put movieids in set so duplicates are not allowed here either
# item_ids2 = list(set(train_data2['movieId'].unique()))

In [None]:
# def center_data_3d(matrix_3d):
#     # Calculate mean along the second axis (movies axis)
#     user_means = np.mean(matrix_3d, axis=(1, 2), keepdims=True)
#     # Subtract the mean from the original matrix
#     centered_user_item_matrix_3d = matrix_3d - user_means
#     return centered_user_item_matrix_3d, user_means

# def apply_svd_3d(centered_user_item_matrix_3d, num_latent_factors):
#     # Reshape the matrix to be 2D for SVD
#     reshaped_matrix = centered_user_item_matrix_3d.reshape(centered_user_item_matrix_3d.shape[0], -1)
#     # Perform SVD
#     U, Sigma, Vt = np.linalg.svd(reshaped_matrix, full_matrices=False)
#     # Keep only the specified number of latent factors
#     U = U[:, :num_latent_factors]
#     Sigma = np.diag(Sigma[:num_latent_factors])
#     Vt = Vt[:num_latent_factors, :]
#     return U, Sigma, Vt

# def compute_recommendations_for_all_users_3d(U, Sigma, Vt, user_means, user_ids2, num_recommendations):
#     all_recommendations = {}
#     user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids2)}
#     for user_id in user_ids2:
#         user_index = user_id_to_index[user_id]
#         # Perform dot product for each user
#         user_ratings = np.dot(U[user_index], np.dot(Sigma, Vt)) + user_means[user_index]
#         # Exclude items already interacted with
#         user_ratings[user_item_matrix[user_index] > 0] = -np.inf
#         # Get indices of top recommendations
#         top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
#         top_items = top_indices + 1
#         all_recommendations[user_id] = top_items
#     return all_recommendations

In [None]:
# # select the Number of Latent Factors
# num_latent_factors = 4 

# # unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
# centered_user_item_matrix_3d, user_means = center_data_3d(matrix_3d)

# # apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
# U, Sigma, Vt = apply_svd_3d(centered_user_item_matrix_3d, num_latent_factors)
# U 
# Sigma
# Vt

# # define number of recommendations per user
# num_recommendations = 4

# # compute the recommendations
# all_recommendations2 = compute_recommendations_for_all_users_3d(U, Sigma, Vt, user_means, user_ids, num_recommendations)

In [None]:
# all_recommendations2

# Redundant but maybe useful for troubleshooting:

In [None]:
# # Extract unique user IDs from the dataset
# dataset_user_ids = set()
# for review_list in train_data['review_data']:
#     for review_dict in review_list:
#         user_id = review_dict.get('userId')
#         if user_id:
#             dataset_user_ids.add(user_id)

# # Check if all user IDs in the matrix are also in the dataset, and vice versa
# user_ids_in_dataset_not_in_matrix = dataset_user_ids - set(user_ids)
# user_ids_in_matrix_not_in_dataset = set(user_ids) - dataset_user_ids
# len(user_ids_in_dataset_not_in_matrix)
# len(user_ids_in_matrix_not_in_dataset)

User item matrix with pandas:

In [None]:
# # extract review dates, user IDs, and ratings using NumPy
# review_data = train_data['review_data'].values
# user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
# ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])

# # Extract movie titles
# movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# # Create a DataFrame with review dates, user IDs, ratings, and movie titles
# review_df = pd.DataFrame({'userId': user_ids, 'rating': ratings, 'movieId': movieIds})

# # Pivot review_df to get user-item matrix with reviews as values
# user_item_matrix_df = review_df.pivot_table(index='userId', columns='movieId', values='rating')

# # Fill NaN values with 0
# user_item_matrix_df = user_item_matrix_df.fillna(0)

# # Convert DataFrame to NumPy array
# user_item_matrix = user_item_matrix_df.to_numpy()

# user_item_matrix

In [None]:
# # extract titles from dataframe, user IDs, and ratings from dictionary using NumPy
# review_data = train_data['review_data'].values
# user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
# ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
# movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# # create dictionaries to map user IDs and movie IDs to unique indices to map over
# user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
# movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

# # initialize an empty user-item matrix
# user_count = len(user_id_dict)
# movie_count = len(movie_id_dict)
# user_item_matrix = np.zeros((user_count, movie_count))

# # populate the user-item matrix with ratings from netflix dataset
# for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
#     user_index = user_id_dict[user_id]
#     movie_index = movie_id_dict[movie_id]
#     user_item_matrix[user_index, movie_index] = rating

In [None]:
# # set is used because it does not allow for duplicates
# user_ids = set()

# # iterate over each row
# for index, row in train_data.iterrows():
#     # iterate over each dictionary in the 'review_data' column of the current row
#     for review_dict in row['review_data']:
#         user_id = review_dict.get('userId')  # Extract userId from the dictionary
#         if user_id:  # Check if userId exists
#             user_ids.add(int(user_id))  # Add user ID to the set

# user_ids = list(user_ids)

In [None]:
# # put movieids in set so duplicates are not allowed here either
# item_ids = list(set(train_data['movieId'].unique()))

Function which computes recommendations and returns recommendations only, not the predicted ratings for every item after the svd matrix dot product:

In [None]:
# I will compute recommendations for each user_id in the training/test/validation data by performing the dot product between the previous reviews in the matrix by the reconstruction of the user item matrix with less features
# def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations):
#     all_recommendations = {}
#     user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
#     for user_id in user_ids:
#         user_index = user_id_to_index[user_id]
#         # matrix multriplication between Sigma and Vt, to reconstruct an item matrix with less features, followed by the dot product of U and the reconstruction of item matrix. It essentially calculates the predicted ratings for each item for the given user based on their latent given ratings. user_means[user_index] rules out the items the user already interacted with.
#         user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
#         user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
#         top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
#         top_items = top_indices + 1
#         all_recommendations[user_id] = top_items
#     return all_recommendations

Old functions to extract unique user and item ids

In [None]:
# def extract_unique_user_ids(train_test_val_set):
#     user_ids = set()
#     # iterate over each row
#     for index, row in train_test_val_set.iterrows():
#         # iterate over each dictionary in the 'review_data' column of the current row
#         for review_dict in row['review_data']:
#             user_id = review_dict.get('userId')  # Extract userId from the dictionary
#             if user_id:  # Check if userId exists
#                 user_ids.add(int(user_id))  # Add user ID to the set
#     return list(user_ids)

# def extract_unique_movie_ids(train_test_val_set):
#     movie_ids = set(train_test_val_set['movieId'].unique())
#     return list(movie_ids)

For looking at arrays if they are the same

In [None]:
# all_recommendations_train
# all_recommendations_val
# centered_user_item_matrix_train # variable for centered user item matrix train
# all_predicted_centered_ratings_train # variable for predicted centered user item matrix train
# centered_user_item_matrix_val # variable for centered user item matrix val
# all_predicted_centered_ratings_val # variable for predicted centered user item matrix val
# print('All arrays are in the same format, meaning they are appropriately prepped for model evaluation')