In [244]:
import numpy as np
import pandas as pd

In [245]:
from IPython.core.interactiveshell import InteractiveShell
# Make Jupyter display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# HTML is imported to pretty print pandas DataFrames and copy them over (e.g. to ppt slides).
# NOTE(review): HTML is not used anywhere in the visible part of this notebook - confirm it is still needed.
from IPython.core.display import HTML

In [246]:
# Load the cleaned Netflix data from parquet; the path is relative to the notebook's working directory.
df = pd.read_parquet('cleaned/netflix_parquet')

To start with, I am going to make multiple random samples using random sampling without replacement (the default behaviour of `DataFrame.sample`):

*only samples from EDA which were able to catch differences during ANOVA are selected, and which showed similar distribution to the complete dataset*

In [247]:
# Draw four random subsets of df (a third, a quarter, a sixth and a tenth of the rows).
# Every draw uses the same fixed seed, so re-running the cell reproduces the same samples.
sample_third, sample_quarter, sample_sixth, sample_tenth = (
    df.sample(frac=1 / divisor, random_state=42) for divisor in (3, 4, 6, 10)
)

### Feature engineering:

Year and title will be dropped:

In [248]:
# Build the working frame from the 10% sample, leaving out the 'year' and 'title' columns.
netflix_df = sample_tenth.drop(columns=['year', 'title'])
netflix_df

Unnamed: 0,movieId,review_data
123,124,"[{'date': 2002-04-01, 'rating': 3.0, 'userId':..."
1193,13434,"[{'date': 2003-02-20, 'rating': 2.0, 'userId':..."
462,4601,"[{'date': 2003-12-27, 'rating': 3.0, 'userId':..."
351,352,"[{'date': 2003-10-13, 'rating': 4.0, 'userId':..."
1058,9560,"[{'date': 2003-07-06, 'rating': 4.0, 'userId':..."
...,...,...
367,4506,"[{'date': 2005-07-06, 'rating': 4.0, 'userId':..."
1415,13656,"[{'date': 2003-11-21, 'rating': 2.0, 'userId':..."
768,9270,"[{'date': 2002-09-26, 'rating': 4.0, 'userId':..."
1218,13459,"[{'date': 2000-07-18, 'rating': 2.0, 'userId':..."


#### Let's work with movies and reviews first, add other features later:

Only rating and userId of dictionary will be kept to accomplish this.

In [249]:
def _keep_user_and_rating(reviews):
    """Reduce every review dict to its 'userId' and 'rating' entries; a None input stays None."""
    if reviews is None:
        return None
    slimmed = []
    for review in reviews:
        # reviews missing either key are dropped entirely
        if 'userId' in review and 'rating' in review:
            slimmed.append({'userId': review['userId'], 'rating': review['rating']})
    return slimmed


netflix_df['review_data'] = netflix_df['review_data'].apply(_keep_user_and_rating)

Then, let's split our data into train, validation and test sets where we ensure that no training data flows into test and validation sets:

In [250]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Shuffles the rows of `data` with a fixed seed and cuts the shuffled frame into three
    consecutive, non-overlapping row ranges: training, validation and test.

    Because the three ranges are disjoint positions of one shuffled frame, no input row
    ends up in more than one of the returned sets.

    Parameters:
    - data: pandas DataFrame containing the data to be split. It is not modified.
    - train_ratio: float, share of the rows assigned to the training set (default: 0.8).
      The row count is int(train_ratio * len(data)), i.e. rounded down.
    - val_ratio: float, share of the rows assigned to the validation set (default: 0.1).
      The row count is int(val_ratio * len(data)), i.e. rounded down.
    - test_ratio: float, kept for interface compatibility (default: 0.1). It is NOT used
      to size the test set: the test set always receives every row left over after the
      training and validation rows have been taken.

    Returns:
    - train_data: pandas DataFrame, training set with a fresh 0..n-1 index.
    - val_data: pandas DataFrame, validation set with a fresh 0..n-1 index.
    - test_data: pandas DataFrame, test set with a fresh 0..n-1 index.
    """
    # Shuffle the data; random_state=42 makes the split reproducible
    data_shuffled = data.sample(frac=1, random_state=42)

    # Calculate the boundaries of each set (the test set is simply the remainder)
    num_samples = len(data_shuffled)
    num_train = int(train_ratio * num_samples)
    num_val = int(val_ratio * num_samples)
    val_end = num_train + num_val

    # Positional slices of the shuffled frame; reset_index returns new frames, so
    # nothing is mutated in place on a slice of data_shuffled.
    train_data = data_shuffled.iloc[:num_train].reset_index(drop=True)
    val_data = data_shuffled.iloc[num_train:val_end].reset_index(drop=True)
    test_data = data_shuffled.iloc[val_end:].reset_index(drop=True)

    return train_data, val_data, test_data

Let's split the data accordingly:

In [251]:
# Split the 10% sample (netflix_df) into train/validation/test frames using the default ratios (0.8 / 0.1 / remainder).
train_data, val_data, test_data = train_val_test_split(netflix_df)

Subsequently, let's define some functions to make our life easier and to keep the code compatible with more datasets. We gather the unique item and user ids and create a user-item matrix, which is then centered. After that we perform SVD and make recommendations using the dot product between the decomposed matrices resulting from SVD:

In [252]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a dense user-item rating matrix from the provided dataset.

    Parameters:
    train_test_val_set (DataFrame): DataFrame with a 'movieId' column and a 'review_data'
                                    column. Each 'review_data' cell is a list of dictionaries
                                    with the keys 'userId' and 'rating' (the reviews of that
                                    row's movie). A cell that is None is treated as a movie
                                    without reviews.

    Returns:
    user_item_matrix (numpy.ndarray): 2-D float array of shape (number of unique users,
                                      number of unique reviewed movies). Entry [u, m] holds the
                                      rating of user u for movie m and 0 where no rating exists.

    user_id_dict (dict): Dictionary mapping user IDs to their row index in the matrix.

    movie_id_dict (dict): Dictionary mapping movie IDs to their column index in the matrix.

    user_ids (numpy.ndarray): Flat array with one user ID per individual rating (contains duplicates).

    movie_ids (numpy.ndarray): Flat array with one movie ID per individual rating, aligned with user_ids.

    """
    # Treat a missing review list as "no reviews" so one empty row cannot crash the whole build.
    review_rows = [[] if row is None else row for row in train_test_val_set['review_data'].values]
    reviews_per_movie = np.array([len(row) for row in review_rows], dtype=int)

    # One entry per individual rating, in row order
    user_ids = np.array([entry['userId'] for row in review_rows for entry in row])
    ratings = np.array([entry['rating'] for row in review_rows for entry in row])
    # Movie IDs come from the frame that was passed in (previously the global train_data was
    # read here, so validation/test matrices were labelled with training movie IDs).
    movieIds = np.repeat(train_test_val_set['movieId'].values, reviews_per_movie)

    # np.unique returns the sorted unique IDs plus, for every rating, the position of its ID
    # in that sorted array - which is exactly the row/column index in the matrix.
    unique_users, user_positions = np.unique(user_ids, return_inverse=True)
    unique_movies, movie_positions = np.unique(movieIds, return_inverse=True)

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(unique_users)}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(unique_movies)}

    # initialize an empty user-item matrix and fill in every rating in one indexed assignment
    # (if a user rated the same movie more than once, the last rating is the one that is kept)
    user_item_matrix = np.zeros((len(user_id_dict), len(movie_id_dict)))
    user_item_matrix[user_positions, movie_positions] = ratings

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

# Centering removes each user's average level from their row before the matrix is decomposed
def center_data(user_item_matrix):
    """
    Subtracts every row's mean from that row of the user-item matrix.

    Parameters:
    user_item_matrix: 2-D array with one row per user and one column per item.

    Return:
    centered_user_item_matrix: the input with the row mean subtracted from every entry of that row.
    user_means: 1-D array holding the mean of each row. The mean is taken over all columns
                of the row, so zero entries count towards it as well.

    """
    row_means = np.mean(user_item_matrix, axis=1)
    # reshape to a column so the subtraction broadcasts across each row
    shifted = user_item_matrix - row_means.reshape(-1, 1)
    return shifted, row_means

# Truncated SVD of the user-item matrix, computed with numpy
def apply_svd(centered_user_item_matrix, num_latent_factors):
    """
    Decomposes the centered user-item matrix with numpy's reduced SVD and keeps only the
    leading `num_latent_factors` components.

    Parameters:
    centered_user_item_matrix (numpy.ndarray): Centered user-item matrix to be decomposed.
    num_latent_factors (int): Number of leading singular values/vectors to keep.

    Returns:
    U (numpy.ndarray): The first `num_latent_factors` columns of the left singular vectors (one row per user).
    Sigma (numpy.ndarray): Square diagonal matrix holding the first `num_latent_factors` singular values.
    Vt (numpy.ndarray): The first `num_latent_factors` rows of the transposed right singular vectors (one column per item).

    """
    left_vectors, singular_values, right_vectors_t = np.linalg.svd(
        centered_user_item_matrix, full_matrices=False
    )
    # one slice object selects the leading components of all three factors
    leading = slice(None, num_latent_factors)
    return (
        left_vectors[:, leading],
        np.diag(singular_values[leading]),
        right_vectors_t[leading, :],
    )

# I will compute recommendations by the dotproduct of the decomposed matrices from svd in this function
def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations, user_item_matrix, movie_id_dict=None):
    """
    Predicts a rating for every user/item pair from the truncated SVD factors and selects,
    per user, the highest-scoring items that user has not rated yet.

    For the user in matrix row r the prediction is U[r] @ (Sigma @ Vt) + user_means[r].
    Items with a positive entry in user_item_matrix[r] are then set to -inf so they sort last.

    Parameters:
    U (numpy.ndarray): User factors, one row per user.
    Sigma (numpy.ndarray): Diagonal matrix of the retained singular values.
    Vt (numpy.ndarray): Item factors, one column per item.
    user_means (numpy.ndarray): Array containing the mean rating of each user (added back to the prediction).
    user_ids (sequence): User IDs. The position of an ID in this sequence is used as that user's
                         row index in U, user_means and user_item_matrix, so the order must match
                         the matrix rows. If an ID occurs more than once, its last position is used.
    num_recommendations (int): Number of recommendations to generate for each user.
    user_item_matrix (numpy.ndarray): User-item interactions; rows correspond to users, columns to items.
    movie_id_dict (dict, optional): Mapping movie ID -> column index, as returned by
                                    create_user_item_matrix. When given, the recommendations contain
                                    the real movie IDs. When omitted, the recommendations are 1-based
                                    column positions (column index + 1), which is the legacy behaviour.

    Returns:
    all_recommendations (dict): Dictionary mapping user IDs to arrays of top recommended items.
    all_predicted_centered_ratings (numpy.ndarray): Float array with the same shape as user_item_matrix.
                                                    Each row holds the predictions for one user with the
                                                    user mean added back; items the user already rated
                                                    hold -inf. Rows of users that are not in user_ids stay 0.
    """
    user_item_matrix = np.asarray(user_item_matrix)
    all_recommendations = {}
    # Force a float buffer so the -inf mask can be stored even when the input matrix holds integers.
    all_predicted_centered_ratings = np.zeros_like(user_item_matrix, dtype=float)

    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}

    # Sigma @ Vt is the same for every user, so it is built once instead of once per iteration.
    item_factors = np.dot(Sigma, Vt)

    # Reverse lookup column index -> movie ID, only needed when real IDs were requested.
    index_to_movie_id = None
    if movie_id_dict is not None:
        index_to_movie_id = {index: movie_id for movie_id, index in movie_id_dict.items()}

    for user_id in user_ids:
        user_index = user_id_to_index[user_id]
        # Predicted ratings of this user for every item, shifted back by the user's mean
        user_ratings = np.dot(U[user_index, :], item_factors) + user_means[user_index]
        # Mask out items the user has already interacted with
        user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
        # Column indices of the best predictions, highest first
        top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
        # Store the (masked) predictions for the current user
        all_predicted_centered_ratings[user_index, :] = user_ratings
        if index_to_movie_id is None:
            top_items = top_indices + 1
        else:
            top_items = np.array([index_to_movie_id[index] for index in top_indices])
        # Store the top recommended items for the current user
        all_recommendations[user_id] = top_items

    return all_recommendations, all_predicted_centered_ratings

Before parameter tuning, I will run the recommender system for the train and validation set and record some baseline performance. Root Mean Squared Error (RMSE) will be used as performance metric. 

- Reason behind this is the corresponding original and predicted centered ratings from the train_data and val_data will be used for measuring performance. A form of squared mean error is appropriate for such cases. Recall and precision revolve around ratings which are relevant to the user or not, which is difficult and subjective to identify within this model. 
  
- Furthermore, RMSE is expressed in the same units as the input data, making it easy to interpret for users as well as for stakeholders.

In [253]:
# Baseline number of latent factors: how many singular values/vectors apply_svd keeps.
num_latent_factors = 4

# Baseline number of items recommended to each user.
num_recommendations = 4

Baseline train_data

In [254]:
user_item_matrix_train, user_id_dict_train, movie_id_dict_train, user_ids_train, movie_ids_train = create_user_item_matrix(train_data)
# Unique IDs in matrix order: the dictionaries are built from enumerate(np.unique(...)), so their
# key order equals the row/column order of the matrix. compute_recommendations_for_all_users uses
# the position of each user ID in this list as the row index, so an arbitrarily ordered
# list(set(...)) would pair users with the wrong rows.
user_ids_train = list(user_id_dict_train)
item_ids_train = list(movie_id_dict_train)

In [255]:
# center_data returns a tuple: the row-centered matrix and the per-user means that were subtracted
centered_user_item_matrix_train, user_means_train = center_data(user_item_matrix_train)

# decompose the centered matrix and keep only the first num_latent_factors singular values/vectors
U_TRAIN, Sigma_train, Vt_train = apply_svd(centered_user_item_matrix_train, num_latent_factors)

# compute the recommendations and the full matrix of predictions (user mean added back, already-rated items set to -inf)
all_recommendations_train, all_predicted_centered_ratings_train = compute_recommendations_for_all_users(U_TRAIN, Sigma_train, Vt_train, user_means_train, user_ids_train, num_recommendations, user_item_matrix_train)

Baseline val_data

In [256]:
user_item_matrix_val, user_id_dict_val, movie_id_dict_val, user_ids_val, movie_ids_val = create_user_item_matrix(val_data)
# Unique IDs in matrix order: the dictionaries are built from enumerate(np.unique(...)), so their
# key order equals the row/column order of the matrix. compute_recommendations_for_all_users uses
# the position of each user ID in this list as the row index, so an arbitrarily ordered
# list(set(...)) would pair users with the wrong rows.
user_ids_val = list(user_id_dict_val)
item_ids_val = list(movie_id_dict_val)

In [257]:
# center_data returns a tuple: the row-centered matrix and the per-user means that were subtracted
centered_user_item_matrix_val, user_means_val = center_data(user_item_matrix_val)

# decompose the centered validation matrix on its own and keep only the first num_latent_factors singular values/vectors
# NOTE(review): this fits a separate SVD on the validation data instead of reusing the training factors - confirm this is intended
U_VAL, Sigma_val, Vt_val = apply_svd(centered_user_item_matrix_val, num_latent_factors)

# compute the recommendations and the full matrix of predictions (user mean added back, already-rated items set to -inf)
all_recommendations_val, all_predicted_centered_ratings_val = compute_recommendations_for_all_users(U_VAL, Sigma_val, Vt_val, user_means_val, user_ids_val, num_recommendations, user_item_matrix_val)

In [325]:
# Display the four arrays that go into the evaluation (each bare expression is shown because
# ast_node_interactivity is set to "all").
# NOTE(review): the centered arrays have the user mean subtracted, while the predicted arrays have
# the user mean added back and hold -inf for already-rated items - confirm they are comparable.
centered_user_item_matrix_train # centered user-item matrix built from train_data
all_predicted_centered_ratings_train # predictions for the train matrix
centered_user_item_matrix_val # centered user-item matrix built from val_data
all_predicted_centered_ratings_val # predictions for the validation matrix
print('All arrays are in the same format, meaning they are appropriately prepped for model evaluation')

array([[ 3.87719298, -0.12280702, -0.12280702, ..., -0.12280702,
        -0.12280702, -0.12280702],
       [ 3.92105263, -0.07894737, -0.07894737, ..., -0.07894737,
        -0.07894737, -0.07894737],
       [ 4.95614035, -0.04385965, -0.04385965, ..., -0.04385965,
        -0.04385965, -0.04385965],
       ...,
       [-0.03508772, -0.03508772, -0.03508772, ..., -0.03508772,
        -0.03508772, -0.03508772],
       [-0.04385965, -0.04385965, -0.04385965, ..., -0.04385965,
        -0.04385965, -0.04385965],
       [ 2.97368421, -0.02631579, -0.02631579, ..., -0.02631579,
        -0.02631579, -0.02631579]])

array([[       -inf,  0.1817749 ,  0.06512649, ...,  0.03168683,
         0.014135  ,  0.47402227],
       [       -inf, -0.05485963, -0.02833147, ..., -0.0061352 ,
        -0.02866221,  0.28722274],
       [       -inf,  0.08364216,  0.01483195, ..., -0.002823  ,
        -0.01259906,  0.09270115],
       ...,
       [ 0.05052406, -0.09741868, -0.03215763, ..., -0.00310144,
        -0.01486637,  0.17044946],
       [ 0.21805977,  0.21440508,  0.01588955, ..., -0.03242487,
        -0.03262818,  0.1972004 ],
       [       -inf,  0.0501853 ,  0.00889917, ..., -0.0016938 ,
        -0.00755943,  0.05562069]])

array([[-0.21428571,  2.78571429, -0.21428571, ..., -0.21428571,
        -0.21428571, -0.21428571],
       [-0.21428571,  2.78571429, -0.21428571, ..., -0.21428571,
        -0.21428571, -0.21428571],
       [-0.28571429,  3.71428571, -0.28571429, ..., -0.28571429,
        -0.28571429, -0.28571429],
       ...,
       [-0.28571429, -0.28571429, -0.28571429, ..., -0.28571429,
        -0.28571429, -0.28571429],
       [-0.28571429,  3.71428571, -0.28571429, ..., -0.28571429,
        -0.28571429, -0.28571429],
       [-0.35714286,  4.64285714, -0.35714286, ..., -0.35714286,
        -0.35714286, -0.35714286]])

array([[-0.00776757,        -inf, -0.00862977, ...,  0.0139554 ,
         0.00463803, -0.00947849],
       [-0.00776757,        -inf, -0.00862977, ...,  0.0139554 ,
         0.00463803, -0.00947849],
       [-0.01035677,        -inf, -0.01150637, ...,  0.01860719,
         0.00618404, -0.01263799],
       ...,
       [ 0.07378203,  0.00484103,  0.05702355, ..., -0.08011124,
        -0.00482495,  0.04585588],
       [-0.01035677,        -inf, -0.01150637, ...,  0.01860719,
         0.00618404, -0.01263799],
       [-0.01294596,        -inf, -0.01438296, ...,  0.02325899,
         0.00773005, -0.01579749]])

All arrays are in the same format, meaning they are appropriately prepped for model evaluation


### Approach Baseline model performance evaluation

Comparing the predicted matrix of user-item interactions to the original one, for both the training and the validation data, is a common approach to evaluating the performance of a recommender system. The steps are:

Calculate Predicted Ratings: After performing Singular Value Decomposition (SVD) on the training data and reconstructing the predicted matrix, you'll have a matrix of predicted ratings for all users and items. This matrix represents the system's predictions of how users would rate items.

Compare with Original Training Data: Compare the predicted ratings in the reconstructed matrix with the original ratings in the training data. You can use metrics such as Mean Absolute Error (MAE) or Root Mean Squared Error (RMSE) to quantify the differences between the predicted and actual ratings. Lower values indicate better performance.

Compare with Validation Data: Similarly, compare the predicted ratings with the actual ratings in the validation data. This step evaluates how well the recommender system generalizes to unseen data. Again, use MAE or RMSE to assess the differences.

Analyze Differences: Examine the differences between predicted and actual ratings to understand where the recommender system performs well and where it struggles. You can identify cases where the system makes accurate predictions and cases where it makes significant errors.

Iterate and Improve: Based on the analysis, refine your recommender system to improve its performance. This could involve adjusting hyperparameters, using different algorithms, or incorporating additional features into the model.

In [348]:
def compute_mse(original_ratings, predicted_ratings):
    """
    Computes the Mean Squared Error (MSE) between the original ratings and the predicted ratings.
    Take the square root of the result to obtain the RMSE.

    NaN, inf and -inf values in either input are replaced by 0 first. Afterwards only the
    positions where the original value is different from 0 are compared.

    Note: positions are excluded only when the original value is exactly 0. In a centered
    matrix, unrated items hold minus the user mean rather than 0, so they are not excluded.

    Parameters:
    original_ratings (numpy.ndarray): Array containing the original ratings.
    predicted_ratings (numpy.ndarray): Array containing the predicted ratings; must have the same shape.

    Returns:
    float: The MSE value, or nan when no position has a non-zero original rating.

    Raises:
    ValueError: If the two arrays do not have the same shape.

    """
    # Replace nan, inf and -inf values with 0 (nan_to_num works on copies; the inputs are untouched)
    original_ratings = np.nan_to_num(np.asarray(original_ratings, dtype=float), nan=0, posinf=0, neginf=0)
    predicted_ratings = np.nan_to_num(np.asarray(predicted_ratings, dtype=float), nan=0, posinf=0, neginf=0)

    # Arrays of equal size but different shape would otherwise be compared position by position without any error
    if original_ratings.shape != predicted_ratings.shape:
        raise ValueError(
            f"original_ratings and predicted_ratings must have the same shape, "
            f"got {original_ratings.shape} and {predicted_ratings.shape}"
        )

    # Keep only entries with a non-zero original rating
    mask = original_ratings != 0
    if not mask.any():
        # Nothing to compare: return nan directly instead of averaging an empty array (which warns)
        return np.nan

    # Compute the squared differences and average them
    squared_diff = np.square(original_ratings[mask] - predicted_ratings[mask])
    return np.mean(squared_diff)

In [349]:
# Evaluate performance on the training set (compute_mse returns the MSE; take the square root for RMSE)
# NOTE(review): the first argument is the centered matrix, the second holds predictions with the user mean
# added back and -inf (turned into 0 inside compute_mse) for already-rated items - confirm this comparison is intended.
train_mse = compute_mse(centered_user_item_matrix_train, all_predicted_centered_ratings_train)
print("MSE on training set:", train_mse)

# Evaluate performance on the validation set
val_mse = compute_mse(centered_user_item_matrix_val, all_predicted_centered_ratings_val)
print("MSE on validation set:", val_mse)

MSE on training set: 0.3586038276856281
MSE on validation set: 1.2250391032010444


**Baseline findings:** MSE is higher on the validation set (≈1.23) than on the training set (≈0.36), indicating the model overfits to data it has already seen.

### Hyper parameter tuning

Compare performance from centered and uncentered data:

In [350]:
# write code

### Include time feature in matrix:

In [351]:
# train_data2, val_data2, test_data2 = train_val_test_split(df)

In [352]:
# # Extract titles, user IDs, ratings, and dates
# review_data2 = train_data2['review_data'].values
# user_ids2 = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data2])
# ratings2 = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data2])
# dates = np.concatenate([np.array([entry['date'] for entry in row], dtype='datetime64') for row in review_data2])
# movieIds2 = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data2)])

In [353]:
# train_data

In [354]:
# movieIds2
# user_ids2
# ratings
# dates

In [355]:
# # define function to convert datetime64[D] to months to normalize the dates
# def get_month(date):
#     month = (date.astype('datetime64[M]').astype(int) % 12) + 1
#     return month

# # Convert datetime64[D] dates to months
# months = np.array([get_month(date) for date in dates])

In [356]:
# # Create dictionaries to map user IDs and movie IDs to unique indices
# user_id_dict2 = {user_id: index for index, user_id in enumerate(np.unique(user_ids2))}
# movie_id_dict2 = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds2))}

# # Initialize the user-item-time matrix
# user_count2 = len(user_id_dict2)
# movie_count2 = len(movie_id_dict2)
# matrix_3d = np.zeros((user_count2, movie_count2, 2))

# # Populate the matrix with ratings and normalized timestamps
# for user_id, movie_id, rating, month in zip(user_ids2, movieIds2, ratings2, months):
#     user_index = user_id_dict2[user_id]
#     movie_index = movie_id_dict2[movie_id]
#     matrix_3d[user_index, movie_index] = [rating, month]

In [357]:
# unique_values = np.unique(matrix_3d)
# print("Unique values in the user-item-time matrix:", unique_values)
# matrix_3d

In [358]:
# # set is used because it does not allow for duplicates
# user_ids2 = set()

# # iterate over each row
# for index, row in train_data2.iterrows():
#     # iterate over each dictionary in the 'review_data' column of the current row
#     for review_dict in row['review_data']:
#         user_id = review_dict.get('userId')  # Extract userId from the dictionary
#         if user_id:  # Check if userId exists
#             user_ids2.add(int(user_id))  # Add user ID to the set

# user_ids2 = list(user_ids2)

In [359]:
# # put movieids in set so duplicates are not allowed here either
# item_ids2 = list(set(train_data2['movieId'].unique()))

In [360]:
# def center_data_3d(matrix_3d):
#     # Calculate mean along the second axis (movies axis)
#     user_means = np.mean(matrix_3d, axis=(1, 2), keepdims=True)
#     # Subtract the mean from the original matrix
#     centered_user_item_matrix_3d = matrix_3d - user_means
#     return centered_user_item_matrix_3d, user_means

# def apply_svd_3d(centered_user_item_matrix_3d, num_latent_factors):
#     # Reshape the matrix to be 2D for SVD
#     reshaped_matrix = centered_user_item_matrix_3d.reshape(centered_user_item_matrix_3d.shape[0], -1)
#     # Perform SVD
#     U, Sigma, Vt = np.linalg.svd(reshaped_matrix, full_matrices=False)
#     # Keep only the specified number of latent factors
#     U = U[:, :num_latent_factors]
#     Sigma = np.diag(Sigma[:num_latent_factors])
#     Vt = Vt[:num_latent_factors, :]
#     return U, Sigma, Vt

# def compute_recommendations_for_all_users_3d(U, Sigma, Vt, user_means, user_ids2, num_recommendations):
#     all_recommendations = {}
#     user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids2)}
#     for user_id in user_ids2:
#         user_index = user_id_to_index[user_id]
#         # Perform dot product for each user
#         user_ratings = np.dot(U[user_index], np.dot(Sigma, Vt)) + user_means[user_index]
#         # Exclude items already interacted with
#         user_ratings[user_item_matrix[user_index] > 0] = -np.inf
#         # Get indices of top recommendations
#         top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
#         top_items = top_indices + 1
#         all_recommendations[user_id] = top_items
#     return all_recommendations

In [361]:
# # select the Number of Latent Factors
# num_latent_factors = 4 

# # unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
# centered_user_item_matrix_3d, user_means = center_data_3d(matrix_3d)

# # apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
# U, Sigma, Vt = apply_svd_3d(centered_user_item_matrix_3d, num_latent_factors)
# U 
# Sigma
# Vt

# # define number of recommendations per user
# num_recommendations = 4

# # compute the recommendations
# all_recommendations2 = compute_recommendations_for_all_users_3d(U, Sigma, Vt, user_means, user_ids, num_recommendations)

In [362]:
# all_recommendations2

# Redundant but maybe useful for troubleshooting:

In [363]:
# # Extract unique user IDs from the dataset
# dataset_user_ids = set()
# for review_list in train_data['review_data']:
#     for review_dict in review_list:
#         user_id = review_dict.get('userId')
#         if user_id:
#             dataset_user_ids.add(user_id)

# # Check if all user IDs in the matrix are also in the dataset, and vice versa
# user_ids_in_dataset_not_in_matrix = dataset_user_ids - set(user_ids)
# user_ids_in_matrix_not_in_dataset = set(user_ids) - dataset_user_ids
# len(user_ids_in_dataset_not_in_matrix)
# len(user_ids_in_matrix_not_in_dataset)

User item matrix with pandas:

In [364]:
# # extract review dates, user IDs, and ratings using NumPy
# review_data = train_data['review_data'].values
# user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
# ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])

# # Extract movie titles
# movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# # Create a DataFrame with review dates, user IDs, ratings, and movie titles
# review_df = pd.DataFrame({'userId': user_ids, 'rating': ratings, 'movieId': movieIds})

# # Pivot review_df to get user-item matrix with reviews as values
# user_item_matrix_df = review_df.pivot_table(index='userId', columns='movieId', values='rating')

# # Fill NaN values with 0
# user_item_matrix_df = user_item_matrix_df.fillna(0)

# # Convert DataFrame to NumPy array
# user_item_matrix = user_item_matrix_df.to_numpy()

# user_item_matrix

In [365]:
# # extract titles from dataframe, user IDs, and ratings from dictionary using NumPy
# review_data = train_data['review_data'].values
# user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
# ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
# movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_data['movieId'], review_data)])

# # create dictionaries to map user IDs and movie IDs to unique indices to map over
# user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
# movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

# # initialize an empty user-item matrix
# user_count = len(user_id_dict)
# movie_count = len(movie_id_dict)
# user_item_matrix = np.zeros((user_count, movie_count))

# # populate the user-item matrix with ratings from netflix dataset
# for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
#     user_index = user_id_dict[user_id]
#     movie_index = movie_id_dict[movie_id]
#     user_item_matrix[user_index, movie_index] = rating

In [366]:
# # set is used because it does not allow for duplicates
# user_ids = set()

# # iterate over each row
# for index, row in train_data.iterrows():
#     # iterate over each dictionary in the 'review_data' column of the current row
#     for review_dict in row['review_data']:
#         user_id = review_dict.get('userId')  # Extract userId from the dictionary
#         if user_id:  # Check if userId exists
#             user_ids.add(int(user_id))  # Add user ID to the set

# user_ids = list(user_ids)

In [367]:
# # put movieids in set so duplicates are not allowed here either
# item_ids = list(set(train_data['movieId'].unique()))

Function which computes recommendations and returns recommendations only, not the predicted ratings for every item after the svd matrix dot product:

In [368]:
# I will compute recommendations for each user_id in the training/test/validation data by performing the dot product between the previous reviews in the matrix by the reconstruction of the user item matrix with less features
# def compute_recommendations_for_all_users(U, Sigma, Vt, user_means, user_ids, num_recommendations):
#     all_recommendations = {}
#     user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
#     for user_id in user_ids:
#         user_index = user_id_to_index[user_id]
#         # matrix multriplication between Sigma and Vt, to reconstruct an item matrix with less features, followed by the dot product of U and the reconstruction of item matrix. It essentially calculates the predicted ratings for each item for the given user based on their latent given ratings. user_means[user_index] rules out the items the user already interacted with.
#         user_ratings = np.dot(U[user_index, :], np.dot(Sigma, Vt)) + user_means[user_index]
#         user_ratings[user_item_matrix[user_index, :] > 0] = -np.inf
#         top_indices = np.argsort(user_ratings)[::-1][:num_recommendations]
#         top_items = top_indices + 1
#         all_recommendations[user_id] = top_items
#     return all_recommendations

Old functions to extract unique user and item ids

In [369]:
# def extract_unique_user_ids(train_test_val_set):
#     user_ids = set()
#     # iterate over each row
#     for index, row in train_test_val_set.iterrows():
#         # iterate over each dictionary in the 'review_data' column of the current row
#         for review_dict in row['review_data']:
#             user_id = review_dict.get('userId')  # Extract userId from the dictionary
#             if user_id:  # Check if userId exists
#                 user_ids.add(int(user_id))  # Add user ID to the set
#     return list(user_ids)

# def extract_unique_movie_ids(train_test_val_set):
#     movie_ids = set(train_test_val_set['movieId'].unique())
#     return list(movie_ids)