In [108]:
import pandas as pd
import numpy as np

In [109]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to make jupyter print all outputs, not just the last one
from IPython.core.display import HTML # to pretty print pandas df and be able to copy them over (e.g. to ppt slides)

In [110]:
netflix_df = pd.read_parquet('cleaned/netflix_parquet')

In [111]:
df = netflix_df.sample(n=500, random_state=42)

In [112]:
df = df[['movieId','review_data']]

In [113]:
review_data = df['review_data'].values
user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(df['movieId'], review_data)])
len(user_ids)
len(np.unique(movieIds))

2458112

500

In [114]:
df.head()
df['review_data'].iloc[0][1]

Unnamed: 0,movieId,review_data
1098,1099,"[{'date': 2003-03-31, 'rating': 3.0, 'userId':..."
572,573,"[{'date': 2004-09-22, 'rating': 5.0, 'userId':..."
450,451,"[{'date': 2005-08-16, 'rating': 5.0, 'userId':..."
383,384,"[{'date': 2004-07-22, 'rating': 4.0, 'userId':..."
1491,1492,"[{'date': 2005-04-06, 'rating': 3.0, 'userId':..."


{'date': datetime.date(2004, 7, 23), 'rating': 5.0, 'userId': '2292389'}

In [115]:
def create_user_item_matrix(train_test_val_set):
    """
    Creates a user-item matrix from the provided dataset containing review data.

    Parameters:
    train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
                                    which is a list of dictionaries with keys 'userId', 'rating',
                                    and 'movieId'.

    Returns:
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies), the matrix is an NumPy array which contains lists of user-item interactions, meaning a user and their corresponding ratings to the movieIds.    
    
    user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
    
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
    
    user_ids (numpy.ndarray): Array containing user IDs corresponding to each rating in the matrix.
    
    movie_ids (numpy.ndarray): Array containing movie IDs corresponding to each rating in the matrix.

    """
    review_data = train_test_val_set['review_data'].values
    user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
    ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
    movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

    # create dictionaries to map user IDs and movie IDs to unique indices to map over
    user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
    movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

    # initialize an empty user-item matrix
    user_count = len(user_id_dict)
    movie_count = len(movie_id_dict)
    user_item_matrix = np.zeros((user_count, movie_count))

    # populate the user-item matrix with ratings from netflix dataset
    for i, (user_id, movie_id, rating) in enumerate(zip(user_ids, movieIds, ratings)):
        user_index = user_id_dict[user_id]
        movie_index = movie_id_dict[movie_id]
        user_item_matrix[user_index, movie_index] = rating

    return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds

def center_data(user_item_matrix):
    """
    Creates a centered matrix of the previously created user-item matrix

    Parameters:
    User-item matrix which is made a Numpy array with appended lists with ratings of each users of each item. Each position in each list corresponds to the same movieId. Datatype within the matrix is float64. Each NaN value is converted to 0. In other words, for the time being the implicit feedback is converted to 0.

    Return:
    A centered user item matrix, where the row mean of each user is subtracted from the initial ratings, to account for variations in ratings
    
    """
    # Check for NaN values and replace them with 0
    user_item_matrix[np.isnan(user_item_matrix)] = 0
    
    # Compute user means
    user_means = np.mean(user_item_matrix, axis=1)
    
    # Center the data
    centered_user_item_matrix = user_item_matrix - user_means[:, np.newaxis]
    
    return centered_user_item_matrix, user_means

# I will decompose the user item matrix in this function using numpy
def apply_svd(centered_user_item_matrix, num_latent_factors):
    """
    Applies Singular Value Decomposition (SVD) to decompose the centered user-item matrix into three matrices:
    U, Sigma, and Vt.

    U: user matrix with values which represent the relation between the chosen latent factors, Users are the rows, matrix is orthonormal to Vt
    Sigma: diagonal matrix where the chosen latent factors are in the diagonal line, ordered descendingly. 
    Vt: Item matrix with values which represent the relation between the chosen latent factors, Items are the columns, matrix is orthonormal to U

    Parameters:
    centered_user_item_matrix (numpy.ndarray): Centered user-item matrix to be decomposed.
    num_latent_factors (int): Number of latent factors to retain in the decomposition.

    Returns:
    U (numpy.ndarray): Matrix representing the relationship between users and latent factors.
    Sigma (numpy.ndarray): Diagonal matrix containing the singular values, representing the importance of each latent factor.
    Vt (numpy.ndarray): Transpose of the matrix representing the relationship between items and latent factors.

    """
    # U, sigma and Vt are created using the svd function from numpy
    U, Sigma, Vt = np.linalg.svd(centered_user_item_matrix, full_matrices=False)
    # set up sigma, which is the diagonal matrix from the decomposition
    Sigma = np.diag(Sigma[:num_latent_factors])
    # set up U and Vt which have to orthonormal to each other to ensure U represents each user and Vt represents each item, otherwise the total matrix would not add up.
    U = U[:, :num_latent_factors]
    Vt = Vt[:num_latent_factors, :]
    return U, Sigma, Vt

In [116]:
user_item_matrix, user_id_dict, movie_id_dict, user_ids, movie_ids = create_user_item_matrix(df)

# get unique movieIds, use set to ensure unique values and put ids in a list
user_ids = list(set(user_ids))
item_ids = list(set(movie_ids))

# unpack the tuple returned by center_data function to get an updates user item matrix which is more robust to variations in rating
centered_user_item_matrix, user_means = center_data(user_item_matrix)

num_latent_factors = 2

# apply SVD using the centered matrix to reduce memory usage and to decompose the matrix to be able to make recommendations using the dot product method
U, Sigma, Vt = apply_svd(centered_user_item_matrix, num_latent_factors)

In [117]:
def cosine_similarity(vector_a, vector_b):
    """
    Computes the cosine similarity between two vectors.

    Parameters:
    vector_a (numpy.ndarray): First vector.
    vector_b (numpy.ndarray): Second vector.

    Returns:
    similarity (float): Cosine similarity score between the two vectors.
    """
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

def predict_rating(user_vector, movie_vector, user_mean):
    """
    Predicts a rating for a given user and movie.

    Parameters:
    user_vector (numpy.ndarray): Vector representing the user.
    movie_vector (numpy.ndarray): Vector representing the movie.
    user_mean (float): Mean rating of the user.

    Returns:
    predicted_rating (float): Predicted rating for the user-movie pair.
    """
    predicted_rating = user_mean + np.dot(user_vector, movie_vector)
    return predicted_rating

def compute_recommendations(user_id, user_item_matrix, user_id_dict, movie_id_dict, U, Sigma, Vt, user_means, num_recommendations):
    """
    Computes recommendations for a given user.

    Parameters:
    user_id (int): ID of the user for whom to compute recommendations.
    user_item_matrix (numpy.ndarray): Matrix representing users' ratings for items (movies).
    user_id_dict (dict): Dictionary mapping user IDs to unique indices.
    movie_id_dict (dict): Dictionary mapping movie IDs to unique indices.
    U (numpy.ndarray): Matrix representing the relationship between users and latent factors.
    Sigma (numpy.ndarray): Diagonal matrix containing the singular values.
    Vt (numpy.ndarray): Transpose of the matrix representing the relationship between items and latent factors.
    user_means (numpy.ndarray): Array containing mean ratings for each user.
    num_recommendations (int): Number of recommendations to generate.

    Returns:
    recommendations (list): List of tuples containing recommended movie IDs and predicted ratings.
    """
    user_index = user_id_dict[user_id]
    user_vector = U[user_index]
    user_mean = user_means[user_index]

    # Initialize recommendations list
    recommendations = []

    # Iterate over all movies
    for movie_id in movie_id_dict:
        movie_index = movie_id_dict[movie_id]
        if user_item_matrix[user_index, movie_index] == 0:  # Only predict for unrated movies
            movie_vector = Vt[:, movie_index]
            predicted_rating = predict_rating(user_vector, movie_vector, user_mean)
            recommendations.append((movie_id, predicted_rating))

    # Sort recommendations by predicted rating (descending order)
    recommendations.sort(key=lambda x: x[1], reverse=True)

    # Get top recommendations
    top_recommendations = recommendations[:num_recommendations]
    return top_recommendations

In [118]:
# Example user ID
example_user_id = user_ids[0]

# Number of recommendations to generate
num_recommendations = 5

# Compute recommendations for the example user
recommendations = compute_recommendations(example_user_id, user_item_matrix, user_id_dict, movie_id_dict, U, Sigma, Vt, user_means, num_recommendations)

# Display top 5 recommendations
print("Top 5 Recommendations for User", example_user_id)
for i, (movie_id, _) in enumerate(recommendations, start=1):
    print(f"{i}. Movie ID: {movie_id}")

Top 5 Recommendations for User 2378815
1. Movie ID: 985
2. Movie ID: 483
3. Movie ID: 241
4. Movie ID: 1650
5. Movie ID: 1110


### Some basic statistics about the user-item matrix:

Also to check whether the creation of user-item matrix went well.

In [119]:
# # Initialize a set to store unique values
# unique_values = set()

# # Iterate over each row in the sparse matrix
# for row in range(user_item_matrix.shape[0]):
#     # Get the indices and data for the non-zero elements in the current row
#     indices = user_item_matrix.indices[user_item_matrix.indptr[row]:user_item_matrix.indptr[row+1]]
#     data = user_item_matrix.data[user_item_matrix.indptr[row]:user_item_matrix.indptr[row+1]]
    
#     # Update the set with unique values in the current row
#     unique_values.update(data)

# # Convert the set to a list for easier inspection
# unique_values = list(unique_values)

# # Look into results for checking
# print(f"Unique values in the user_item_matrix: {unique_values}")
# print(f"There are {user_item_matrix.shape[0]} users in this matrix")
# print(f"There are {user_item_matrix.shape[1]} items (movies) in this matrix")

### OLD:

In [120]:
# import scipy.sparse as sp

# def create_user_item_matrix(train_test_val_set):
#     """
#     Creates a user-item matrix from the provided dataset containing review data.

#     Parameters:
#     train_test_val_set (DataFrame): DataFrame containing review data with columns 'review_data',
#                                     which is a list of dictionaries with keys 'userId', 'rating',
#                                     and 'movieId'.

#     Returns:
#     user_item_matrix (scipy.sparse.csr_matrix): Sparse matrix representing users' ratings for items (movies).
#     user_id_dict (dict): Dictionary mapping user IDs to unique indices in the user-item matrix.
#     movie_id_dict (dict): Dictionary mapping movie IDs to unique indices in the user-item matrix.
#     """

#     review_data = train_test_val_set['review_data'].values
#     user_ids = np.concatenate([np.array([entry['userId'] for entry in row]) for row in review_data])
#     ratings = np.concatenate([np.array([entry['rating'] for entry in row]) for row in review_data])
#     movieIds = np.concatenate([[movieId] * len(row) for movieId, row in zip(train_test_val_set['movieId'], review_data)])

#     # Create dictionaries to map user IDs and movie IDs to unique indices
#     user_id_dict = {user_id: index for index, user_id in enumerate(np.unique(user_ids))}
#     movie_id_dict = {movie_id: index for index, movie_id in enumerate(np.unique(movieIds))}

#     # Initialize lists to store row, column, and data for the sparse matrix
#     row_indices = [user_id_dict[user_id] for user_id in user_ids]
#     col_indices = [movie_id_dict[movie_id] for movie_id in movieIds]
#     data = ratings

#     # Create the sparse user-item matrix
#     user_item_matrix = sp.csr_matrix((data, (row_indices, col_indices)), shape=(len(user_id_dict), len(movie_id_dict)))

#     return user_item_matrix, user_id_dict, movie_id_dict, user_ids, movieIds, ratings

In [121]:
# class UserKNNRecommender:
#     """
#     User-based Collaborative Filtering Recommender System.

#     Parameters:
#     - k (int): Number of similar users to consider for prediction (default is 5).
#     - similarity_measure (str): Similarity measure to use (default is 'cosine').

#     Methods:
#     - fit(user_item_matrix): Fit the recommender system to the user-item matrix.
#     - predict_ratings(user_id): Predict ratings for items for a given user.
#     - recommend_items(user_id, top_n): Recommend top items for a given user.
#     """

#     def __init__(self, k=5, similarity_measure='cosine'):
#         self.k = k
#         self.similarity_measure = similarity_measure

#     def fit(self, user_item_matrix):
#         """
#         Fit the recommender system to the user-item matrix.

#         Parameters:
#         - user_item_matrix (scipy.sparse.csr_matrix): User-item matrix representing user ratings for items.
#         """
#         self.user_item_matrix = user_item_matrix
#         if self.similarity_measure == 'cosine':
#             self.similarity_matrix = self.cosine_similarity(user_item_matrix)

#     def cosine_similarity(self, matrix):
#         """
#         Compute cosine similarity between rows of a matrix.

#         Parameters:
#         - matrix (scipy.sparse.csr_matrix): Input matrix.

#         Returns:
#         - similarity_matrix (numpy.ndarray): Cosine similarity matrix.
#         """
#         norm_matrix = sp.linalg.norm(matrix, axis=1)
#         normalized_matrix = matrix / norm_matrix[:, np.newaxis]
#         similarity_matrix = normalized_matrix.dot(normalized_matrix.T)
#         similarity_matrix.setdiag(0)  # Set diagonal to 0 to avoid self-similarity
#         return similarity_matrix

#     def predict_ratings(self, user_id):
#         """
#         Predict ratings for items for a given user. This is the distinguishable step from ItemKNN, where the similarity scores are retrieved per user instaed of per Item (movies).

#         Parameters:
#         - user_id (int): ID of the user for whom ratings are to be predicted.

#         Returns:
#         - predicted_ratings (numpy.ndarray): Predicted ratings for items.
#         """
#         user_index = user_id_dict[user_id]
#         similarity_scores = self.similarity_matrix[user_index]
#         top_similar_users_indices = np.argsort(similarity_scores)[::-1][:self.k]  # Indices of k most similar users
#         similar_users_ratings = self.user_item_matrix[top_similar_users_indices]
#         predicted_ratings = np.mean(similar_users_ratings, axis=0)  # Average ratings of similar users
#         return predicted_ratings

#     def recommend_items(self, user_id, top_n=5):
#         """
#         Recommend top items for a given user. Within this function, the user similarities are also used, meaning this is also a distinguishable step for UserKNN.

#         Parameters:
#         - user_id (int): ID of the user for whom items are to be recommended.
#         - top_n (int): Number of items to recommend (default is 5).

#         Returns:
#         - recommended_items (list): List of recommended item IDs.
#         """
#         user_index = user_id_dict[user_id]
#         predicted_ratings = self.predict_ratings(user_id)
#         # Find top_n items with highest predicted ratings
#         top_indices = np.argsort(predicted_ratings)[::-1][:top_n]
#         recommended_items = [movie_id for movie_id, index in movie_id_dict.items() if index in top_indices]
#         return recommended_items[:top_n]

In [122]:
# # Create user-item matrix and associated dictionaries
# user_item_matrix, user_id_dict, movie_id_dict, _, _, _ = create_user_item_matrix(df)

# # Randomly select a user ID from the user_id_dict
# user_id = list(user_id_dict.keys())[0]

# # Instantiate the UserKNNRecommender
# user_knn_recommender = UserKNNRecommender(k=2)

# # Fit the recommender to the user-item matrix
# user_knn_recommender.fit(user_item_matrix)

# # Predict ratings for the user
# predicted_ratings = user_knn_recommender.predict_ratings(user_id)

# # Recommend top items for the user
# recommended_items = user_knn_recommender.recommend_items(user_id)

# # Display the predicted ratings and recommended items
# print("Recommended items:", recommended_items)