In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras


100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

In [2]:
movies = pd.read_csv('sample_data/movies.csv')
ratings = pd.read_csv('sample_data/ratings.csv')
tags = pd.read_csv('sample_data/tags.csv')
links = pd.read_csv('sample_data/links.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
print("Number of movies:", len(movies))
print("Number of users:", len(ratings['userId'].unique()))

Number of movies: 9742
Number of users: 610


# USING SVD

In [5]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357273 sha256=4d4c07566cab10848b91d684f2251e6955f7ae430f70059b0b68ecd5636e8c97
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succ

Singular Value Decomposition (SVD) is a powerful mathematical technique used in machine learning and data science for matrix factorization. It is primarily used for dimensionality reduction, noise reduction, data compression, and feature extraction, particularly in recommendation systems, image compression, and natural language processing.

In [11]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pandas as pd

# Assuming 'ratings' DataFrame is already loaded with columns ['userId', 'movieId', 'rating']
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()

# Perform cross-validation
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)

# Create a DataFrame to display results
cv_summary = pd.DataFrame({
    'Fold': [f'Fold {i+1}' for i in range(5)],
    'RMSE': cv_results['test_rmse'],
    'MAE': cv_results['test_mae'],
    'Fit Time (s)': cv_results['fit_time'],
    'Test Time (s)': cv_results['test_time']
})

# Add mean and std as summary rows
cv_summary.loc['Mean'] = ['Mean', cv_summary['RMSE'].mean(), cv_summary['MAE'].mean(),
                          cv_summary['Fit Time (s)'].mean(), cv_summary['Test Time (s)'].mean()]
cv_summary.loc['Std'] = ['Std', cv_summary['RMSE'].std(), cv_summary['MAE'].std(),
                         cv_summary['Fit Time (s)'].std(), cv_summary['Test Time (s)'].std()]

# Print the summary DataFrame
print(cv_summary)

# Print average RMSE and MAE
avg_rmse = cv_summary.loc['Mean', 'RMSE']
avg_mae = cv_summary.loc['Mean', 'MAE']

print(f"\nAverage RMSE of all folds: {avg_rmse:.4f}")
print(f"Average MAE of all folds: {avg_mae:.4f}")


        Fold      RMSE       MAE  Fit Time (s)  Test Time (s)
0     Fold 1  0.879147  0.677082      2.458977       0.189503
1     Fold 2  0.874460  0.672815      1.352617       0.254417
2     Fold 3  0.882935  0.675929      1.577595       0.362591
3     Fold 4  0.860414  0.661546      1.312746       0.105639
4     Fold 5  0.873898  0.669514      1.302002       0.103369
Mean    Mean  0.874171  0.671377      1.600787       0.203104
Std      Std  0.007629  0.005576      0.440649       0.097674

Average RMSE of all folds: 0.8742
Average MAE of all folds: 0.6714


In [22]:
def get_user_movie_ratings(user_id, ratings, movies, model):
    """
    Get a DataFrame showing the movies rated and not rated by a user, along with their estimated ratings.

    Parameters:
    - user_id: int, the ID of the user
    - ratings: DataFrame, containing user ratings with columns ['userId', 'movieId', 'rating']
    - movies: DataFrame, containing movie details with columns ['movieId', 'title']
    - model: trained Surprise prediction model

    Returns:
    - DataFrame with columns ['movieId', 'title', 'rating', 'Estimated_rating']
    """
    # Get movies user has rated
    user_rated = ratings[ratings['userId'] == user_id]
    user_rated = pd.merge(user_rated, movies, how='inner', on='movieId')
    user_rated['Estimated_rating'] = user_rated['movieId'].apply(lambda x: model.predict(user_id, x).est)
    user_rated = user_rated[['movieId', 'title', 'rating', 'Estimated_rating']]

    # Get movies user hasn't rated
    user_unrated = movies[~movies['movieId'].isin(user_rated['movieId'])].copy()
    user_unrated['rating'] = None  # Set 'rating' to NaN for unrated movies
    user_unrated['Estimated_rating'] = user_unrated['movieId'].apply(lambda x: model.predict(user_id, x).est)
    user_unrated = user_unrated[['movieId', 'title', 'rating', 'Estimated_rating']]

    # Drop columns that are entirely NA to avoid warnings
    user_rated = user_rated.dropna(axis=1, how='all')
    user_unrated = user_unrated.dropna(axis=1, how='all')

    # Combine rated and unrated movies
    user_final = pd.concat([user_rated, user_unrated], ignore_index=True)

    # Sort the final DataFrame by Estimated_rating in descending order
    user_final.sort_values('Estimated_rating', ascending=False, inplace=True)

    # Format the columns for better readability
    user_final['Estimated_rating'] = user_final['Estimated_rating'].apply(lambda x: f"{x:.2f}")
    if user_final['rating'].dtype != 'object':  # Format 'rating' column if not already a string
        user_final['rating'] = user_final['rating'].apply(lambda x: f"{x:.1f}" if pd.notnull(x) else None)

    # Reset index for a cleaner display
    user_final.reset_index(drop=True, inplace=True)

    return user_final

# Example usage
user_id = 610  # Replace with any user ID as needed
result = get_user_movie_ratings(user_id, ratings, movies, svd)
result


Unnamed: 0,movieId,title,rating,Estimated_rating
0,296,Pulp Fiction (1994),5.0,4.86
1,541,Blade Runner (1982),5.0,4.86
2,2858,American Beauty (1999),3.5,4.81
3,1089,Reservoir Dogs (1992),5.0,4.80
4,912,Casablanca (1942),3.5,4.77
...,...,...,...,...
9737,3646,Big Momma's House (2000),1.0,2.23
9738,3997,Dungeons & Dragons (2000),1.0,2.17
9739,1562,Batman & Robin (1997),,2.17
9740,1556,Speed 2: Cruise Control (1997),,2.03


#The goal of collaborative filtering is to generate two vectors: For each user, a 'parameter vector' that embodies the movie tastes of a user. The dot produxt of the two vectors plus the bias term should produce an estimate of the rating the user might give to the movie.

In [28]:
user_ratings = pd.merge(ratings, movies,how='inner', on='movieId')

user_movie_table = user_ratings.pivot_table(index='movieId', columns='userId', values='rating')

user_movie_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [30]:


# Convert user_movie_table to a NumPy array, replacing NaNs with zeros
Y = np.nan_to_num(user_movie_table.to_numpy())

# Create the R matrix indicating where ratings exist (1 for rated, 0 for not rated)
R = np.zeros(Y.shape)
R[Y > 0] = 1

# Define dimensions
num_movies = R.shape[0]
num_users = R.shape[1]
num_features = 10  # Number of latent features for the model

# Initialize X (movie features) and W (user features) with random values
X = np.random.rand(num_movies, num_features)
W = np.random.rand(num_users, num_features)

# Initialize bias term for users
b = np.zeros((1, num_users))

# Print summary of matrix dimensions with explanations
print(f"Shape of Y (Ratings matrix): {Y.shape} - This matrix contains user ratings for movies.")
print(f"Shape of R (Indicator matrix): {R.shape} - This matrix indicates whether a movie is rated (1) or not (0).")
print(f"Shape of X (Movie features matrix): {X.shape} - This matrix represents the latent features for movies.")
print(f"Shape of W (User features matrix): {W.shape} - This matrix represents the latent features for users.")
print(f"Shape of b (User bias term): {b.shape} - This vector represents the bias term for each user.")
print(f"Number of latent features: {num_features}")
print(f"Number of movies: {num_movies}")
print(f"Number of users: {num_users}")


Shape of Y (Ratings matrix): (9724, 610) - This matrix contains user ratings for movies.
Shape of R (Indicator matrix): (9724, 610) - This matrix indicates whether a movie is rated (1) or not (0).
Shape of X (Movie features matrix): (9724, 10) - This matrix represents the latent features for movies.
Shape of W (User features matrix): (610, 10) - This matrix represents the latent features for users.
Shape of b (User bias term): (1, 610) - This vector represents the bias term for each user.
Number of latent features: 10
Number of movies: 9724
Number of users: 610


#The function cofi_cost_func computes the collaborative filtering objective function

In [31]:
import tensorflow as tf

def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Computes the collaborative filtering cost function.

    Parameters:
    - X: np.ndarray, movie features matrix (num_movies x num_features)
    - W: np.ndarray, user features matrix (num_users x num_features)
    - b: np.ndarray, bias term for users (1 x num_users)
    - Y: np.ndarray, ratings matrix (num_movies x num_users)
    - R: np.ndarray, indicator matrix (num_movies x num_users)
         where R[i, j] = 1 if movie i was rated by user j, 0 otherwise
    - lambda_: float, regularization parameter

    Returns:
    - float, the computed cost function value
    """
    # Calculate the prediction error where ratings exist
    prediction_error = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R

    # Compute the cost with regularization
    cost = 0.5 * tf.reduce_sum(prediction_error ** 2) + (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))

    return cost

# Calculate the cost for lambda = 0 and display it
cost = cofi_cost_func(X, W, b, Y, R, 0).numpy()
print(f"Computed cost for lambda = 0: {cost:.4f}")


Computed cost for lambda = 0: 127107.5684


In [None]:
# import random
# # Initialize user ratings array with zeros
# my_ratings = np.zeros(R.shape[0])
# print("Rate movies from 1 to 5. If you haven't seen the movie, type 'skip'. Type 'stop' to finish rating.\n")

# movie_indices = list(range(R.shape[0]))
# random.shuffle(movie_indices)  # Shuffle indices for random selection

# for i in movie_indices:
#     # Get movie title
#     movie_title = movies.loc[i, "title"]

#     # Ask user for rating
#     user_input = input(f"Rate '{movie_title}': (1-5, or 'skip', 'stop') ").strip().lower()

#     if user_input == 'stop':
#         print("Stopping rating process.")
#         break
#     elif user_input == 'skip':
#         print(f"Skipped '{movie_title}'.")
#         continue
#     else:
#         try:
#             rating = int(user_input)
#             if 1 <= rating <= 5:
#                 my_ratings[i] = rating
#                 print(f"Rated {rating} for '{movie_title}'.")
#             else:
#                 print("Please enter a rating between 1 and 5.")
#         except ValueError:
#             print("Invalid input. Please enter a number between 1 and 5, 'skip', or 'stop'.")

# # Show the final ratings
# print('\nNew user ratings:\n')
# for i in range(len(my_ratings)):
#     if my_ratings[i] > 0:
#         print(f'Rated {my_ratings[i]} for {movies.loc[i,"title"]}')

In [40]:
# Initialize user ratings array with zeros
my_ratings = np.zeros(R.shape[0])

print("Rate movies from 1 to 5. To search for a movie, type part of its title. Type 'stop' to finish rating.\n")

while True:
    # Prompt user for a movie title
    title_input = input("Enter part of the movie title you've watched (or 'stop' to finish): ").strip().lower()

    if title_input == 'stop':
        print("Stopping rating process.")
        break

    # Find movies containing the search term
    matches = movies[movies['title'].str.lower().str.contains(title_input)]

    if matches.empty:
        print("No movies found with that title. Please try again.")
        continue

    # Display matching movies with indices
    print("\nMovies found:")
    for idx, row in matches.iterrows():
        print(f"{idx}: {row['title']}")

    # Ask user to select movie indices to rate
    indices_input = input("\nEnter the indices of the movies you want to rate (comma-separated, or type 'skip' to search again): ").strip()

    if indices_input.lower() == 'skip':
        print("Skipping to search again.\n")
        continue

    try:
        # Parse the indices and check their validity
        indices = [int(x.strip()) for x in indices_input.split(',')]
        if not all(idx in matches.index for idx in indices):
            print("Invalid indices entered. Please try again.")
            continue
    except ValueError:
        print("Invalid input. Please enter valid indices.")
        continue

    # Loop through selected indices and ask for ratings
    for movie_idx in indices:
        user_rating = input(f"Rate '{movies.loc[movie_idx, 'title']}' (1-5): ").strip()

        if user_rating.isdigit() and 1 <= int(user_rating) <= 5:
            my_ratings[movie_idx] = int(user_rating)
            print(f"Rated {int(user_rating)} for '{movies.loc[movie_idx, 'title']}'.\n")
        else:
            print("Please enter a rating between 1 and 5.")

# Show final ratings
print("\nNew user ratings:\n")
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f"Rated {my_ratings[i]} for {movies.loc[i, 'title']}")


Rate movies from 1 to 5. To search for a movie, type part of its title. Type 'stop' to finish rating.

Enter part of the movie title you've watched (or 'stop' to finish): Harry Potter

Movies found:
3574: Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
4076: Harry Potter and the Chamber of Secrets (2002)
5166: Harry Potter and the Prisoner of Azkaban (2004)
6062: Harry Potter and the Goblet of Fire (2005)
6522: Harry Potter and the Order of the Phoenix (2007)
7078: Harry Potter and the Half-Blood Prince (2009)
7465: Harry Potter and the Deathly Hallows: Part 1 (2010)
7644: Harry Potter and the Deathly Hallows: Part 2 (2011)

Enter the indices of the movies you want to rate (comma-separated, or type 'skip' to search again): 3574,4076,5166,6062,6522
Rate 'Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)' (1-5): 5
Rated 5 for 'Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Phil

In [41]:
Y = np.c_[my_ratings, Y]
R = np.c_[my_ratings > 0, R]

Ymean = np.zeros((num_movies, 1))
Ynorm = np.zeros(Y.shape)

for i in range(num_movies):
    idx = (R[i,:] == 1)
    Ymean[i] = np.mean(Y[i, idx])
    Ynorm[i, idx] = Y[i, idx] - Ymean[i]



In [42]:
num_movies, num_users = Y.shape
num_features = 100
tf.random.set_seed(1234)
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [43]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 5672014.7
Training loss at iteration 20: 286257.8
Training loss at iteration 40: 110702.7
Training loss at iteration 60: 54352.1
Training loss at iteration 80: 31120.5
Training loss at iteration 100: 19896.5
Training loss at iteration 120: 13862.0
Training loss at iteration 140: 10391.7
Training loss at iteration 160: 8303.6
Training loss at iteration 180: 7001.1


In [56]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
pm = p + Ymean


In [69]:
# Get predictions for the first user (or change the index as needed)
my_predictions = pm[:, 0]
ix = my_predictions.argsort()[::-1]

# Create a DataFrame for the top 10 recommended movies
top_10_recommendations = movies.loc[ix[:10], ['movieId', 'title']].copy()
top_10_recommendations['Pred_Rating'] = my_predictions[ix[:10]]

# Create a DataFrame for the comparison of original and predicted ratings
rated_movies_indices = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]
comparison_df = movies.loc[rated_movies_indices, ['movieId', 'title']].copy()
comparison_df['Original_Rating'] = my_ratings[rated_movies_indices]
comparison_df['Pred_Rating'] = my_predictions[rated_movies_indices]

# Display the DataFrames
# print("\nTop 10 Recommended Movies with Predicted Ratings:")
# print(top_10_recommendations)

# print("\nComparison of Original and Predicted Ratings for Movies You Rated:")
# print(comparison_df)

# Optionally, return these DataFrames if needed
# return top_10_recommendations, comparison_df


In [70]:
top_10_recommendations

Unnamed: 0,movieId,title,Pred_Rating
6693,58293,"10,000 BC (2008)",5.455829
15,16,Casino (1995),5.362931
6540,54686,"Last Legion, The (2007)",5.360796
520,608,Fargo (1996),5.355065
531,626,"Thin Line Between Love and Hate, A (1996)",5.354192
2992,4007,Wall Street (1987),5.320819
3633,4988,White Water Summer (1987),5.28892
6905,63992,Twilight (2008),5.269089
4131,5943,Maid in Manhattan (2002),5.244815
3979,5612,Trapped (2002),5.243903


In [71]:
comparison_df.head()

Unnamed: 0,movieId,title,Original_Rating,Pred_Rating
960,1261,Evil Dead II (Dead by Dawn) (1987),2.0,2.262502
1067,1387,Jaws (1975),1.0,1.220351
1068,1388,Jaws 2 (1978),2.0,1.923786
1069,1389,Jaws 3-D (1983),1.0,1.221032
1083,1407,Scream (1996),2.0,2.198392


In [76]:
def get_user_recommendations(user_id, pm, movies, user_ratings):
    """
    Get top 10 recommended movies and the user's actual vs. predicted ratings.

    Parameters:
    - user_id: int, the ID of the user
    - pm: np.ndarray, matrix of predicted ratings adjusted by mean (num_movies x num_users)
    - movies: DataFrame, containing movie details with columns ['movieId', 'title', 'genres']
    - user_ratings: DataFrame, containing user ratings with columns ['userId', 'movieId', 'rating', 'title', 'genres']

    Returns:
    - top_10_recommendations: DataFrame with columns ['movieId', 'title', 'Pred_Rating']
    - user_rated_df: DataFrame with columns ['movieId', 'title', 'genres', 'rating', 'Pred_Rating']
    """
    # Get predicted ratings for the specified user
    user_index = user_id - 1  # Adjust for zero-based index
    user_pred = pm[:, user_index]
    ix = user_pred.argsort()[::-1]

    # Create a DataFrame for the top 10 recommended movies for the user
    top_10_recommendations = movies.loc[ix[:10], ['movieId', 'title']].copy()
    top_10_recommendations['Pred_Rating'] = user_pred[ix[:10]]

    # Create a DataFrame for the user's actual ratings
    user_actual_ratings = user_ratings[user_ratings['userId'] == user_id][['userId', 'movieId', 'rating', 'title', 'genres']]

    # Add predicted ratings to the user's actual ratings DataFrame
    user_actual_ratings['Pred_Rating'] = user_actual_ratings['movieId'].apply(
        lambda x: user_pred[movies[movies['movieId'] == x].index[0]]
    )

    # Create a DataFrame with columns for the user's rated movies
    user_rated_df = user_actual_ratings[['movieId', 'title', 'genres', 'rating', 'Pred_Rating']]

    return top_10_recommendations, user_rated_df

# Example usage
user_id = 610  # Replace with any user ID as needed
top_10_recommendations, user_610_df = get_user_recommendations(user_id, pm, movies, user_ratings)

# Display the DataFrames





In [77]:
print("\nTop 10 Recommended Movies for User 610:")
top_10_recommendations


Top 10 Recommended Movies for User 610:


Unnamed: 0,movieId,title,Pred_Rating
4159,5989,Catch Me If You Can (2002),5.905848
5880,33148,King's Ransom (2005),5.765769
6293,47997,Idiocracy (2006),5.597401
7355,78499,Toy Story 3 (2010),5.540721
900,1198,Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),5.379222
295,337,What's Eating Gilbert Grape (1993),5.23813
4631,6902,Interstate 60 (2002),5.175708
257,296,Pulp Fiction (1994),5.143575
2992,4007,Wall Street (1987),5.117555
1938,2570,"Walk on the Moon, A (1999)",5.113916


In [79]:
print("\nMovies Rated by User 610 with Original and Predicted Ratings:")
user_610_df.head(10)


Movies Rated by User 610 with Original and Predicted Ratings:


Unnamed: 0,movieId,title,genres,rating,Pred_Rating
99534,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,2.557197
99535,6,Heat (1995),Action|Crime|Thriller,5.0,2.672583
99536,16,Casino (1995),Crime|Drama,4.5,4.456357
99537,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,4.5,3.641987
99538,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,5.0,4.471366
99539,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.0,4.73011
99540,70,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,4.0,3.107426
99541,95,Broken Arrow (1996),Action|Adventure|Thriller,3.5,2.073842
99542,110,Braveheart (1995),Action|Drama|War,4.5,4.046391
99543,111,Taxi Driver (1976),Crime|Drama|Thriller,5.0,3.072304


In [82]:
from sklearn.metrics import mean_squared_error

from math import sqrt
def calculate_average_rmse(pm, Y, R):
    num_users = Y.shape[1]
    total_rmse = 0
    user_count = 0

    for user_id in range(num_users):
        # Movies that the user has rated
        rated_movies = R[:, user_id] > 0
        if np.any(rated_movies):  # Check if the user has rated any movies
            rated_movie_indices = np.where(rated_movies)[0]

            # Extract actual and predicted ratings for the rated movies
            actual_ratings = Y[rated_movies, user_id]
            predicted_ratings = pm[rated_movie_indices, user_id]

            # Calculate RMSE for the user and add to total
            rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings))
            total_rmse += rmse
            user_count += 1

    # Calculate average RMSE
    average_rmse = total_rmse / user_count if user_count > 0 else float('nan')
    return average_rmse

# Calculate average RMSE
average_rmse = calculate_average_rmse(pm, Y, R)
print(f"Average RMSE for all users: {average_rmse:.4f}")

Average RMSE for all users: 0.1078


In [83]:
from sklearn.metrics import mean_absolute_error

def calculate_average_mae(pm, Y, R):
    num_users = Y.shape[1]
    total_mae = 0
    user_count = 0

    for user_id in range(num_users):
        # Movies that the user has rated
        rated_movies = R[:, user_id] > 0
        if np.any(rated_movies):  # Check if the user has rated any movies
            rated_movie_indices = np.where(rated_movies)[0]

            # Extract actual and predicted ratings for the rated movies
            actual_ratings = Y[rated_movies, user_id]
            predicted_ratings = pm[rated_movie_indices, user_id]

            # Calculate MAE for the user and add to total
            mae = mean_absolute_error(actual_ratings, predicted_ratings)
            total_mae += mae
            user_count += 1

    # Calculate average MAE
    average_mae = total_mae / user_count if user_count > 0 else float('nan')
    return average_mae

# Calculate average MAE
average_mae = calculate_average_mae(pm, Y, R)
print(f"Average MAE for all users: {average_mae:.4f}")


Average MAE for all users: 0.0838
