In [None]:
# Download the ml-latest-small MovieLens archive into the working directory (shell escape).
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   440k      0  0:00:02  0:00:02 --:--:--  440k


In [None]:
import zipfile
# Unpack the downloaded archive into ./data; the context manager closes the zip file afterwards.
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Concatenate, Dense
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the movie catalogue and the user ratings extracted above.
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')


In [None]:
# Distinct ids, in the order they first appear in ratings_df.
user_ids = ratings_df['userId'].unique()
movie_ids = ratings_df['movieId'].unique()

# Map every raw id to a dense 0-based index that follows that same order.
user_id_to_index = dict(zip(user_ids, range(len(user_ids))))
movie_id_to_index = dict(zip(movie_ids, range(len(movie_ids))))

# Attach the dense indices to each rating row.
ratings_df['user_index'] = ratings_df['userId'].map(user_id_to_index)
ratings_df['movie_index'] = ratings_df['movieId'].map(movie_id_to_index)

In [None]:
# Preview the first ten ratings together with the dense user/movie indices.
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,user_index,movie_index
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,1
2,1,6,4.0,964982224,0,2
3,1,47,5.0,964983815,0,3
4,1,50,5.0,964982931,0,4
5,1,70,3.0,964982400,0,5
6,1,101,5.0,964980868,0,6
7,1,110,4.0,964982176,0,7
8,1,151,5.0,964984041,0,8
9,1,157,5.0,964984100,0,9


In [None]:
# Keep a handle on the dense movie-index column.
# NOTE(review): `index` is not referenced again in the visible cells — confirm it is still needed.
index = ratings_df['movie_index']

In [None]:
# The timestamp is not used by the model. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')


In [None]:
def give_mean_of_moiveID(n):
    """Return a sentence reporting the mean rating of movie id ``n``.

    Reads the module-level ``ratings_df``. The mean is formatted with three
    decimals; an id with no ratings is reported as ``nan``.
    """
    is_this_movie = ratings_df['movieId'] == n
    x = ratings_df.loc[is_this_movie, 'rating'].mean()
    return f"The average rating of the movie ID {n} is = {x:.3f} / 5"

def give_mean_of_userID(n):
    """Return the mean of all ratings given by user id ``n`` (NaN if the user has none).

    Reads the module-level ``ratings_df``.
    """
    is_this_user = ratings_df['userId'] == n
    return ratings_df.loc[is_this_user, 'rating'].mean()

In [None]:
# Example: report the mean rating of a single movie id.
give_mean_of_moiveID(167370)

'The average rating of the movie ID 167370 is = 2.500 / 5'

In [None]:
def preprocess_data(ratings_df, num_features=10):
    """
    Builds the dense rating matrices and initial parameters from ratings_df.

    Rows of Y/R (movies) and columns of Y/R (users) are numbered in the order
    in which each movieId / userId first appears in ratings_df. If the same
    (userId, movieId) pair occurs more than once, the last row wins.

    Parameters:
        ratings_df (pd.DataFrame): DataFrame containing userId, movieId, and rating.
        num_features (int): Number of latent features for X and W matrices.

    Returns:
        tuple: (X, W, b, num_movies, num_features, num_users, Y, R) where
            X (ndarray): Item features, random in [0, 1) (num_movies x num_features).
            W (ndarray): User parameters, random in [0, 1) (num_users x num_features).
            b (ndarray): User biases, all zeros (1 x num_users).
            num_movies (int): Number of distinct movieIds in ratings_df.
            num_features (int): The num_features argument, passed through.
            num_users (int): Number of distinct userIds in ratings_df.
            Y (ndarray): Ratings, 0 where unrated (num_movies x num_users).
            R (ndarray): 1 where a movie was rated by a user, else 0 (num_movies x num_users).
    """
    # Get unique users and movies (order of first appearance)
    unique_users = ratings_df['userId'].unique()
    unique_movies = ratings_df['movieId'].unique()

    num_users = len(unique_users)
    num_movies = len(unique_movies)

    # Create mappings for userId and movieId
    user_to_index = {user: idx for idx, user in enumerate(unique_users)}
    movie_to_index = {movie: idx for idx, movie in enumerate(unique_movies)}

    # Initialize Y and R matrices
    Y = np.zeros((num_movies, num_users))
    R = np.zeros((num_movies, num_users))

    # Populate Y and R. Iterating the three columns in lockstep avoids
    # DataFrame.iterrows(), which builds a Series per row and upcasts the
    # integer ids to float; row order (and thus last-write-wins) is unchanged.
    for movie_id, user_id, rating in zip(
        ratings_df['movieId'], ratings_df['userId'], ratings_df['rating']
    ):
        movie_idx = movie_to_index[movie_id]
        user_idx = user_to_index[user_id]
        Y[movie_idx, user_idx] = rating
        R[movie_idx, user_idx] = 1  # Indicate that this movie was rated by this user

    # Initialize X (item features) and W (user parameters) with random values
    X = np.random.rand(num_movies, num_features)
    W = np.random.rand(num_users, num_features)

    # Initialize b (user biases) as zeros
    b = np.zeros((1, num_users))

    return X, W, b, num_movies, num_features, num_users, Y, R


In [None]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Loop-based regularized squared-error cost over the rated entries.

    For every (movie i, user j) pair the prediction is dot(W[j], X[i]) + b[0, j];
    its squared error against Y[i, j] is counted only where R[i, j] is non-zero.
    A penalty lambda_ * (sum(W**2) + sum(X**2)) is added and the total is halved.

    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users))           : vector of user biases
      Y (ndarray (num_movies,num_users))   : matrix of user ratings of movies
      R (ndarray (num_movies,num_users))   : matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    num_movies, num_users = Y.shape
    squared_error = 0

    for user in range(num_users):
        user_params = W[user, :]
        user_bias = b[0, user]
        for movie in range(num_movies):
            residual = np.dot(user_params, X[movie, :]) + user_bias - Y[movie, user]
            # R acts as a mask: unrated pairs contribute nothing.
            squared_error += R[movie, user] * np.square(residual)

    penalty = (lambda_) * (np.sum(np.square(W)) + np.sum(np.square(X)))
    return (squared_error + penalty) / 2

In [None]:
# Build the rating matrix Y, the rated-mask R and initial parameters from the ratings table.
X, W, b, num_movies, num_features, num_users, Y, R = preprocess_data(ratings_df)

In [None]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Vectorized regularized squared-error cost over the rated entries.

    Written with TensorFlow operations so it can be differentiated inside a
    tf.GradientTape training loop.

    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users))           : vector of user biases
      Y (ndarray (num_movies,num_users))   : matrix of user ratings of movies
      R (ndarray (num_movies,num_users))   : matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    # Prediction error for every (movie, user) pair, zeroed where R marks "unrated".
    masked_error = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    data_term = 0.5 * tf.reduce_sum(masked_error**2)
    penalty_term = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return data_term + penalty_term

In [None]:
# Ratings vector for a new user: one slot per movie, 0 meaning "not rated".
myratings =  np.zeros(num_movies)

In [None]:
def get_movie_row(movie_df, movie_id):
    """Return the index label of the first row whose "movieId" equals ``movie_id``.

    Returns the string "Movie ID not found" when no row matches.
    """
    matching_labels = movie_df.index[movie_df["movieId"] == movie_id].tolist()
    if not matching_labels:
        return "Movie ID not found"
    return matching_labels[0]

movie_id = 123  # Replace with your desired movieId
# NOTE(review): ratings_df is passed here, so the printed value is the label of the
# first *ratings* row for this movieId, not a row of movies_df — confirm this is intended.
print(get_movie_row(ratings_df, movie_id))


35973


In [None]:
# Look up the movies_df row label of one movie id.
movie_id = 109487
# .item() raises ValueError unless exactly one row matches.
row_index = movies_df.index[movies_df['movieId'] == movie_id].item()

print(row_index)


8376


In [None]:
# Title lookup table: the movies file without its genres column.
movieList = pd.read_csv('data/ml-latest-small/movies.csv').drop(columns=["genres"])


In [None]:
# (movieId, rating) pairs for the new user. myratings is indexed by the
# movie's row label in movies_df.
id_to_rating_list = [
    (2, 4), (2115, 4), (2150, 4), (5580, 4), (167370, 4),
    (5054, 5), (5171, 5), (5218, 5),
    (5816, 1), (4896, 1), (8368, 1),
    (109487, 5), (79132, 5), (189333, 5), (177765, 5),
    (168712, 1),
    (164909, 2), (163386, 2), (163112, 2), (163072, 2), (163134, 2),
    (168492, 1),
]
for movie_id, rating in id_to_rating_list:
    matches = movies_df.index[movies_df['movieId'] == movie_id]
    # The original used `.item()`, which raises ValueError for an unknown id, so its
    # `is not None` guard could never skip anything. Check explicitly instead, and
    # also skip rows that fall outside myratings rather than raising IndexError.
    if len(matches) != 1 or not 0 <= matches[0] < len(myratings):
        print(f"Skipping movie ID {movie_id}: no usable row in movies_df")
        continue
    movie_idx = matches[0]
    myratings[movie_idx] = rating



In [None]:
def normalizeRatings(Y, R):
    """
    Mean-center every movie (row) of Y using only its real ratings (R == 1).

    Returns (Ynorm, Ymean): Ymean is a column vector holding each movie's mean
    over its rated entries (about 0 for a movie nobody rated, thanks to the
    small constant in the denominator); Ynorm is Y with that mean subtracted
    at rated positions only, so unrated entries are left unchanged.
    """
    rated_total = np.sum(Y*R, axis=1)
    rated_count = np.sum(R, axis=1)
    # 1e-12 guards against division by zero for movies with no ratings.
    Ymean = (rated_total/(rated_count+1e-12)).reshape(-1,1)
    Ynorm = Y - np.multiply(Ymean, R)
    return (Ynorm, Ymean)


In [None]:
# Y/R rows produced by preprocess_data follow the order in which movieIds first
# appear in ratings_df, whereas myratings (filled through movies_df.index) and the
# title lookups on movieList use movies.csv row positions. Re-align Y/R to
# movies.csv row order before adding the new user's column, so that row i refers
# to the same movie in Y, R, myratings and movieList.
rated_movie_ids = ratings_df['movieId'].unique()  # same order preprocess_data used
if Y.shape != (len(rated_movie_ids), ratings_df['userId'].nunique()):
    # Also catches an accidental re-run of this cell, which used to silently
    # prepend a second copy of the new user's column.
    raise ValueError(
        "Y is not the matrix returned by preprocess_data(ratings_df); "
        "re-run preprocess_data before running this cell again."
    )

# Position of every rated movie inside movies_df (-1 when the id is absent).
catalog_rows = pd.Index(movies_df['movieId']).get_indexer(rated_movie_ids)
if (catalog_rows < 0).any():
    raise ValueError("ratings_df contains movieIds that are missing from movies_df")

num_catalog_movies = len(movies_df)
Y_catalog = np.zeros((num_catalog_movies, Y.shape[1]))
R_catalog = np.zeros((num_catalog_movies, R.shape[1]))
Y_catalog[catalog_rows] = Y
R_catalog[catalog_rows] = R

# myratings has one slot per rated movie, which can be shorter than the catalogue;
# the remaining catalogue rows stay 0 ("not rated").
my_column = np.zeros(num_catalog_movies)
my_column[:len(myratings)] = myratings

# The new user becomes column 0.
Y    = np.c_[my_column, Y_catalog]
R    = np.c_[(my_column != 0).astype(int), R_catalog]

# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)


In [None]:
# Re-derive the problem size from Y, which now includes the new user's column.
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X), use tf.Variable to track these variables
# The three draws below run in a fixed order after seeding, so keep W, X, b in this order.
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)


In [None]:
iterations = 200
lambda_ = 1
# The loop variable is named `step` rather than `iter` so the builtin iter()
# is not shadowed for the rest of the kernel session.
for step in range(iterations):
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if step % 50 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")


Training loss at iteration 0: 5557339.4
Training loss at iteration 50: 74135.1
Training loss at iteration 100: 19411.2
Training loss at iteration 150: 9007.1


In [None]:
# Positions in myratings that carry a rating (> 0).
my_rated = [position for position, value in enumerate(myratings) if value > 0]

In [None]:
# Peek at the first rows of the title lookup table.
movieList.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# The cell that loads movieList already drops "genres", so on a top-to-bottom run
# the column is gone by the time this cell executes. errors="ignore" keeps the
# cell from raising KeyError in that case.
movieList = movieList.drop(columns=["genres"], errors="ignore")

In [None]:
# Predicted (mean-normalized) rating for every movie/user pair
p = X.numpy() @ W.numpy().T + b.numpy()

# Add each movie's mean back to undo the normalization
pm = p + Ymean

# Column 0 holds the first user
my_predictions = pm[:, 0]

# Movie rows ordered from highest to lowest predicted rating
ix = tf.argsort(my_predictions, direction='DESCENDING').numpy()



In [None]:
print("\nRecommended Movies:\n")
# Walk the 100 highest-ranked rows and print those that pass the filter below.
for i in range(100):
    j = int(ix[i])
    # NOTE(review): the `j > 3000` cut-off is not explained anywhere in the visible cells — confirm intent.
    if j > 3000 and j in movieList.index:
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie "{movieList["title"].iloc[j]}"\n')


Recommended Movies:

Predicting rating 4.96 for movie "Coco (2017)"

Predicting rating 4.92 for movie "Mission: Impossible - Fallout (2018)"

Predicting rating 4.92 for movie "Big Momma's House 2 (2006)"

Predicting rating 4.86 for movie "Time Machine, The (2002)"

Predicting rating 4.85 for movie "Gigli (2003)"

Predicting rating 4.82 for movie "Other Guys, The (2010)"

Predicting rating 4.82 for movie "Wild Child (2008)"

Predicting rating 4.82 for movie "Miss Meadows (2014)"

Predicting rating 4.81 for movie "Spy Game (2001)"

Predicting rating 4.81 for movie "Funny Girl (1968)"

Predicting rating 4.81 for movie "In the Bedroom (2001)"

Predicting rating 4.81 for movie "Making a Murderer (2015)"

Predicting rating 4.81 for movie "The Putin Interviews (2017)"

Predicting rating 4.81 for movie "Game Over, Man! (2018)"

Predicting rating 4.81 for movie "Too Funny to Fail: The Life and Death of The Dana Carvey Show (2017)"

Predicting rating 4.81 for movie "Wonder (2017)"

Predicting r

In [None]:
print("\n\nOriginal vs Predicted Ratings:\n")
for i in range(len(myratings)):
    # myratings starts as zeros and never holds NaN, so the former
    # `not np.isnan(...)` test was always true and printed every movie.
    # Only entries that were actually rated (> 0) are compared here.
    if myratings[i] > 0:
        if i in movieList.index:
            print(f'Original {myratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList["title"].iloc[i]}\n')
        else:
            print(f'Original {myratings[i]}, Predicted {my_predictions[i]:0.2f} for unknown movie\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Original 0.0, Predicted 2.16 for Book of Eli, The (2010)

Original 0.0, Predicted 1.25 for Girl Who Kicked the Hornet's Nest, The (Luftslottet som sprängdes) (2009)

Original 0.0, Predicted 0.25 for Prophet, A (Un Prophète) (2009)

Original 0.0, Predicted 3.25 for Staten Island (2009)

Original 0.0, Predicted 1.71 for Maiden Heist, The (2009)

Original 0.0, Predicted 1.86 for Blood: The Last Vampire (2009)

Original 0.0, Predicted 1.96 for MacGyver: Lost Treasure of Atlantis (1994)

Original 0.0, Predicted 2.84 for Pekka ja Pätkä Suezilla (1958)

Original 0.0, Predicted 2.62 for Bart Got a Room (2008)

Original 0.0, Predicted 1.96 for Project A 2 ('A' gai wak juk jap) (1987)

Original 0.0, Predicted 2.75 for Robin-B-Hood (Bo bui gai wak) (2006)

Original 0.0, Predicted 2.75 for Concert, Le (2009)

Original 0.0, Predicted 0.75 for Ninja (2009)

Original 0.0, Predicted 2.62 for Asterix at the Olympic Games (Astérix aux jeux

In [None]:
# Spot-check a handful of movies.
# NOTE(review): each entry is used as `i-2`, so these look like 1-based line numbers of
# movies.csv including its header row — confirm where the numbers come from.
l = [7371,7374,8378,6064,6524,3,8220]
for i in l:
    print(f'Original {myratings[i-2]:0.0f} , Predicted {round(my_predictions[i-2])} for {movieList["title"].iloc[i-2]}\n')

Original 0 , Predicted 3 for Predators (2010)

Original 5 , Predicted 5 for Inception (2010)

Original 5 , Predicted 5 for Interstellar (2014)

Original 0 , Predicted 4 for Harry Potter and the Goblet of Fire (2005)

Original 0 , Predicted 2 for Harry Potter and the Order of the Phoenix (2007)

Original 4 , Predicted 4 for Jumanji (1995)

Original 0 , Predicted 4 for Conjuring, The (2013)



In [None]:
# Compare original and predicted ratings for every movie the new user rated.
# Iterating the non-zero positions directly replaces the hard-coded 9724 and the
# `i-2` offset, which made the loop start at indices -2 and -1 (wrapping around to
# the end of the arrays).
for i in np.flatnonzero(myratings):
    print(f'Original {myratings[i]:0.0f} , Predicted {round(my_predictions[i])} for {movieList["title"].iloc[i]}\n')

Original 4 , Predicted 4 for Jumanji (1995)

Original 4 , Predicted 4 for Indiana Jones and the Temple of Doom (1984)

Original 4 , Predicted 4 for Gods Must Be Crazy, The (1980)

Original 1 , Predicted 1 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)

Original 5 , Predicted 5 for Brainstorm (1983)

Original 5 , Predicted 5 for Time Machine, The (2002)

Original 5 , Predicted 5 for Ice Age (2002)

Original 4 , Predicted 4 for Aspen Extreme (1993)

Original 1 , Predicted 1 for Harry Potter and the Chamber of Secrets (2002)

Original 1 , Predicted 1 for Harry Potter and the Prisoner of Azkaban (2004)

Original 5 , Predicted 5 for Inception (2010)

Original 5 , Predicted 5 for Interstellar (2014)

Original 2 , Predicted 2 for Winnie Pooh (1969)

Original 2 , Predicted 2 for Winnie the Pooh Goes Visiting (1971)

Original 2 , Predicted 2 for Your Name. (2016)

Original 2 , Predicted 2 for Winnie the Pooh and the Day of Concern (1972)

Orig