In [43]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

In [44]:
# movies.csv: one row per movie (movieId, title, genres)
movies_df = pd.read_csv("../ml-latest-small/movies.csv")
# ratings.csv: one row per (userId, movieId) rating event
ratings_df = pd.read_csv("../ml-latest-small/ratings.csv")

# Matrix dimensions: one row per movie listed, one column per distinct user who rated
num_movies = movies_df.shape[0]
num_users = ratings_df['userId'].nunique(dropna=False)

In [45]:
# Ratings matrix: rows are movies, columns are users; starts out all zeros
Y = np.zeros(shape=(num_movies, num_users))
Y.shape

(9742, 610)

In [46]:
# Peek at the last five rating rows to see the highest userId values
ratings_df.tail(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


Movie IDs are not 1-to-1 with their index; some IDs are skipped. There are a total of 9742 unique movies based on movies_df, since each row is a different ID. There are a total of 610 unique users based on the unique 'userId' values in ratings_df, and no user IDs in between are missing: the tail end of ratings goes up to 610, so if any ID in between were missing, the number of unique IDs would be lower than 610.


Let's form a matrix Y where each row is a unique movie and each column holds the rating that one unique user gave to that movie. Since some movie IDs are missing, let's use each movie's row index from movies_df instead. We can later use that same index to retrieve the movie's name from movies_df.

In [47]:
# Keep each movie's row position as an explicit column so it survives the merge
movies_df['index'] = movies_df.index
# Inner join on movieId: one output row per rating, carrying the movie's title and row index
movie_ratings_df = movies_df.merge(ratings_df, how='inner', on='movieId')
movie_ratings_df

Unnamed: 0,movieId,title,genres,index,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,17,4.5,1305696483
...,...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,9737,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,9738,184,3.5,1537109545
100833,193585,Flint (2017),Drama,9739,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,9740,184,3.5,1537110021


Y contains ratings from 0.5 to 5 inclusive in 0.5 steps, or 0 if the movie has not been rated. We make a matrix R with the same shape as Y but with binary values to keep track of whether a user rated a movie: 1 means they did, 0 means they didn't.

In [48]:
# Indicator matrix with the same layout as Y (movies x users); starts out all zeros
R = np.zeros(shape=(num_movies, num_users))
R.shape

(9742, 610)

In [49]:
# Fill Y (ratings) and R (rated indicator) in one vectorised pass instead of
# looping over ~100k rows with iterrows(), which is very slow.
# Row position comes from the 'index' column; column position is userId - 1,
# which assumes user IDs start at 1 and are contiguous (as argued above).
movie_idx = movie_ratings_df['index'].to_numpy()
user_idx = movie_ratings_df['userId'].to_numpy() - 1
rating_values = movie_ratings_df['rating'].to_numpy()

# If a (movie, user) pair were repeated, the last occurrence wins, as in the loop.
Y[movie_idx, user_idx] = rating_values

# Only non-zero ratings count as "rated"
has_rating = rating_values != 0
R[movie_idx[has_rating], user_idx[has_rating]] = 1

In [50]:
def collab_fill_cost_func(X, W, b, Y, R, lambda_):
    """
    Regularized squared-error cost for collaborative filtering.

    Fully vectorized and written with TensorFlow ops so it can be differentiated
    inside a custom training loop.

    Args:
      X (num_movies, num_features): movie feature matrix
      W (num_users, num_features) : per-user weight matrix
      b (1, num_users)            : per-user bias, broadcast over movies
      Y (num_movies, num_users)   : ratings matrix
      R (num_movies, num_users)   : indicator, R(i, j) = 1 if user j rated movie i
      lambda_ (float)             : regularization strength (applied to X and W, not b)
    Returns:
      J : scalar cost
    """
    # Predicted rating of every movie by every user
    predicted = tf.linalg.matmul(X, tf.transpose(W)) + b
    # Multiplying by R zeroes the error wherever no rating exists
    rated_error = (predicted - Y) * R

    squared_error_term = 0.5 * tf.reduce_sum(rated_error ** 2)
    regularization_term = (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))
    return squared_error_term + regularization_term

In [51]:
# Spot-check one row: positional row 41 should also carry index == 41
movies_df.iloc[41, :]

movieId                       45
title          To Die For (1995)
genres     Comedy|Drama|Thriller
index                         41
Name: 41, dtype: object

In [52]:
# Ratings supplied by a brand-new user, used to make predictions for them.
# Keys are 0-based row positions in movies_df (different from movieId): take the
# auto-increment row number of the movie in movies.csv (left-most column) and subtract 1.
new_user_ratings = {
    3638: 5,  # Lord of the Rings: The Fellowship of the Ring (row 3639)
    4137: 5,  # Lord of the Rings: The Two Towers
    4800: 5,  # Lord of the Rings: The Return of the King
    3194: 5,  # Shrek (row 3195)
    511: 4,   # Snow White and the Seven Dwarfs
    2016: 4,  # Tarzan
    7140: 2,  # Paranormal Activity
    7212: 2,  # Avatar
    7277: 1,  # Room, The
    9591: 3,  # Death Note
    9590: 1,  # The Emoji Movie
    9509: 1,  # Cars 3
}

# Dense vector over all movies; 0 means "not rated by the new user"
my_ratings = np.zeros(num_movies)
for movie_idx, rating in new_user_ratings.items():
    my_ratings[movie_idx] = rating

# Row indices the new user has rated, in ascending order
my_rated = np.flatnonzero(my_ratings > 0).tolist()

print('\nNew user ratings:\n')
for i in my_rated:
    print(f'Rated {my_ratings[i]} for  {movies_df.loc[i,"title"]}')



New user ratings:

Rated 4.0 for  Snow White and the Seven Dwarfs (1937)
Rated 4.0 for  Tarzan (1999)
Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Lord of the Rings: The Fellowship of the Ring, The (2001)
Rated 5.0 for  Lord of the Rings: The Two Towers, The (2002)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 2.0 for  Paranormal Activity (2009)
Rated 2.0 for  Avatar (2009)
Rated 1.0 for  Room, The (2003)
Rated 1.0 for  Cars 3 (2017)
Rated 1.0 for  The Emoji Movie (2017)
Rated 3.0 for  Death Note (2017)


In [53]:
def normalize_ratings(Y, R):
    """Mean-normalize the ratings per movie.

    A user who has rated nothing ends up with weights/bias near 0, so their raw
    predictions would drift towards 0. Subtracting each movie's mean rating before
    training, and adding it back after predicting, makes predictions for such
    users fall near the movie's mean instead.

    Args:
      Y (ndarray (num_movies, num_users)): ratings; entries where R is 0 are ignored
      R (ndarray (num_movies, num_users)): non-zero where the user rated the movie
    Returns:
      Y_norm (ndarray (num_movies, num_users)): Y minus the movie mean at rated
          positions, 0 everywhere else
      Y_mean (ndarray (num_movies, 1)): mean of the rated entries of each movie,
          0 for movies nobody rated
    """
    # Work from the arguments' own shape rather than notebook-level globals
    Y = np.asarray(Y, dtype=float)
    rated = np.asarray(R).astype(bool)

    ratings_per_movie = rated.sum(axis=1, keepdims=True)
    rating_totals = np.where(rated, Y, 0.0).sum(axis=1, keepdims=True)

    # Divide only where at least one rating exists; unrated movies keep mean 0
    Y_mean = np.divide(
        rating_totals,
        ratings_per_movie,
        out=np.zeros_like(rating_totals),
        where=ratings_per_movie > 0,
    )

    # Shift only the rated cells; unrated cells stay exactly 0
    Y_norm = np.where(rated, Y - Y_mean, 0.0)

    return Y_norm, Y_mean


In [54]:
# Prepend the new user's ratings as column 0 of Y.
# NOTE: this cell rebinds Y and R from themselves, so running it more than once
# adds another copy of the column each time.
Y = np.column_stack((my_ratings, Y))

# Prepend the matching rated/not-rated indicator as column 0 of R
R = np.column_stack(((my_ratings != 0).astype(int), R))

# Mean-normalize the extended dataset
Y_norm, Y_mean = normalize_ratings(Y, R)
Y.shape

(9742, 611)

In [55]:
np.random.seed(42)

# Take the dimensions from Y itself (it now includes the new user's column);
# start with 100 latent features and adjust later if needed
num_movies, num_users = Y.shape
num_features = 100

# Trainable parameters wrapped in tf.Variable so they can be tracked.
# Created in the original W, X, b order so the seeded initial values do not get reshuffled.
tf.random.set_seed(1234)  # for consistent results
W = tf.Variable(tf.random.normal(shape=(num_users, num_features), dtype=tf.float64), name='W')
X = tf.Variable(tf.random.normal(shape=(num_movies, num_features), dtype=tf.float64), name='X')
b = tf.Variable(tf.random.normal(shape=(1, num_users), dtype=tf.float64), name='b')

# Optimizer used by the custom training loop
optimizer = keras.optimizers.Adam(learning_rate=0.1)

# Summarize the shapes and sizes involved
print("Y", Y.shape, "R", R.shape)
for label, value in (
    ("X", X.shape),
    ("W", W.shape),
    ("b", b.shape),
    ("num_features", num_features),
    ("num_movies", num_movies),
    ("num_users", num_users),
):
    print(label, value)

Y (9742, 611) R (9742, 611)
X (9742, 100)
W (611, 100)
b (1, 611)
num_features 100
num_movies 9742
num_users 611


In [56]:
# Row indices of movies whose entire row in Y is 0, i.e. movies with no ratings
np.nonzero((Y == 0).all(axis=1))

(array([ 816, 2211, 2499, 2587, 3118, 4037, 4506, 4598, 4704, 5020, 5293,
        5421, 5452, 5749, 5824, 5837, 5957, 7565]),)

In [57]:
# Sanity check in the other direction: index 0 is NOT in the unrated list above,
# so the merged frame should contain rating rows for it. Swap in one of the unrated
# indices (e.g. 816) to confirm that it yields no rows.
movie_ratings_df.loc[movie_ratings_df['index'].eq(0)]

Unnamed: 0,movieId,title,genres,index,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,17,4.5,1305696483
...,...,...,...,...,...,...,...
210,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,606,2.5,1349082950
211,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,607,4.0,964744033
212,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,608,2.5,1117408267
213,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,609,3.0,847221025


In [58]:
iterations = 200
lambda_ = 1
# The loop variable is named `step` rather than `iter` so the built-in iter()
# is not shadowed for the rest of the notebook session.
for step in range(iterations):
    # Record the operations used to compute the cost with TensorFlow's GradientTape
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = collab_fill_cost_func(X, W, b, Y_norm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X, W, b])

    # Run one optimizer step, updating the variables to reduce the loss
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Log periodically.
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 5558289.8
Training loss at iteration 20: 279471.5
Training loss at iteration 40: 107963.5
Training loss at iteration 60: 53016.0
Training loss at iteration 80: 30348.1
Training loss at iteration 100: 19407.0
Training loss at iteration 120: 13542.4
Training loss at iteration 140: 10179.7
Training loss at iteration 160: 8157.7
Training loss at iteration 180: 6895.7


Now we see what it predicts

In [121]:
# Predicted (mean-normalized) rating of every movie by every user from the trained parameters
p = X.numpy() @ W.numpy().T + b.numpy()

# Add each movie's mean back to undo the normalization
pm = p + Y_mean

# Column 0 holds the new user that was prepended to Y
my_predictions = pm[:, 0]

# Movie indices ordered from highest to lowest predicted rating
ix = tf.argsort(my_predictions, direction='DESCENDING')

# Walk the 17 best-scoring movies and show those the new user has not rated yet
already_rated = set(my_rated)
for j in ix.numpy()[:17]:
    if int(j) in already_rated:
        continue
    print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movies_df.loc[int(j), "title"]}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in map(int, np.flatnonzero(my_ratings > 0)):
    print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movies_df.loc[i, "title"]}')


Predicting rating 4.91 for movie Shawshank Redemption, The (1994)
Predicting rating 4.82 for movie Pulp Fiction (1994)
Predicting rating 4.80 for movie Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
Predicting rating 4.75 for movie Silence of the Lambs, The (1991)
Predicting rating 4.75 for movie Alien Contamination (1980)
Predicting rating 4.74 for movie Galaxy of Terror (Quest) (1981)
Predicting rating 4.74 for movie Raise Your Voice (2004)
Predicting rating 4.73 for movie Into the Forest of Fireflies' Light (2011)
Predicting rating 4.73 for movie 61* (2001)
Predicting rating 4.73 for movie Laggies (2014)
Predicting rating 4.73 for movie Black Mirror
Predicting rating 4.73 for movie Delirium (2014)
Predicting rating 4.73 for movie King of Hearts (1966)


Original vs Predicted ratings:

Original 4.0, Predicted 3.93 for Snow White and the Seven Dwarfs (1937)
Original 4.0, Predicted 3.83 for Tarzan (1999)
Original 5.0, Predicted 4.94 for Shrek (2001)
Original 5.0, Predic