In [65]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from recsys_utils import *


In [66]:
#load Data
x,w,b,num_movies,num_features,num_users= load_precalc_params_small()
"""Y is for ratings given to the movie and R is for the matrix of that ratings , for example-
tsmean = np.mean(Y[0,R[0,:].astype(bool)])
1️ R[0,:]
[1,1,0,1,0] for R

All users who rated Movie 0
"""
Y,R = load_ratings_small()
print ("Y",Y.shape,"R",R.shape)
print("x",x.shape)
print("w",w.shape)
print("b",b.shape)
print("num_features",num_features)
print("num_movies",num_movies)
print("num_users",num_users)


Y (4778, 443) R (4778, 443)
x (4778, 10)
w (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [67]:
#from the matrix , we can compute the statistics of "average rating "
tsmean = np.mean(Y[0,R[0,:].astype(bool)])
print(f"Average rating for movie : {tsmean: 0.3f}/5")

Average rating for movie :  0.055/5


In [68]:
from public_tests import *

def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Collaborative filtering cost function.

    Parameters:
    X : ndarray (num_movies, num_features)
        Matrix of movie features.
    W : ndarray (num_users, num_features)
        Matrix of user preference parameters.
    b : ndarray (1, num_users)
        Bias term for each user.
    Y : ndarray (num_movies, num_users)
        Matrix of movie ratings by users.
    R : ndarray (num_movies, num_users)
        Indicator matrix where R[i,j] = 1 if movie i was rated by user j.
    lambda_ : float
        Regularization parameter.

    Returns:
    J : float
        Computed cost including regularization.
    """

    nm, nu = Y.shape
    J = 0

    for j in range(nu):
        w = W[j, :]
        b_j = b[0, j]

        for i in range(nm):
            x = X[i, :]
            y = Y[i, j]
            r = R[i, j]

            J += r * np.square(np.dot(w, x) + b_j - y)

    # Error cost
    J = J / 2

    # Regularization cost
    J += (lambda_ / 2) * (np.sum(np.square(W)) + np.sum(np.square(X)))

    return J


test_cofi_cost_func(cofi_cost_func)


[92mAll tests passed!


In [69]:
num_users_r = 4
num_movies_r = 5
num_features_r = 3

X_r = x[:num_movies_r, :num_features_r]
w_r = w[:num_users_r, :num_features_r]
b_r = tf.reshape(b[0, :num_users_r], (1, -1))

Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]  

In [70]:
#Evaluate cost fxn with regulariztion 
J=cofi_cost_func(X_r,w_r,b_r,Y_r,R_r,1.5);
print(f"cost(with regularization):{5:0.2f}")

cost(with regularization):5.00


In [71]:
#Vectorized Implementation
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Returns the cost for collaborative filtering.
    Vectorized implementation for speed.
    """

    pred = tf.linalg.matmul(X, tf.transpose(W)) + b
    err = (pred - Y) * R

    J = 0.5 * tf.reduce_sum(err ** 2)
    J += (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))

    return J


In [72]:
# Evaluate Cost Function
J = cofi_cost_func_v(X_r, w_r, b_r, Y_r, R_r, 0)
print(f"cost: {J:0.2f}")

# Evaluate Cost Function with Regularization
J = cofi_cost_func_v(X_r, w_r, b_r, Y_r, R_r, 105)
print(f"cost (with regularization): {J:0.2f}")

# Training movie recommendations
movielist, movielist_df = load_Movie_List_pd()

my_ratings = np.zeros(num_movies)
my_ratings[2700] = 5

my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

print("\nNew user ratings:\n")

for i in my_rated:
    print(f"Rated {my_ratings[i]} for {movielist_df.iloc[i]['title']}")


cost: 5.17
cost (with regularization): 1014.79

New user ratings:

Rated 5.0 for Toy Story 3 (2010)


In [73]:
movieList, movieList_df = load_Movie_List_pd()

my_ratings = np.zeros(num_movies)          #  Initialize my ratings

# Check the file small_movie_list.csv for id of each movie in our dataset
# For example, Toy Story 3 (2010) has ID 2700, so to rate it "5", you can set
my_ratings[2700] = 5 

#Or suppose you did not enjoy Persuasion (2007), you can set
my_ratings[2609] = 2;

# We have selected a few movies we liked / did not like and the ratings we
# gave are as follows:
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The
my_ratings[246]  = 5   # Shrek (2001)
my_ratings[2716] = 3   # Inception
my_ratings[1150] = 5   # Incredibles, The (2004)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0 :
        print(f'Rated {my_ratings[i]} for  {movieList_df.loc[i,"title"]}');


New user ratings:

Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for  Incredibles, The (2004)
Rated 2.0 for  Persuasion (2007)
Rated 5.0 for  Toy Story 3 (2010)
Rated 3.0 for  Inception (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)


In [74]:
# RELOAD RATINGS AND ADD NEW RATINGS
Y, R = load_ratings_small()

# Add new user ratings as first column
Y = np.c_[my_ratings, Y]

# Add rating mask column (1 if rated, 0 if not)
R = np.c_[(my_ratings != 0).astype(int), R]
#normalize the dataset
Ynorm, Ymean = normalizeRatings(Y, R)


In [75]:
#  Useful Values
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [76]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 109039116.8
Training loss at iteration 20: 1882274.1
Training loss at iteration 40: 310110.3
Training loss at iteration 60: 73554.9
Training loss at iteration 80: 45101.1
Training loss at iteration 100: 38639.4
Training loss at iteration 120: 35286.0
Training loss at iteration 140: 32566.4
Training loss at iteration 160: 30125.5
Training loss at iteration 180: 27903.1


In [77]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

#restore the mean
pm = p + Ymean

my_predictions = pm[:,0]

# sort predictions
ix = tf.argsort(my_predictions, direction='DESCENDING')

for i in range(17):
    j = ix[i]
    if j not in my_rated:
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movieList[j]}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}')

Predicting rating 6.33 for movie Rewrite, The (2014)
Predicting rating 5.96 for movie Coffee Town (2013)
Predicting rating 5.85 for movie Milk (2008)
Predicting rating 5.85 for movie Break-Up, The (2006)
Predicting rating 5.84 for movie Never Let Me Go (2010)
Predicting rating 5.83 for movie Passengers (2016)
Predicting rating 5.74 for movie Risen (2016)
Predicting rating 5.68 for movie American Pie Presents: Band Camp (American Pie 4: Band Camp) (2005)
Predicting rating 5.63 for movie Straight Outta Compton (2015)
Predicting rating 5.59 for movie Eye, The (Gin gwai) (Jian gui) (2002)
Predicting rating 5.59 for movie Perfect Storm, The (2000)
Predicting rating 5.51 for movie Phir Hera Pheri (2006)
Predicting rating 5.47 for movie Chronicles of Narnia: Prince Caspian, The (2008)
Predicting rating 5.44 for movie Two Family House (2000)
Predicting rating 5.44 for movie Eragon (2006)
Predicting rating 5.40 for movie Rollo and the Woods Sprite (Rölli ja metsänhenki) (2001)
Predicting rating

In [78]:
#sort predictions
ix = tf.argsort(my_predictions, direction='DESCENDING')

for i in range(17):
    j = ix[i].numpy()   # get movie index from sorted predictions

    if j not in my_rated:
        print(f'predicting rating {my_predictions[j]:0.2f} '
              f'for movie {movieList[j]}')

        print(f'\n\noriginal {my_ratings[j]}, Predicted {my_predictions[j]:0.2f} '
              f'for {movieList[j]}')

# dataframe filtering and sorting
filter_ = (movieList_df["number of ratings"] > 20)

movieList_df["pred"] = my_predictions

movieList_df = movieList_df.reindex(
    columns=["pred", "mean rating", "number of ratings", "title"]
)

movieList_df.loc[ix.numpy()[:300]].loc[filter_]\
    .sort_values("mean rating", ascending=False)


predicting rating 6.33 for movie Rewrite, The (2014)


original 0.0, Predicted 6.33 for Rewrite, The (2014)
predicting rating 5.96 for movie Coffee Town (2013)


original 0.0, Predicted 5.96 for Coffee Town (2013)
predicting rating 5.85 for movie Milk (2008)


original 0.0, Predicted 5.85 for Milk (2008)
predicting rating 5.85 for movie Break-Up, The (2006)


original 0.0, Predicted 5.85 for Break-Up, The (2006)
predicting rating 5.84 for movie Never Let Me Go (2010)


original 0.0, Predicted 5.84 for Never Let Me Go (2010)
predicting rating 5.83 for movie Passengers (2016)


original 0.0, Predicted 5.83 for Passengers (2016)
predicting rating 5.74 for movie Risen (2016)


original 0.0, Predicted 5.74 for Risen (2016)
predicting rating 5.68 for movie American Pie Presents: Band Camp (American Pie 4: Band Camp) (2005)


original 0.0, Predicted 5.68 for American Pie Presents: Band Camp (American Pie 4: Band Camp) (2005)
predicting rating 5.63 for movie Straight Outta Compton (2015)


ori

Unnamed: 0,pred,mean rating,number of ratings,title
2079,4.757652,4.158537,41,In Bruges (2008)
929,4.875233,4.118919,185,"Lord of the Rings: The Return of the King, The..."
2700,4.830373,4.109091,55,Toy Story 3 (2010)
1598,4.582337,4.027778,36,Thank You for Smoking (2006)
1122,4.691816,4.006494,77,Shaun of the Dead (2004)
2420,4.526494,4.004762,105,Up (2009)
3283,4.834319,3.982143,28,Argo (2012)
642,4.709891,3.945652,46,Adaptation (2002)
1431,4.534757,3.94,50,Serenity (2005)
3556,5.263443,3.916667,54,"Wolf of Wall Street, The (2013)"


In [79]:
filter=(movieList_df["number of ratings"] > 20)
movieList_df["pred"] = my_predictions
movieList_df = movieList_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"])
movieList_df.loc[ix[:300]].loc[filter].sort_values("mean rating", ascending=False)

Unnamed: 0,pred,mean rating,number of ratings,title
2079,4.757652,4.158537,41,In Bruges (2008)
929,4.875233,4.118919,185,"Lord of the Rings: The Return of the King, The..."
2700,4.830373,4.109091,55,Toy Story 3 (2010)
1598,4.582337,4.027778,36,Thank You for Smoking (2006)
1122,4.691816,4.006494,77,Shaun of the Dead (2004)
2420,4.526494,4.004762,105,Up (2009)
3283,4.834319,3.982143,28,Argo (2012)
642,4.709891,3.945652,46,Adaptation (2002)
1431,4.534757,3.94,50,Serenity (2005)
3556,5.263443,3.916667,54,"Wolf of Wall Street, The (2013)"
