In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

In [16]:
# Pre-computed parameters: per-movie features X, per-user weights W and
# per-user biases b (b is reshaped to a single row so it broadcasts over movies).
X = np.loadtxt('./data/small_movies_X.csv', delimiter=',')
W = np.loadtxt('./data/small_movies_W.csv', delimiter=',')
b = np.loadtxt('./data/small_movies_b.csv', delimiter=',').reshape(1, -1)

# Y and R share one shape (rows = movies, columns = users); R is used below
# as a 0/1 mask over the entries of Y.
Y = np.loadtxt('./data/small_movies_Y.csv', delimiter=',')
R = np.loadtxt('./data/small_movies_R.csv', delimiter=',')

# Problem dimensions are taken from the parameter matrices.
num_movies, num_features = X.shape
num_users, _ = W.shape

print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies", num_movies)
print("num_users", num_users)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [17]:
# Average the entries of row 0 of Y, keeping only the columns where R is non-zero.
rated_mask = R[0, :].astype(bool)
tsmean = Y[0, rated_mask].mean()
print(f"Average rating for movie 1 : {tsmean:0.3f} / 5" )

Average rating for movie 1 : 3.400 / 5


In [20]:
def cofi_cost_func(X,W,b,Y,R,lambda_):
    """Collaborative-filtering cost, computed with explicit loops.

    Args:
      X: (num_movies, num_features) movie feature matrix.
      W: (num_users, num_features) user weight matrix.
      b: (1, num_users) user bias row vector.
      Y: (num_movies, num_users) ratings matrix.
      R: (num_movies, num_users) mask; each squared error is multiplied by
         the matching entry, so entries equal to 0 contribute nothing.
      lambda_: regularization strength applied to the squared entries of
         W and X.

    Returns:
      Half of (masked sum of squared errors + lambda_ * (sum(W^2) + sum(X^2))).
    """
    num_movies, num_users = Y.shape
    total = 0
    for user in range(num_users):
        user_weights = W[user, :]
        user_bias = b[0, user]
        for movie in range(num_movies):
            # Prediction error for this (movie, user) pair.
            error = np.dot(user_weights, X[movie, :]) + user_bias - Y[movie, user]
            total += R[movie, user] * np.square(error)
    # Regularization is added before the common 1/2 factor is applied.
    total += lambda_ * (np.sum(np.square(W)) + np.sum(np.square(X)))
    return total / 2

In [21]:
# Shrink the problem so the loop-based cost runs quickly.
num_users_r, num_movies_r, num_features_r = 4, 5, 3

# Take the leading rows/columns of every array so the shapes stay consistent.
X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r, :num_features_r]
b_r = b[0, :num_users_r].reshape(1, -1)
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Cost on the reduced problem without regularization.
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 0)
print(f"Cost: {J:0.2f}")

Cost: 13.67


In [22]:
# Same reduced problem, now with a regularization strength of 1.5.
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, lambda_=1.5)
print(f"Cost (with regularization): {J:0.2f}")

Cost (with regularization): 28.09


In [23]:
def cofi_cost_func_vectorized(X,W,b,Y,R,lambda_):
    """Collaborative-filtering cost computed with TensorFlow matrix ops.

    Args:
      X: (num_movies, num_features) movie feature matrix.
      W: (num_users, num_features) user weight matrix.
      b: (1, num_users) user bias row vector, broadcast over movies.
      Y: (num_movies, num_users) ratings matrix.
      R: (num_movies, num_users) mask multiplied into the error matrix.
      lambda_: regularization strength.

    Returns:
      0.5 * sum of squared masked errors
      + (lambda_ / 2) * (sum(W^2) + sum(X^2)).
    """
    predictions = tf.linalg.matmul(X, tf.transpose(W)) + b
    # Multiplying by R zeroes the error wherever R is 0.
    masked_error = (predictions - Y) * R
    data_term = 0.5 * tf.reduce_sum(masked_error**2)
    reg_term = (lambda_/2) * (tf.reduce_sum(W**2) + tf.reduce_sum(X**2))
    return data_term + reg_term

In [26]:
# Vectorized cost on the reduced problem, without regularization.
J = cofi_cost_func_vectorized(X_r, W_r, b_r, Y_r, R_r, 0)
print(f"Cost: {J:0.2f}")

# Vectorized cost on the reduced problem, with regularization.
J = cofi_cost_func_vectorized(X_r, W_r, b_r, Y_r, R_r, lambda_=1.5)
print(f"Cost (with regularization): {J:0.2f}")

Cost: 13.67
Cost (with regularization): 28.09


In [28]:
# Movie metadata; the first CSV column becomes the DataFrame index.
movielist_df = pd.read_csv('./data/small_movie_list.csv', header=0, index_col=0,  delimiter=',', quotechar='"')
# Plain list of titles in row order. The frame is named `movielist_df`; the
# previous reference to an undefined `df` raised NameError on a fresh kernel.
movielist = movielist_df["title"].to_list()

In [31]:
# Ratings vector for a new user: one slot per movie, 0 meaning "not rated".
my_ratings = np.zeros(num_movies)

# (movie index, rating) pairs supplied by the new user.
for movie_idx, stars in (
    (2700, 5),
    (2609, 2),
    (929, 5),
    (246, 5),
    (2716, 3),
    (1150, 5),
    (382, 2),
    (366, 5),
    (622, 5),
    (988, 3),
    (2925, 1),
    (2937, 1),
    (793, 5),
):
    my_ratings[movie_idx] = stars

# Indices of the rated movies, in ascending order.
my_rated = [idx for idx, rating in enumerate(my_ratings) if rating > 0]

print('\nNew user ratings:\n')
for i in my_rated:
    print(f'Rated {my_ratings[i]} for  {movielist_df.loc[i,"title"]}')


New user ratings:

Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for  Incredibles, The (2004)
Rated 2.0 for  Persuasion (2007)
Rated 5.0 for  Toy Story 3 (2010)
Rated 3.0 for  Inception (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)


In [32]:
# Prepend the new user's ratings as column 0 of Y, and the matching
# rated/not-rated indicator as column 0 of R.
# NOTE: this cell rebinds Y and R, so running it twice adds a second column.
Y = np.column_stack((my_ratings, Y))
R = np.column_stack(((my_ratings != 0).astype(int), R))

# Per-movie mean over rated entries only; the 1e-12 keeps the division
# defined for movies with no ratings.
rated_sum = (Y * R).sum(axis=1)
rated_count = R.sum(axis=1) + 1e-12
Ymean = (rated_sum / rated_count).reshape(-1, 1)

# Subtract the mean only where a rating exists, so unrated entries stay as they were.
Ynorm = Y - Ymean * R

In [33]:
# Re-read the dimensions from the current Y (num_features is reused from the
# data-loading cell).
num_movies,num_users=Y.shape
# Seed TensorFlow's global RNG before drawing the initial parameters.
# NOTE(review): W, X and b are drawn in this order after seeding; reordering
# these lines may change the initial values and the losses printed below.
tf.random.set_seed(1234)
# Trainable parameters, randomly initialized as float64. These rebind the
# names W, X and b, replacing the NumPy arrays loaded from CSV earlier.
W=tf.Variable(tf.random.normal((num_users,num_features),dtype=tf.float64),name='W')
X=tf.Variable(tf.random.normal((num_movies,num_features),dtype=tf.float64),name='X')
b=tf.Variable(tf.random.normal((1,num_users),dtype=tf.float64),name='b')
# Adam optimizer with a learning rate of 0.1.
optimizer=keras.optimizers.Adam(learning_rate=1e-1)

In [34]:
iterations=200
lambda_=1
# The loop variable is `step`, not `iter`: a global named `iter` would replace
# the built-in iter() for every cell executed afterwards in this kernel.
for step in range(iterations):
    # Record the cost computation so it can be differentiated with respect
    # to X, W and b.
    with tf.GradientTape() as tape:
        # The cost is evaluated on the mean-normalized ratings Ynorm.
        cost_value=cofi_cost_func_vectorized(X,W,b,Ynorm,R,lambda_)
    grads=tape.gradient(cost_value,[X,W,b])
    # One optimizer update of all three parameter tensors.
    optimizer.apply_gradients(zip(grads,[X,W,b]))
    # Report progress every 20 iterations.
    if step%20==0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 267453.3
Training loss at iteration 20: 16320.3
Training loss at iteration 40: 9551.2
Training loss at iteration 60: 7243.0
Training loss at iteration 80: 6215.2
Training loss at iteration 100: 5690.9
Training loss at iteration 120: 5412.0
Training loss at iteration 140: 5251.8
Training loss at iteration 160: 5147.2
Training loss at iteration 180: 5072.2


In [37]:
# Predictions for every (movie, user) pair from the learned parameters.
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
# Add back the per-movie mean that was subtracted before training.
pm = p + Ymean
# Column 0 is the user whose ratings were prepended to Y.
my_predictions = pm[:, 0]
# Movie indices ordered from highest to lowest predicted rating.
ix = tf.argsort(my_predictions, direction='DESCENDING')

# Walk the 17 highest predictions, skipping movies the user already rated.
for rank in range(17):
    j = ix[rank]
    if j in my_rated:
        continue
    print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movielist[j]}')

print('\n\nOriginal vs Predicted ratings:\n')
# Compare the supplied ratings with the model's predictions for the same movies.
for i, given in enumerate(my_ratings):
    if given > 0:
        print(f'Original {given}, Predicted {my_predictions[i]:0.2f} for {movielist[i]}')

Predicting rating 5.59 for movie Argo (2012)
Predicting rating 5.37 for movie Black Mirror: White Christmas (2014)
Predicting rating 5.29 for movie Spy Kids (2001)
Predicting rating 5.25 for movie Big Hero 6 (2014)
Predicting rating 5.25 for movie Elite Squad (Tropa de Elite) (2007)
Predicting rating 5.21 for movie How to Train Your Dragon (2010)
Predicting rating 5.19 for movie Day of the Doctor, The (2013)
Predicting rating 5.05 for movie Moneyball (2011)
Predicting rating 5.04 for movie Dictator, The (2012)
Predicting rating 5.03 for movie Love Actually (2003)
Predicting rating 5.02 for movie Jackass: The Movie (2002)
Predicting rating 4.99 for movie Deathgasm (2015)
Predicting rating 4.99 for movie Martin Lawrence Live: Runteldat (2002)
Predicting rating 4.99 for movie Elite Squad: The Enemy Within (Tropa de Elite 2 - O Inimigo Agora É Outro) (2010)
Predicting rating 4.99 for movie The Raid: Redemption (2011)
Predicting rating 4.97 for movie Crash (2004)
Predicting rating 4.96 for 

In [39]:
# Boolean mask selecting movies with more than 20 ratings. It is named
# `popular_mask` rather than `filter` so the built-in filter() is not
# replaced in the kernel namespace for later cells.
popular_mask = (movielist_df["number of ratings"] > 20)
# Attach the new user's predictions and put the columns in display order.
movielist_df["pred"] = my_predictions
movielist_df = movielist_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"])
# The 300 highest predictions, restricted to the masked movies, shown by
# descending mean rating. This must stay the last expression so the notebook
# renders it as the cell output.
movielist_df.loc[ix[:300]].loc[popular_mask].sort_values("mean rating", ascending=False)

Unnamed: 0,pred,mean rating,number of ratings,title
929,4.584263,4.118919,185,"Lord of the Rings: The Return of the King, The..."
2700,4.655481,4.109091,55,Toy Story 3 (2010)
393,4.726625,4.106061,198,"Lord of the Rings: The Fellowship of the Ring,..."
3714,4.659733,4.050847,59,Guardians of the Galaxy (2014)
3527,4.585622,4.047619,21,Captain Phillips (2013)
653,4.750402,4.021277,188,"Lord of the Rings: The Two Towers, The (2002)"
3283,5.588243,3.982143,28,Argo (2012)
773,4.541688,3.960993,141,Finding Nemo (2003)
2851,4.624051,3.95,30,Limitless (2011)
2649,5.209564,3.943396,53,How to Train Your Dragon (2010)
