In [1]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
import scipy.optimize as opt
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = loadmat("ex8_movies.mat")
Y = df['Y']
R = df['R']
print(Y.shape,R.shape)
# R[m,n]=1/0 contains if nth u1ser has rated mth movie or not
# Y[m,n] contains rating of mth movie by nth user
# So there are 1682 movies and 943 users

(1682, 943) (1682, 943)


In [3]:
# Load PreTrained Weights
df = loadmat("ex8_movieParams.mat")
X = df['X']
Theta = df['Theta']
# X contains feature vector for each 1682 movies
X.shape
# Reduce the size of X to run faster

(1682, 10)

In [4]:
num_users = 4
num_movies = 5
num_features = 3
X = X[:num_movies,:num_features];
Theta = Theta[:num_users,:num_features];
# Note Y and R have same dim for selecting only those movies that have been rated
Y = Y[:num_movies,:num_users];
R = R[:num_movies, :num_users];

In [5]:
def CostFunction(params, Y, R, num_users,num_movies,num_features,l):
    '''
        Collaborative Filtering
    '''
    X = np.reshape(params[:(num_movies*num_features)],(num_movies,num_features))
    Theta = np.reshape(params[(num_movies*num_features):],(num_users,num_features))
    J = 0
    
    X_grad = np.zeros(X.shape)
    Theta_grad = np.zeros(Theta.shape)
    
    # X(feature vector for each movie) - num_movies by num_features
    # Theta(user liking) - num_users by num_features
    # Y(user rating of movies) - num_movies * num_users
    # R - num_movies by num_users 1 if user has rated else zero
    
    hx = X @ Theta.T
    # To Consider only movies that have only been rated(no need of multiplication with Theta here)
    minus = hx[R==1] - Y[R==1]
    tosum = minus ** 2
    J = 0.5 * sum(tosum)
    # Double sum of both X and Theta
    # No bias terms are included here
    J = J + ((l/2)* np.sum(X ** 2)) + ((l/2) * np.sum(Theta ** 2))
    
    # Calculate Grad
    # Since we need to multiply it with Theta so preserve zero also
    fhx = (hx - Y) * R
    X_grad = fhx @ Theta
    Theta_grad = fhx.T @ X
    # Regularize
    X_grad = X_grad + (l * X)
    Theta_grad = Theta_grad + (l * Theta)
    
    grad = np.hstack((X_grad.flatten(),Theta_grad.flatten()))
    return J,grad

In [6]:
params = np.hstack((X.flatten(),Theta.flatten()))
# Check Cost at lambda = 0
J,_=CostFunction(params, Y, R, num_users,num_movies,num_features,0)
J

22.22460372568567

In [7]:
# Check Cost with Lambda = 1.5
J,_=CostFunction(params, Y, R, num_users,num_movies,num_features,1.5)
J

31.344056244274217

In [8]:
df = pd.read_csv("movie_ids.txt",sep=r'\s{2,}',header=None,engine='python')
movielist = df[0]
my_ratings = np.zeros((1682,1))
# Rate Some Movies
my_ratings[0] = 5
my_ratings[28] = 3
my_ratings[40] = 5
my_ratings[49] = 4
my_ratings[63] = 4
my_ratings[71] = 2
my_ratings[68] = 2
my_ratings[81] = 2
my_ratings[95] = 4
my_ratings[194] = 4
my_ratings[203] = 4
my_ratings[209] = 3
my_ratings[256] = 2
my_ratings[540] = 3
my_ratings[889] = 4

In [9]:
for i in range(len(my_ratings)):
    if(my_ratings[i]>0):
        print(f"Rated {movielist[i]} with {my_ratings[i]}")

Rated 1 Toy Story (1995) with [5.]
Rated 29 Batman Forever (1995) with [3.]
Rated 41 Billy Madison (1995) with [5.]
Rated 50 Star Wars (1977) with [4.]
Rated 64 Shawshank Redemption, The (1994) with [4.]
Rated 69 Forrest Gump (1994) with [2.]
Rated 72 Mask, The (1994) with [2.]
Rated 82 Jurassic Park (1993) with [2.]
Rated 96 Terminator 2: Judgment Day (1991) with [4.]
Rated 195 Terminator, The (1984) with [4.]
Rated 204 Back to the Future (1985) with [4.]
Rated 210 Indiana Jones and the Last Crusade (1989) with [3.]
Rated 257 Men in Black (1997) with [2.]
Rated 541 Mortal Kombat (1995) with [3.]
Rated 890 Mortal Kombat: Annihilation (1997) with [4.]


In [10]:
df = loadmat("ex8_movies.mat")
Y = df['Y']
R = df['R']
# Add our ratings
Y = np.hstack((my_ratings,Y))
R = np.hstack((my_ratings!=0,R))

In [11]:
def NormalizeRatings(Y,R):
    m,n = Y.shape
    # Mean for each movie
    Ymean = np.zeros((m,1))
    Ynorm = np.zeros(Y.shape)
    for i in range(m):
        # Only rated ones
        idx = R[i,:] == 1
        Ymean[i] = np.mean(Y[i,idx])
        Ynorm[i,idx] = Y[i,idx] - Ymean[i]
    return Ynorm,Ymean

In [12]:
# Normalized Ratings and Mean
Ynorm,Ymean = NormalizeRatings(Y,R)
num_users = Y.shape[1]
num_movies = Y.shape[0]
num_features = 10
# Set Initial Parameters
X = np.random.randn(num_movies,num_features)
Theta = np.random.randn(num_users,num_features)

In [13]:
# Train
params = np.hstack((X.flatten(),Theta.flatten()))
l = 10
fmin = opt.minimize(fun=CostFunction,x0=params,args=(Ynorm, R, num_users,num_movies,num_features,l),method='CG',jac=True,options={'maxiter':1000})
fmin

     fun: 38954.149977778194
     jac: array([ 4.19150348e-08, -1.35804620e-06,  3.09242409e-10, ...,
       -1.01921708e-06,  2.74753840e-07,  1.92310061e-06])
 message: 'Desired error not necessarily achieved due to precision loss.'
    nfev: 638
     nit: 311
    njev: 624
  status: 2
 success: False
       x: array([ 0.18843696, -0.34303228, -0.28211742, ...,  0.23439223,
       -0.07333137, -0.41374487])

In [14]:
# Reshape
X = np.reshape(fmin.x[:(num_movies*num_features)],(num_movies,num_features))
Theta = np.reshape(fmin.x[(num_movies*num_features):],(num_users,num_features))
# Predictions
p = X @ Theta.T
# Lets check My Predicted Movie Ratings (Its at index 0)
my_predictions = p[:,0] + Ymean
# pred_sorted contains Top 20 Movie Predictions
pred = my_predictions[:,0]
np.sort(pred)[::-1][:25]

array([5.00361987, 5.00361987, 5.00361987, 5.00361987, 5.00361987,
       5.00361987, 5.00361987, 5.00361987, 5.00361987, 5.00361987,
       4.62861987, 4.50361987, 4.50361987, 4.50361987, 4.50361987,
       4.49469129, 4.47006282, 4.46972156, 4.46040999, 4.45138106,
       4.44728184, 4.39117967, 4.38938766, 4.36149658, 4.34761987])

In [16]:
# Argsort to get movie label
idx = np.argsort(pred,axis=0)[::-1]

# Top Recommended Movies Recommended to Me
for i in range(25):
    movieid = idx[i]
    moviename = ' '.join(movielist[movieid].split()[1:])
    print(f'Predicted Rating of {int(pred[movieid])} for movie {moviename}')

Predicted Rating of 5 for movie Star Kid (1997)
Predicted Rating of 5 for movie Saint of Fort Washington, The (1993)
Predicted Rating of 5 for movie Entertaining Angels: The Dorothy Day Story (1996)
Predicted Rating of 5 for movie Great Day in Harlem, A (1994)
Predicted Rating of 5 for movie They Made Me a Criminal (1939)
Predicted Rating of 5 for movie Someone Else's America (1995)
Predicted Rating of 5 for movie Marlene Dietrich: Shadow and Light (1996)
Predicted Rating of 5 for movie Prefontaine (1997)
Predicted Rating of 5 for movie Santa with Muscles (1996)
Predicted Rating of 5 for movie Aiqing wansui (1994)
Predicted Rating of 4 for movie Pather Panchali (1955)
Predicted Rating of 4 for movie Some Mother's Son (1996)
Predicted Rating of 4 for movie Maya Lin: A Strong Clear Vision (1994)
Predicted Rating of 4 for movie Anna (1996)
Predicted Rating of 4 for movie Everest (1998)
Predicted Rating of 4 for movie Close Shave, A (1995)
Predicted Rating of 4 for movie Schindler's List (