In [94]:
# Load Librarys
import numpy as np
import scipy.io as sio
import scipy.optimize as op

In [95]:
# Load and visulaize data
data = sio.loadmat('ex8_movies.mat')
R = data['R']
Y = data['Y']

# Calculate and print the average rating of the first movie
ratingList = Y[1, np.where(R[1, :])]
averageRating = ratingList.sum() / (ratingList.size)
print('Average rating for movie 1 (Toy Story): {}'.format(averageRating))

Average rating for movie 1 (Toy Story): 3.2061068702290076


In [96]:
# For the results of this algorithm to exactly match those of the coursera forum tests, the flattening/packing 
# of the  parameters must be done in a fortran way, like octave and matlab would. Numpy defaults to C like
def pack_thetas(X, Thetas):
    return np.concatenate((X.flatten(order='F'), Thetas.flatten(order='F')))
    
def unpack_thetas(params, num_users, num_movies, num_features):
    X = params[:(num_movies * num_features)].reshape([num_movies, num_features], order='F')
    Theta = params[num_movies * num_features:].reshape([num_users, num_features], order='F')
    return X, Theta

def cofiCostFuncRegularized(params, Y, R, num_users, num_movies, num_features, lamda):
        
    X, Theta = unpack_thetas(params, num_users, num_movies, num_features)

    # The squared difference the predicted rating and actual rating
    scorePredictions = (np.dot(X, Theta.T) - Y) ** 2
    
    # Uses the fact that for movies a user hasn't rated, the entry in R in 0. Therefore removing all possible errors
    scorePredictions = scorePredictions * R
    
    cost = scorePredictions.sum() / 2
    
    regularization_term = ((lamda / 2) * (Theta**2).sum()) + ((lamda / 2) * (X**2).sum())
      
    return cost + regularization_term

# Implement with for loops for comparison
def cofiGradFuncRegularized(params, Y, R, num_users, num_movies, num_features, lamda):
    
    X, Theta = unpack_thetas(params, num_users, numOfMovies, numOfFeatures)
    
    # Common part to both the X and Theta partial derivitives
    tempGrad = (np.dot(X, Theta.T) - Y) * R
    
    X_grad = np.dot(tempGrad, Theta)
    Theta_grad = np.dot(X.T, tempGrad).T
    
    X_regularization_term = lamda * X
    Theta_regularization_term = lamda * Theta
    
    X_grad_reg = X_grad + X_regularization_term
    Theta_grad_reg = Theta_grad + Theta_regularization_term
    
    return pack_thetas(X_grad_reg, Theta_grad_reg)


In [97]:
# Load a set of predefined values for X and Theta
data = sio.loadmat('ex8_movieParams.mat')
X = data['X']
Theta = data['Theta']

# Reduce the data set size so that this runs faster
numOfUsers = 4
numOfMovies = 5
numOfFeatures = 3
X_slice = X[:numOfMovies, :numOfFeatures]
Theta = Theta[:numOfUsers, :numOfFeatures]
Y_slice = Y[:numOfMovies, :numOfUsers]
R_slice = R[:numOfMovies, :numOfUsers]

# This cofiCostFunc shoud return a value of around 22.22
params = pack_thetas(X_slice, Theta)
print("Initial Cost: ", cofiCostFuncRegularized(params, Y_slice, R_slice, numOfUsers, numOfMovies, numOfFeatures, 0))

myArgs = (Y_slice, R_slice, numOfUsers, numOfMovies, numOfFeatures, 0)
result = op.minimize(cofiCostFuncRegularized, params, args=myArgs, jac=cofiGradFuncRegularized, method='BFGS')
optimal_params = result.x

print("Cost after optization: ", cofiCostFuncRegularized(optimal_params, Y_slice, R_slice, numOfUsers, numOfMovies, numOfFeatures, 0))


Initial Cost:  22.2246037257
Cost after optization:  6.73161745235e-13


In [98]:
def normalizeRatings(Y, R):
    
    m, n = Y.shape
    
    Ymean = np.zeros([m, 1])
    Ynorm = np.zeros([m , n])
    
    for i in range(m):
        idx = np.where(R[i, :] == 1)
        Ymean[i] = np.mean(Y[i, idx])
        Ynorm[i, idx] = Y[i, idx] - Ymean[i]
        
    return Ynorm, Ymean    


In [99]:
# Load and visulaize data
data = sio.loadmat('ex8_movies.mat')
R = data['R']
Y = data['Y']

# Load movie titles into file
f = open('movie_ids.txt', 'r')
movieList = np.array([x for x in f])

#print(movieList)
my_ratings = np.zeros([1682, 1]);

# Add some ratings
my_ratings[0] = 4;
my_ratings[97] = 2;
my_ratings[6] = 3;
my_ratings[11]= 5;
my_ratings[53] = 4;
my_ratings[63]= 5;
my_ratings[65]= 3;
my_ratings[68] = 5;
my_ratings[182] = 4;
my_ratings[225] = 5;
my_ratings[354]= 5;

# Add my new rating to the list of ratings
Y = np.hstack((my_ratings, Y))

# Create R array for my new user preferences
myR = np.where(my_ratings != 0, 1, 0)

# Add it to the main R array
R = np.hstack((myR, R))

# Normalize my ratings
Ynorm , Ymean = normalizeRatings(Y, R)

# Useful variables
numOfUsers = Y.shape[1]
numOfMovies = Y.shape[0]
numOfFeatures = 10

# Randomly initialize the parameters
X =  np.random.randn(numOfMovies, numOfFeatures)
Theta = np.random.randn(numOfUsers, numOfFeatures)
lamda = 10

# Flatten params into single array
params = pack_thetas(X, Theta)

myargs = (Y, R, numOfUsers, numOfMovies, numOfFeatures, lamda)
opts = {'maxiter': 100}

#result = op.minimize(cofiCostFuncRegularized, params, args=myargs, jac=cofiGradFuncRegularized, method='L-BFGS-B' , options=opts)
#optimal_parmas = result.x
#print(result)

res1 = op.optimize.fmin_cg(f=cofiCostFuncRegularized, x0=params, fprime=cofiGradFuncRegularized, args=myargs, **opts)



         Current function value: 71910.379197
         Iterations: 200
         Function evaluations: 303
         Gradient evaluations: 303


In [100]:
# Unpack the found parameters
X_optimal, Theta_optimal = unpack_thetas(res1, numOfUsers, numOfMovies, numOfFeatures)

# predicted value of the model
p = np.dot(X_optimal, Theta_optimal.T)

# Unnormalize
predictions = p + Ymean
my_predictions = predictions[:, 0]

# Find the index of the best predicted movies to watch. Sorted from best predicted rating to worst
sort = np.argsort(my_predictions)[::-1]

print("Top recomendations for you:")
for movie_index in range(10):
    movie = movieList[sort[movie_index]]
    prediction = my_predictions[sort[movie_index]]
    print("Predicting rating {} for movie {}".format(prediction, movie))
    
    
print((my_ratings.size))
    
print("Original ratings provided:")
for movie_index in range(my_ratings.size):
    if my_ratings[movie_index] > 0:
        print("Rated {} for {}".format(my_ratings[movie_index], movieList[movie_index]))
    

Top recomendations for you:
Predicting rating 8.545845589324259 for movie 313 Titanic (1997)

Predicting rating 8.537472986810753 for movie 50 Star Wars (1977)

Predicting rating 8.32430800244579 for movie 64 Shawshank Redemption, The (1994)

Predicting rating 8.242771502323919 for movie 174 Raiders of the Lost Ark (1981)

Predicting rating 8.226280241373082 for movie 318 Schindler's List (1993)

Predicting rating 8.188855939229652 for movie 272 Good Will Hunting (1997)

Predicting rating 8.07680362117411 for movie 172 Empire Strikes Back, The (1980)

Predicting rating 8.05594562333999 for movie 12 Usual Suspects, The (1995)

Predicting rating 8.037330718974326 for movie 127 Godfather, The (1972)

Predicting rating 8.015913198716248 for movie 22 Braveheart (1995)

1682
Original ratings provided:
Rated [ 4.] for 1 Toy Story (1995)

Rated [ 3.] for 7 Twelve Monkeys (1995)

Rated [ 5.] for 12 Usual Suspects, The (1995)

Rated [ 4.] for 54 Outbreak (1995)

Rated [ 5.] for 64 Shawshank Rede

######################################################################################################################################################################################################################################

In [102]:
'Everything Below is for tests described on the coursera forum for the cost and gradient functions'

R = np.array([[1, 0, 1,], [1, 1, 1], [0, 0, 0], [1, 1, 0]])
params = np.array([0.10000,   0.20000,   0.30000,   0.40000,   0.50000,   0.60000,   0.70000,   0.80000,  0.90000,   1.00000,   1.10000,   1.20000,   1.30000,   1.40000])

Y = np.array([ [16, 2, 3, 13], [5, 11, 10, 8], [9, 7, 6, 12], [4, 14, 15, 1] ])
Y = Y[:, :3]

numOfUsers = 3
numOfMovies = 4
numOfFeatures = 2
lamda = 6.0

cost = cofiCostFuncRegularized(params, Y, R, numOfUsers, numOfMovies, numOfFeatures, lamda)
gradient = cofiGradFuncRegularized(params, Y, R, numOfUsers, numOfMovies, numOfFeatures, lamda)
print("Cost: ", cost)
print("Gradient: ", gradient)


Cost:  331.0811
Gradient:  [-15.588 -22.344   1.8   -12.572 -18.438 -26.862   4.2   -14.744   1.977
  -1.028   4.593  -5.059  -8.26    1.941]
