In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from scipy import optimize

In [2]:
data = loadmat(r"./movies.mat")

In [3]:
Y,r = data['Y'],data['R']

The given data has 1682 movies and 943 users 

In [4]:
data2 = preTrained_data = loadmat(r"./movieParameters.mat")

In [5]:
def CostFunction(Y,r,params,num_movies,num_users,num_features,lambda_ = 0.0):
    
    #Extract and Unravel the data from params
    X = params[:num_movies*num_features].reshape(num_movies,num_features)
    
    theta = params[num_movies*num_features:].reshape(num_users,num_features)
    
    #Calculate the cost
    J = (1/2)*np.sum(np.square(X.dot(theta.T) - Y)*r) + (lambda_/2)*np.sum(np.square(X)) + (lambda_/2)*np.sum(np.square(theta))
    
    #calculate gradients
    X_grad = np.dot((X.dot(theta.T) - Y)*r,theta) + (lambda_)*X
    
    theta_grad = np.dot(((X.dot(theta.T) - Y)*r).T,X) + (lambda_)*theta
    
    grad = np.concatenate([X_grad.ravel(),theta_grad.ravel()])
    
    return J,grad

In [6]:
pretrained_X,pretrained_Theta = data2['X'],data2['Theta']

In [7]:
#  Reducing the data set size so that this runs faster
num_users = 4
num_movies = 5
num_features = 3

pretrained_X = pretrained_X[:num_movies, :num_features]
pretrained_Theta = pretrained_Theta[:num_users, :num_features]
pretrained_Y = Y[:num_movies, 0:num_users]
pretrained_R = r[:num_movies, 0:num_users]

pretrained_params = np.concatenate([pretrained_X.ravel(),pretrained_Theta.ravel()])

In [8]:
J,grad = CostFunction(pretrained_Y,pretrained_R,pretrained_params,num_movies,num_users,num_features)

print('Cost at loaded parameters:  %.2f' % J)

Cost at loaded parameters:  22.22


In [9]:
J,grad = CostFunction(pretrained_Y,pretrained_R,pretrained_params,num_movies,num_users,num_features,1.5)

print('Cost at loaded parameters:  %.2f' % J)

Cost at loaded parameters:  31.34


In [10]:
def LoadMoviesList():
    with open(r"./movie_ids.txt",  encoding='ISO-8859-1') as fid:
        movies = fid.readlines()
        
    movieNames = []
    for movie in movies:
        parts = movie.split()
        movieNames.append(' '.join(parts[1:]).strip())
    return movieNames

In [11]:
movieList = LoadMoviesList()

In [12]:
movieList

['Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Twelve Monkeys (1995)',
 'Babe (1995)',
 'Dead Man Walking (1995)',
 'Richard III (1995)',
 'Seven (Se7en) (1995)',
 'Usual Suspects, The (1995)',
 'Mighty Aphrodite (1995)',
 'Postino, Il (1994)',
 "Mr. Holland's Opus (1995)",
 'French Twist (Gazon maudit) (1995)',
 'From Dusk Till Dawn (1996)',
 'White Balloon, The (1995)',
 "Antonia's Line (1995)",
 'Angels and Insects (1995)',
 'Muppet Treasure Island (1996)',
 'Braveheart (1995)',
 'Taxi Driver (1976)',
 'Rumble in the Bronx (1995)',
 'Birdcage, The (1996)',
 'Brothers McMullen, The (1995)',
 'Bad Boys (1995)',
 'Apollo 13 (1995)',
 'Batman Forever (1995)',
 'Belle de jour (1967)',
 'Crimson Tide (1995)',
 'Crumb (1994)',
 'Desperado (1995)',
 'Doom Generation, The (1995)',
 'Free Willy 2: The Adventure Home (1995)',
 'Mad Love (1995)',
 'Nadja (1994)',
 'Net, The (1995

In [13]:
n_m = len(movieList)

#I will assign few movies some ratings
my_ratings = np.zeros(n_m)

my_ratings[movieList.index("Star Wars (1977)")] = 5

my_ratings[movieList.index("Stargate (1994)")] = 3

my_ratings[movieList.index("Ace Ventura: Pet Detective (1994)")] = 4

my_ratings[movieList.index("Blade Runner (1982)")] = 2

my_ratings[movieList.index("So I Married an Axe Murderer (1993)")] = 1


In [14]:
def normalize_rating(Y,r):
    Y_mean = np.zeros(len(Y))
    Y_norm = np.zeros(Y.shape)
    
    for i in range(len(Y)):
        idx = r[i, :] == 1
        Y_mean[i] = np.mean(Y[i, idx])
        Y_norm[i, idx] = Y[i, idx] - Y_mean[i]

    return Y_norm, Y_mean

In [15]:
Y = np.hstack([my_ratings[:, None], Y])
R = np.hstack([(my_ratings > 0)[:, None], r])

In [16]:
Ynorm, Ymean = normalize_rating(Y, R)

In [17]:
#Initialize the required parameters

num_movies, num_users = Y.shape
num_features = 10

# Set Initial Parameters (Theta, X)
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)

In [18]:
# Set options for scipy.optimize.minimize
options = {'maxiter': 100}
initial_paramters = np.concatenate([X.ravel(),Theta.ravel()])

# Set Regularization
lambda_ = 10

res = optimize.minimize(lambda x: CostFunction(Ynorm,R,x,num_movies,num_users,num_features,lambda_),
                        initial_paramters,
                        method='TNC',
                        jac=True,
                        options=options)

trained_params = res.x

In [19]:
X_trained = trained_params[:num_movies*num_features].reshape(num_movies,num_features)

theta_trained = trained_params[num_movies*num_features:].reshape(num_users,num_features)

In [20]:
#Predict 
predictions = X_trained.dot(theta_trained.T)

In [21]:
#I had added my predictions ratings on the first column

my_predictions = predictions[:,0] + Ymean

In [22]:
desc_pred_sorted = np.argsort(my_predictions)[::-1]

#Printing top 10 movie Recommendations based on my initial ratings
for i in range(10):
    j = desc_pred_sorted[i]
    print('Predicting rating %.1f for movie %s' % (my_predictions[j], movieList[j]))

Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996)
Predicting rating 5.0 for movie They Made Me a Criminal (1939)
Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996)
Predicting rating 5.0 for movie Prefontaine (1997)
Predicting rating 5.0 for movie Someone Else's America (1995)
Predicting rating 5.0 for movie Great Day in Harlem, A (1994)
Predicting rating 5.0 for movie Santa with Muscles (1996)
Predicting rating 5.0 for movie Star Kid (1997)
Predicting rating 5.0 for movie Saint of Fort Washington, The (1993)
Predicting rating 5.0 for movie Aiqing wansui (1994)
