In [5]:
!pip install scikit-surprise

import pandas as pd
from surprise import accuracy, Dataset, SVD, SVDpp, NMF
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate



In [6]:
data = Dataset.load_builtin(name = 'ml-100k' , prompt = True)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [7]:
benchmark = []
algorithms = [SVD(), SVDpp(), NMF()]

for algorithm in algorithms:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']), tmp])
    benchmark.append(tmp)

In [8]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.917789,27.10702,5.118367
SVD,0.935811,1.735392,0.178543
NMF,0.962035,2.258209,0.170451


In [9]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = SVDpp()
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)

RMSE: 0.9241


0.9240808320917283

In [None]:
"Additional task"

In [33]:
import numpy as np
from scipy import optimize
from scipy.io import loadmat

In [41]:
def loadMovieList():
    """
    Reads the fixed movie list in movie_ids.txt and returns a list of movie names.
    Returns
    -------
    movieNames : list
        A list of strings, representing all movie names.
    """
    # Read the fixed movieulary list
    with open('movie_ids.txt',  encoding='ISO-8859-1') as fid:
        movies = fid.readlines()

    movieNames = []
    for movie in movies:
        parts = movie.split()
        movieNames.append(' '.join(parts[1:]).strip())
    return movieNames

In [48]:
names = loadMovieList()

In [49]:
len(names)

1682

In [72]:
# Load data
data = loadmat('movies.mat')
Y, R = data['Y'], data['R']

# Y is a 1682x943 matrix, containing ratings (1-5) of
# 1682 movies on 943 users

# R is a 1682x943 matrix, where R(i,j) = 1
# if and only if user j gave a rating to movie i

# From the matrix, we can compute statistics like average rating.
print('Average rating for movie 1601 (',names[1601] ,'): %f / 5' %
      np.mean(Y[180, R[1601, :]]))


Average rating for movie 1601 ( Price Above Rubies, A (1998) ): 4.984093 / 5


In [45]:
def normalizeRatings(Y, R):
    m, n = Y.shape
    Ymean = np.zeros(m)
    Ynorm = np.zeros(Y.shape)

    for i in range(m):
        idx = R[i, :] == 1
        Ymean[i] = np.mean(Y[i, idx])
        Ynorm[i, idx] = Y[i, idx] - Ymean[i]

    return Ynorm, Ymean

In [46]:
def computeNumericalGradient(J, theta, e=1e-4):
    numgrad = np.zeros(theta.shape)
    perturb = np.diag(e * np.ones(theta.shape))
    for i in range(theta.size):
        loss1, _ = J(theta - perturb[:, i])
        loss2, _ = J(theta + perturb[:, i])
        numgrad[i] = (loss2 - loss1)/(2*e)
    return numgrad

In [47]:
def checkCostFunction(cofiCostFunc, lambda_=0.):
    # Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = np.dot(X_t, Theta_t.T)
    Y[np.random.rand(*Y.shape) > 0.5] = 0
    R = np.zeros(Y.shape)
    R[Y != 0] = 1

    # Run Gradient Checking
    X = np.random.randn(*X_t.shape)
    Theta = np.random.randn(*Theta_t.shape)
    num_movies, num_users = Y.shape
    num_features = Theta_t.shape[1]

    params = np.concatenate([X.ravel(), Theta.ravel()])
    numgrad = computeNumericalGradient(
        lambda x: cofiCostFunc(x, Y, R, num_users, num_movies, num_features, lambda_), params)

    cost, grad = cofiCostFunc(params, Y, R, num_users,num_movies, num_features, lambda_)

    print(np.stack([numgrad, grad], axis=1))
    print('\nThe above two columns you get should be very similar.'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)')

    diff = np.linalg.norm(numgrad-grad)/np.linalg.norm(numgrad+grad)
    print('If your cost function implementation is correct, then '
          'the relative difference will be small (less than 1e-9).')
    print('\nRelative Difference: %g' % diff)

In [65]:


def cofiCostFunc(params, Y, R, num_users, num_movies,
                      num_features, lambda_=0.0):
    # Unfold the U and W matrices from params
    X = params[:num_movies*num_features].reshape(num_movies, num_features)
    Theta = params[num_movies*num_features:].reshape(num_users, num_features)

    # You need to return the following values correctly
    J = 0
    X_grad = np.zeros(X.shape)
    Theta_grad = np.zeros(Theta.shape)

    J = (1 / 2) * np.sum(np.square((X.dot(Theta.T) - Y) * R)) + (lambda_ / 2) * np.sum(np.square(X)) + \
                                                                (lambda_ / 2) * np.sum(np.square(Theta))

    for i in range(R.shape[0]):

        idx = np.where(R[i, :] == 1)[0]
        Theta_temp = Theta[idx, :]
        Y_temp = Y[i, idx]
        X_grad[i, :] = np.dot(np.dot(X[i, :], Theta_temp.T) - Y_temp, Theta_temp) + lambda_ * X[i, :]

    for j in range(R.shape[1]):

        idx = np.where(R[:, j] == 1)[0]
        X_temp = X[idx, :]
        Y_temp = Y[idx, j]
        Theta_grad[j, :] = np.dot(np.dot(X_temp, Theta[j, :]) - Y_temp, X_temp) + lambda_ * Theta[j, :]

    grad = np.concatenate([X_grad.ravel(), Theta_grad.ravel()])
    return J, grad



In [66]:
#  Check gradients by running checkcostFunction
checkCostFunction(cofiCostFunc, 1.5)

[[-1.93296765e+00 -1.93296765e+00]
 [ 7.10507079e-01  7.10507079e-01]
 [ 7.83306527e-01  7.83306527e-01]
 [-5.43008108e+00 -5.43008108e+00]
 [-8.58165706e+00 -8.58165706e+00]
 [ 4.64381580e+00  4.64381580e+00]
 [ 1.65967071e+00  1.65967071e+00]
 [ 3.84385335e+00  3.84385335e+00]
 [ 2.14240505e+00  2.14240505e+00]
 [ 1.61927751e+00  1.61927751e+00]
 [ 1.11413226e+00  1.11413226e+00]
 [ 3.95461144e-01  3.95461144e-01]
 [-2.34653944e+00 -2.34653944e+00]
 [ 1.33239962e+00  1.33239962e+00]
 [-1.08737837e-01 -1.08737837e-01]
 [ 8.79421156e+00  8.79421156e+00]
 [ 5.32953478e+00  5.32953478e+00]
 [-4.07324567e+00 -4.07324567e+00]
 [-6.06157431e-01 -6.06157431e-01]
 [ 1.28094966e+00  1.28094966e+00]
 [-1.15348494e+00 -1.15348494e+00]
 [-1.12154253e+00 -1.12154253e+00]
 [-3.85755489e+00 -3.85755489e+00]
 [-1.80727426e+00 -1.80727426e+00]
 [-2.86333044e-01 -2.86333044e-01]
 [-1.71603319e+00 -1.71603319e+00]
 [-3.79996424e-03 -3.79996421e-03]]

The above two columns you get should be very similar.

In [80]:
movieList = loadMovieList()

#  Before we will train the collaborative filtering model, we will first
#  add ratings that correspond to a new user that we just observed. This
#  part of the code will also allow you to put in your own ratings for the
#  movies in our dataset!
n_m = len(movieList)
#  Initialize my ratings
my_ratings = np.zeros(n_m)
# We have selected a few movies we liked / did not like and the ratings we
# gave are as follows:
my_ratings[596] = 2
my_ratings[1601] = 5
my_ratings[666] = 5
my_ratings[11]= 1
my_ratings[53] = 5
my_ratings[63] = 1
my_ratings[999] = 5
my_ratings[364] = 3
my_ratings[688] = 3
my_ratings[182] = 5
my_ratings[225] = 5
my_ratings[354] = 2
my_ratings[1122] = 4
my_ratings[124] = 4
my_ratings[924] = 1

In [81]:
#  Add our own ratings to the data matrix
Y = np.hstack([my_ratings[:, None], Y])
R = np.hstack([(my_ratings > 0)[:, None], R])

#  Normalize Ratings
Ynorm, Ymean = normalizeRatings(Y, R)

#  Useful Values
num_movies, num_users = Y.shape
num_features = 7

# Set Initial Parameters (Theta, X)
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)

initial_parameters = np.concatenate([X.ravel(), Theta.ravel()])

# Set options for scipy.optimize.minimize
options = {'maxfun': 100}

# Set Regularization
lambda_ = 10
res = optimize.minimize(lambda x: cofiCostFunc(x, Ynorm, R, num_users,
                                               num_movies, num_features, lambda_),
                        initial_parameters,
                        method='TNC',
                        jac=True,
                        options=options)
theta = res.x

# Unfold the returned theta back into U and W
X = theta[:num_movies*num_features].reshape(num_movies, num_features)
Theta = theta[num_movies*num_features:].reshape(num_users, num_features)

In [82]:
# Make recommendations by computing the predictions matrix
p = np.dot(X, Theta.T)
my_predictions = p[:, 0] + Ymean

movieList = loadMovieList()

ix = np.argsort(my_predictions)[::-1]

print('Top recommendations for you:')
print('----------------------------')
for i in range(15):
    j = ix[i]
    print('Predicting rating %.1f for movie %s' % (my_predictions[j], movieList[j]))

print('\nOriginal ratings provided:')
print('--------------------------')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print('Rated %d for %s' % (my_ratings[i], movieList[i]))

Top recommendations for you:
----------------------------
Predicting rating 5.0 for movie Great Day in Harlem, A (1994)
Predicting rating 5.0 for movie Prefontaine (1997)
Predicting rating 5.0 for movie Someone Else's America (1995)
Predicting rating 5.0 for movie Santa with Muscles (1996)
Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996)
Predicting rating 5.0 for movie Aiqing wansui (1994)
Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996)
Predicting rating 5.0 for movie They Made Me a Criminal (1939)
Predicting rating 5.0 for movie Star Kid (1997)
Predicting rating 5.0 for movie Saint of Fort Washington, The (1993)
Predicting rating 4.6 for movie Pather Panchali (1955)
Predicting rating 4.6 for movie Wrong Trousers, The (1993)
Predicting rating 4.5 for movie Everest (1998)
Predicting rating 4.5 for movie Anna (1996)
Predicting rating 4.5 for movie Some Mother's Son (1996)

Original ratings provided:
--------------------------
Rat