Here we develop a **matrix factorization**-based recommender system using the built-in MovieLens 100k (`ml-100k`) movie-ratings dataset.

In [1]:
# Install the Surprise package (uncomment the next line to run it).
# %pip install scikit-surprise

In [2]:
from collections import defaultdict

from surprise import SVD, Dataset

In [3]:
# Load the built-in MovieLens 100k ratings dataset.
# prompt=False is passed so the call does not ask for confirmation before downloading.
data = Dataset.load_builtin('ml-100k',prompt=False)

In [4]:
# Build a trainset from every rating in the dataset (no hold-out split).
trainset = data.build_full_trainset()

In [5]:
# Display the ratings given by each user:
# {user_inner_id: [(item_inner_id, rating), ...]}
# Note: the ids shown here are Surprise's inner integer ids, not the raw ids
# (strings such as '196') that appear in the test set and predictions.
trainset.ur

defaultdict(list,
            {0: [(0, 3.0),
              (528, 4.0),
              (377, 4.0),
              (522, 3.0),
              (431, 5.0),
              (834, 5.0),
              (380, 4.0),
              (329, 4.0),
              (550, 5.0),
              (83, 4.0),
              (632, 2.0),
              (86, 4.0),
              (289, 5.0),
              (363, 3.0),
              (438, 5.0),
              (389, 5.0),
              (649, 4.0),
              (947, 4.0),
              (423, 3.0),
              (291, 3.0),
              (10, 2.0),
              (1006, 4.0),
              (179, 3.0),
              (751, 3.0),
              (487, 3.0),
              (665, 3.0),
              (92, 4.0),
              (512, 5.0),
              (1045, 3.0),
              (672, 4.0),
              (656, 4.0),
              (221, 5.0),
              (432, 2.0),
              (365, 3.0),
              (321, 2.0),
              (466, 4.0),
              (302, 4.0),
              (491, 3

In [6]:
# Initialise the SVD matrix-factorization algorithm and fit it on the full trainset.
# SVD starts from randomly initialised factors, so a fixed random_state (seed)
# is passed to make the fitted model -- and the recommendations derived from
# it -- reproducible across "Restart Kernel -> Run All".
algo = SVD(random_state=42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21cba38dcd0>

In [7]:
# Build the "anti" test set: all (user, item) pairs that are NOT in the
# training set, i.e. the movies each user has not rated yet. Ratings for these
# pairs are predicted in a later cell.
testset = trainset.build_anti_testset()

In [8]:
# Each entry is (raw_user_id, raw_movie_id_not_rated, fill_value), where the
# fill value is the global mean rating used as a placeholder "true" rating.
# NOTE(review): this displays the entire list; consider a slice such as
# testset[:5] to keep the output short.
testset

[('196', '302', 3.52986),
 ('196', '377', 3.52986),
 ('196', '51', 3.52986),
 ('196', '346', 3.52986),
 ('196', '474', 3.52986),
 ('196', '265', 3.52986),
 ('196', '465', 3.52986),
 ('196', '451', 3.52986),
 ('196', '86', 3.52986),
 ('196', '1014', 3.52986),
 ('196', '222', 3.52986),
 ('196', '40', 3.52986),
 ('196', '29', 3.52986),
 ('196', '785', 3.52986),
 ('196', '387', 3.52986),
 ('196', '274', 3.52986),
 ('196', '1042', 3.52986),
 ('196', '1184', 3.52986),
 ('196', '392', 3.52986),
 ('196', '486', 3.52986),
 ('196', '144', 3.52986),
 ('196', '118', 3.52986),
 ('196', '1', 3.52986),
 ('196', '546', 3.52986),
 ('196', '95', 3.52986),
 ('196', '768', 3.52986),
 ('196', '277', 3.52986),
 ('196', '234', 3.52986),
 ('196', '246', 3.52986),
 ('196', '98', 3.52986),
 ('196', '193', 3.52986),
 ('196', '88', 3.52986),
 ('196', '194', 3.52986),
 ('196', '1081', 3.52986),
 ('196', '603', 3.52986),
 ('196', '796', 3.52986),
 ('196', '32', 3.52986),
 ('196', '16', 3.52986),
 ('196', '304', 3.5

In [9]:
# Estimate a rating for every (user, movie) pair in the anti test set.
predictions = algo.test(testset)

In [10]:
# Each Prediction holds (uid, iid, r_ui, est, details):
#   uid / iid - raw user id / raw movie id
#   r_ui      - the placeholder "true" rating (the global mean filled in above)
#   est       - the rating estimated by the algorithm
#   details   - extra info, e.g. {'was_impossible': False}
# NOTE(review): this displays the entire list; consider a slice such as
# predictions[:5] to keep the output short.
predictions

[Prediction(uid='196', iid='302', r_ui=3.52986, est=4.12428202067203, details={'was_impossible': False}),
 Prediction(uid='196', iid='377', r_ui=3.52986, est=2.6990175463300075, details={'was_impossible': False}),
 Prediction(uid='196', iid='51', r_ui=3.52986, est=3.062643599949275, details={'was_impossible': False}),
 Prediction(uid='196', iid='346', r_ui=3.52986, est=3.4631497520588503, details={'was_impossible': False}),
 Prediction(uid='196', iid='474', r_ui=3.52986, est=4.261394532064005, details={'was_impossible': False}),
 Prediction(uid='196', iid='265', r_ui=3.52986, est=3.547405645502848, details={'was_impossible': False}),
 Prediction(uid='196', iid='465', r_ui=3.52986, est=3.495582926137719, details={'was_impossible': False}),
 Prediction(uid='196', iid='451', r_ui=3.52986, est=3.3010460212629704, details={'was_impossible': False}),
 Prediction(uid='196', iid='86', r_ui=3.52986, est=3.7631085828518422, details={'was_impossible': False}),
 Prediction(uid='196', iid='1014', r

A prediction can be impossible to calculate in a **cold start** situation: when a user has not rated or watched any movie, or when a movie has not been watched by any user, we cannot come up with their characteristics.

In [11]:
def get_top_n(predictions, n=10):
    """Return the ``n`` highest-estimated items for every user.

    Args:
        predictions: Iterable of 5-item records that unpack as
            ``(uid, iid, true_rating, estimate, details)``.
        n: Number of recommendations to keep per user (default 10).

    Returns:
        A ``defaultdict(list)`` mapping each ``uid`` to a list of
        ``(iid, estimate)`` tuples ordered from highest to lowest estimate and
        truncated to the first ``n`` entries.
    """
    recommendations = defaultdict(list)

    # Group every (item, estimate) pair under the user it was predicted for.
    for prediction in predictions:
        uid, iid, _true_rating, est, _details = prediction
        recommendations[uid].append((iid, est))

    # Rank each user's candidates by estimate (best first) and keep the top n.
    # Only existing keys are reassigned, so iterating over the dict is safe.
    for uid in recommendations:
        ranked = sorted(
            recommendations[uid],
            key=lambda pair: pair[1],
            reverse=True,
        )
        recommendations[uid] = ranked[:n]

    return recommendations

In [12]:
# Keep the 10 highest-estimated not-yet-rated movies for every user.
top_n = get_top_n(predictions, n=10)

In [13]:
# Recommendations to present to each user:
# {raw_user_id: [(raw_movie_id, estimated_rating), ...]}, best first.
top_n

defaultdict(list,
            {'196': [('169', 4.628467524815306),
              ('178', 4.595885208695818),
              ('603', 4.506951288113721),
              ('134', 4.479502749846128),
              ('408', 4.422348668053786),
              ('496', 4.401979214730408),
              ('516', 4.353654882440636),
              ('272', 4.3396611594992915),
              ('513', 4.312690132291686),
              ('483', 4.310034461401132)],
             '186': [('427', 4.732705104357049),
              ('178', 4.728504429232632),
              ('496', 4.661800875706556),
              ('483', 4.622469617132441),
              ('512', 4.514130337943002),
              ('357', 4.489166076285036),
              ('318', 4.480404067983484),
              ('15', 4.476028680440171),
              ('197', 4.451856143211282),
              ('513', 4.437852517244628)],
             '22': [('12', 4.821609513273356),
              ('114', 4.693146771506547),
              ('603', 4.5830915935960