In [7]:
import pandas as pd
import numpy as np
from typing import List, Dict
from IPython.display import display, HTML, Markdown

from surprise import SVD, NMF, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

import warnings
warnings.filterwarnings('ignore')


def display_best_and_worse_recommendations(recommendations: pd.DataFrame, n: int = 10):
    """Render the highest- and lowest-scored recommendations as two tables.

    Parameters
    ----------
    recommendations:
        Frame with exactly two columns, in the order (prediction, title).
        The prediction column must be named 'Estimated Prediction'.
        The frame itself is left untouched.
    n:
        Number of rows shown in each table (defaults to 10).

    Raises
    ------
    KeyError
        If ``recommendations`` has no 'Estimated Prediction' column.
    ValueError
        If ``recommendations`` does not have exactly two columns.
    """
    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's DataFrame as a side effect of merely displaying it.
    ranked = recommendations.sort_values('Estimated Prediction', ascending=False)

    # Work on copies so the display-only column labels never leak back into
    # `ranked` or into the caller's data. head/tail also behave for n == 0.
    top_recommendations = ranked.head(n).copy()
    top_recommendations.columns = ['Prediction (sorted by best)', 'Movie Title']

    worse_recommendations = ranked.tail(n).copy()
    worse_recommendations.columns = ['Prediction (sorted by worse)', 'Movie Title']

    display(HTML("<h1>Recommendations your user will love</h1>"))
    display(top_recommendations)

    display(HTML("<h1>Recommendations your user will hate</h1>"))
    display(worse_recommendations)
    

def load_movies_dataset(path: str = 'datasets/ml-100k/u.item') -> pd.DataFrame:
    """Load the movie metadata file into a DataFrame indexed by ``movie_id``.

    Parameters
    ----------
    path:
        Location of the pipe-separated, header-less item file. Defaults to
        'datasets/ml-100k/u.item', relative to the working directory.

    Returns
    -------
    pd.DataFrame
        One row per movie, indexed by ``movie_id``, with the title, release
        dates, URL and one 0/1 column per genre. ``release_date`` is parsed
        to datetime.
    """
    movie_data_columns = [
        'movie_id', 'title', 'release_date', 'video_release_date', 'url',
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
        'War', 'Western',
    ]

    movie_data = pd.read_csv(
        path,
        sep='|',
        # The file is Latin-1 encoded; reading it as UTF-8 would fail on
        # accented characters in titles.
        encoding="ISO-8859-1",
        header=None,
        names=movie_data_columns,
        index_col='movie_id',
    )
    movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
    return movie_data

def load_ratings(path: str = 'datasets/ml-100k/u.data') -> pd.DataFrame:
    """Load the user/movie ratings file.

    Parameters
    ----------
    path:
        Location of the tab-separated, header-less ratings file. Defaults to
        'datasets/ml-100k/u.data', relative to the working directory.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id``, ``movie_id`` and ``rating``, in that order.
        The ``timestamp`` column of the file is read but dropped.
    """
    ratings_data = pd.read_csv(
        path,
        sep='\t',
        encoding="ISO-8859-1",
        header=None,
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
    )

    # Return only the (user, item, rating) triple, dropping the timestamp.
    return ratings_data[['user_id', 'movie_id', 'rating']]

In [8]:
# Read the (user_id, movie_id, rating) triples and preview the first ten rows.
ratings_data = load_ratings()
ratings_data.iloc[:10]

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


# Train an SVD

To train an SVD we need three things:
 - the ratings matrix
 - the min and max rating
 - the number of latent features we want

In [9]:
# One seed for both the split and the model, so that Restart & Run All
# reproduces the same train/test partition and the same RMSE.
RANDOM_STATE = 42

# Derive the rating scale from the data instead of hard-coding it.
min_rating, max_rating = ratings_data['rating'].min(), ratings_data['rating'].max()

reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings_data, reader)
trainset, testset = train_test_split(data, test_size=.25, random_state=RANDOM_STATE)

# Let's train an unbiased SVD: biased=False drops the user/item baseline
# terms, leaving a plain matrix factorization. Note that this is NOT a
# non-negative factorization (that would be the NMF algorithm).
model = SVD(n_factors=100, biased=False, random_state=RANDOM_STATE)
model.fit(trainset)

# Let's calculate the RMSE on the held-out 25% of the ratings
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9552


0.9551990775842778

# Inspecting the product matrix

Surprise's SVD stores the product (item) factor matrix in the `model.qi` attribute: one row per product and one column per latent feature. Let's take a look.

In [10]:
# Peek at the first ten rows of the factor matrix held in model.qi.
pd.DataFrame(data=model.qi).iloc[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.26261,0.142224,-0.239025,0.091008,-0.161134,-0.253035,0.011785,0.023782,0.279836,0.030894,...,0.006066,-0.042586,0.393558,-0.118363,-0.087241,0.228428,0.165883,0.051462,0.232213,-0.06901
1,0.087106,0.020993,-0.03652,0.038983,-0.435307,-0.152888,0.125392,0.03357,0.296397,0.430692,...,0.103032,-0.037176,-0.015685,-0.318298,-0.1943,0.230393,0.028437,-0.286853,0.102861,-0.117908
2,-0.120169,0.019679,-0.230574,0.346015,-0.326656,-0.614393,-0.037458,0.006673,0.214825,0.3425,...,0.166442,0.17697,0.121004,-0.232655,-0.031624,-0.117431,0.355225,0.119347,0.033231,-0.269793
3,0.073343,-0.035761,-0.095856,0.088034,-0.282153,-0.229868,0.111861,-0.066004,0.371013,0.074061,...,-0.04334,-0.033691,0.075167,-0.175258,0.234317,0.313216,0.100885,-0.279445,-0.016088,-0.121859
4,0.28235,0.01387,0.014025,0.024641,-0.089734,0.059387,0.0461,-0.013762,0.238987,0.015625,...,-0.110641,-0.216248,-0.023872,0.139533,-0.032444,0.21789,0.18531,-0.00842,-0.04747,-0.148907
5,0.13974,0.329912,-0.296786,0.301485,-0.233134,-0.421325,0.161304,-0.014518,0.413452,0.251704,...,0.190363,0.221484,0.078601,-0.064349,-0.283212,0.21415,0.288584,-0.233209,0.14617,-0.243453
6,-0.025795,0.070982,0.032176,-0.011758,-0.28004,-0.262665,0.274227,-0.043655,0.419093,0.515168,...,0.083478,-0.070444,0.027781,0.129142,0.175027,0.409178,0.374437,-0.320948,-0.127752,0.05676
7,0.175186,0.093852,-0.176845,0.24548,-0.168983,-0.265641,0.036242,0.079635,0.305497,0.111564,...,0.132067,-0.100374,0.156268,0.11559,-0.09053,0.23367,0.045526,-0.096654,-0.014127,-0.139701
8,0.130139,-0.204911,-0.038201,0.200179,-0.328206,-0.219035,0.032673,0.057476,0.002071,0.061166,...,0.133206,-0.019929,0.168593,0.062626,0.080328,0.329577,-0.052072,-0.094532,-0.020873,-0.058119
9,0.140671,0.015702,0.286293,0.123934,-0.275887,-0.377789,-0.104823,-0.110004,0.181042,0.401619,...,0.37577,-0.282494,0.186145,0.086383,0.060675,0.088916,0.343265,-0.160411,0.234523,-0.236984
