In [16]:
import pandas as pd
import numpy as np
from typing import List, Dict
from IPython.display import display, HTML, Markdown

import warnings
warnings.filterwarnings('ignore')


def display_best_and_worse_recommendations(recommendations):
    recommendations.sort_values('Estimated Prediction', ascending=False, inplace=True)

    top_recommendations = recommendations.iloc[:10]
    top_recommendations.columns = ['Prediction (sorted by best)', 'Movie Title']

    worse_recommendations = recommendations.iloc[-10:]
    worse_recommendations.columns = ['Prediction (sorted by worse)', 'Movie Title']

    display(HTML("<h1>Recommendations your user will love</h1>"))
    display(top_recommendations)

    display(HTML("<h1>Recommendations your user will hate</h1>"))
    display(worse_recommendations)
    

def load_movies_dataset():
    movie_data_columns = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'url',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
    'War', 'Western'
    ]

    movie_data = pd.read_csv(
        'datasets/ml-100k/u.item', 
        sep = '|', 
        encoding = "ISO-8859-1", 
        header = None, 
        names = movie_data_columns,
        index_col = 'movie_id'
    )
    movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
    return movie_data

def load_ratings():
    ratings_data = pd.read_csv(
        'datasets/ml-100k/u.data',
        sep = '\t',
        encoding = "ISO-8859-1",
        header = None,
        names=['user_id', 'movie_id', 'rating', 'timestamp']
    )
    return ratings_data

# A practical guide to Singular Value Decomposition in Python

Recommender systems have become increasingly popular in recent years, and are used by some of the largest websites in the world to predict the likelihood of a user taking an action on an item. In the world of Netflix, this means recommending similar movies to the ones you have seen. In the world of dating, this means suggesting matches similar to people you already showed interest in!

My path to recommenders has been an unusual one: from a Software Engineer to working on matching algorithms at a dating company, with a little background on machine learning. With my knowledge of Python and the use of basic SVD (Singular Value Decomposition) frameworks, I was able to understand SVDs from a practical standpoint of what you can do with them, instead of focusing on the science.

In my talk, you will learn 2 practical ways of generating recommendations using SVDs: matrix factorization and item similarity. We will be learning the high-level components of SVD the "doer way": we will be implementing a simple movie recommendation engine with the help of Jupiter notebooks, the MovieLens database, and the Surprise recommendation package.

## Table of contents

 - Downloading and exploring the MovieLens dataset
 - Training a SVD model using Surprise
 - Using the predict() API inside of Surprise
 - Recommendations via Matrix Factorization: Performing predict() manually
 - recommendations via Product based CF: Finding similarity between vectors

In [15]:
movie_data = load_movies_dataset()
movie_data.loc[1]

title                                                  Toy Story (1995)
release_date                                        1995-01-01 00:00:00
video_release_date                                                  NaN
url                   http://us.imdb.com/M/title-exact?Toy%20Story%2...
unknown                                                               0
Action                                                                0
Adventure                                                             0
Animation                                                             1
Children's                                                            1
Comedy                                                                1
Crime                                                                 0
Documentary                                                           0
Drama                                                                 0
Fantasy                                                         

# Movies dataset

This dataset contains all the movies and their metadata

`movie_id` 1 is **Toy Story**

<p><img src="https://static1.squarespace.com/static/51cdafc4e4b09eb676a64e68/t/579282fabebafbb6c366252c/1469219594863/" alt="Drawing" style="width: 200px; float: left"/></p>

In [17]:
ratings_data = load_ratings()
ratings_data.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


# Ratings dataset

Contains the **interactions** between users and movies

- User **196** rated movie **242** with a score of **3** 
- User **186** rated movie **302** with a score of **3** 
- User **22** rated movie **377** with a score of **3** 

In [5]:
ratings_data[ratings_data['movie_id'] == 1]['rating'].describe()

count    452.000000
mean       3.878319
std        0.927897
min        1.000000
25%        3.000000
50%        4.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

On average, people really LOVE toy story! and I don't blame them!

# Running our interactions through Surprise SVD

Let's take the **interactions** between the Users and Movies, and generate **latent features**  

In [29]:
from surprise import SVD, NMF, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

# For simplicity, we are going to load the ml-100k preset already existing in Surprise
# data = Dataset.load_builtin('ml-100k')

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_data[['user_id', 'movie_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

# Let's train a new Nonnegative SVD
model = NMF(n_factors=10, biased=False)
model.fit(trainset)

# Validate the model has learned appropriately
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9720


0.9719730181112705

In [34]:
pd.DataFrame(model.qi).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.727842,0.776314,0.396181,0.5217,0.566004,0.530977,0.405176,0.564379,0.735616,0.801454
1,0.662393,0.302177,0.572211,0.085858,1.210979,0.330043,1.094939,0.39505,0.886081,0.104785
2,0.282344,0.638182,0.439849,0.851961,0.296186,0.205809,0.800242,0.68834,1.035395,0.355117
3,0.277798,0.403195,0.712434,0.80216,0.500845,0.381861,0.644265,0.28877,0.496445,0.547964
4,1.205935,0.828465,0.553019,0.385743,0.508142,0.633543,0.546533,0.285637,0.927199,0.576034
5,1.123098,0.453856,0.431123,0.165252,0.658889,0.705495,0.713971,1.014495,0.017229,0.829705
6,0.557269,0.400335,0.790033,1.049789,0.619059,0.457531,0.209612,0.851629,0.654501,0.439791
7,0.000346,0.306583,0.003323,0.672181,0.452565,0.544589,0.01932,0.780426,0.084463,0.596229
8,0.590012,0.706263,0.69202,1.25756,0.28125,1.142423,0.325942,0.520675,0.402966,0.387369
9,0.546471,0.417007,0.253548,0.554012,0.426745,0.467116,0.996529,0.662785,0.417163,0.328236


# Exploring the product matrix

The matrix has `n_factors` columns (we chose 10). Every row represents a movie

In [45]:
print(f"The shape of our product matrix is {model.qi.shape}.")
print(f"There are {ratings_data['movie_id'].unique().shape[0]} unique movies movies")

The shape of our product matrix is (1645, 10).
There are 1682 unique movies movies


Around 3% of movies are not present. This is because Surprise removes products (and users) that do not have a minimum number of ratings. 

# Generating predictions with simplicity

Before looking into the latent features of our movies, let's use the API provided by Surprise. More specifically, Surprise provides us 1 API

 - `model.predict` computes the rating prediction for given user and movie
 
Let's look at how we can use this API to generate movies that a given user may like

```python
>>> model.predict('302', '1')
Prediction(uid=302, iid=1, r_ui=None, est=3.5327866666666665, details={'was_impossible': False})
```

NOTE: User ID and Movie ID are **strings**

In [46]:
movie_id_to_title_map: Dict[int, str] = dict(movie_data['title'])
# {1: 'Toy Story (1995)',
#  2: 'GoldenEye (1995)',
#  3: 'Four Rooms (1995)'}

def generate_recommended_movies_for_user(user_id: int) -> pd.DataFrame:
    """Return a DataFrame containing recommendations for the user, and the
    associated score
    """
    results = []
    for movie_id, movie_title in movie_id_to_title_map.items():
        
        # For each movie, calculate score prediction 
        prediction = model.predict(str(user_id), str(movie_id))
        results.append((prediction.est, movie_title))
       
    return pd.DataFrame(results, columns=['Estimated Prediction', 'Movie Title'])


# Let's generate some recommendations for a user
recommendations = generate_recommended_movies_for_user(302)
display_best_and_worse_recommendations(recommendations)

Unnamed: 0,Prediction (sorted by best),Movie Title
0,3.531507,Toy Story (1995)
1104,3.531507,Firestorm (1998)
1128,3.531507,Chungking Express (1994)
1127,3.531507,Heidi Fleiss: Hollywood Madam (1995)
1126,3.531507,"Truman Show, The (1998)"
1125,3.531507,"Old Man and the Sea, The (1958)"
1124,3.531507,"Innocents, The (1961)"
1123,3.531507,"Farewell to Arms, A (1932)"
1122,3.531507,"Last Time I Saw Paris, The (1954)"
1121,3.531507,They Made Me a Criminal (1939)


Unnamed: 0,Prediction (sorted by worse),Movie Title
562,3.531507,Stephen King's The Langoliers (1995)
561,3.531507,"Quick and the Dead, The (1995)"
560,3.531507,Mary Shelley's Frankenstein (1994)
559,3.531507,"Kid in King Arthur's Court, A (1995)"
558,3.531507,Interview with the Vampire (1994)
557,3.531507,Heavenly Creatures (1994)
556,3.531507,Farinelli: il castrato (1994)
555,3.531507,Wild Bill (1995)
554,3.531507,White Man's Burden (1995)
1681,3.531507,Scream of Stone (Schrei aus Stein) (1991)


# Predict, under the hood

So far we have seen how the `predict()` API works in surface. But how does it **really** work inside of surprise. It's, surprisingly, simple! (get the pun?)

But before we go there, let's go back to our Feature Vectors

![Latent Features](https://cdn-images-1.medium.com/max/1600/0*_gKhyxIC3wup0cCE.jpg)

## Looking at the Movie matrix (vT)

Let's take a look at the latent features for every movie. Product features can be found in the `qi` attribute.
 - create a DataFrame that maps product matrix row index to movie
 - join the newly created dataframe with the movie dataset
 - join the newly created dataframe with the latent features

In [8]:
# Create a DataFrame that maps product matrix row index to movie
movie_to_product_matrix = pd.DataFrame(
    list(model.trainset._raw2inner_id_items.items()
), columns=['movie_id', 'vT_index'], dtype=int).set_index('movie_id', drop=False)

# Join the newly created dataframe with the movie dataset
mapping_matrix_with_title = movie_to_product_matrix.join(movie_data['title'])

# Create a dataframe containing latent features, and join it to the remaining dataset
latent_features = pd.DataFrame(model.qi, columns=[f"Latent Feature {k}" for k in range(1, 11)])
mapping_matrix_with_title_and_features = mapping_matrix_with_title.set_index('vT_index').join(latent_features)

mapping_matrix_with_title_and_features.head(10)

Unnamed: 0_level_0,movie_id,title,Latent Feature 1,Latent Feature 2,Latent Feature 3,Latent Feature 4,Latent Feature 5,Latent Feature 6,Latent Feature 7,Latent Feature 8,Latent Feature 9,Latent Feature 10
vT_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1014,Romy and Michele's High School Reunion (1997),0.127867,0.169688,0.837986,0.707207,0.9287,0.837498,0.544891,0.378054,0.385599,0.146024
1,285,Secrets & Lies (1996),0.674757,0.571038,0.517294,0.844572,0.730509,1.067785,0.770744,0.753895,0.191933,0.806309
2,340,Boogie Nights (1997),0.572156,0.645273,0.14163,0.608193,0.093337,1.063379,0.671422,0.550304,0.734766,0.72255
3,187,"Godfather: Part II, The (1974)",0.578167,0.790092,0.576586,0.68957,0.785012,0.786882,0.610778,0.722659,0.797572,0.597268
4,475,Trainspotting (1996),0.727273,0.725943,0.859316,0.765486,0.585312,0.805084,0.450492,0.24022,0.628596,0.58678
5,403,Batman (1989),0.465745,0.441506,0.720105,0.704755,0.665289,0.439092,0.499607,0.696861,0.350975,0.59865
6,357,One Flew Over the Cuckoo's Nest (1975),0.990322,0.461247,0.826027,1.026533,0.781991,0.659309,0.929139,0.742188,0.126785,0.382074
7,210,Indiana Jones and the Last Crusade (1989),0.636474,0.867544,0.671765,0.549894,0.638666,0.547522,0.644678,0.590954,0.438182,0.857417
8,1065,Koyaanisqatsi (1983),1.03762,0.284221,0.334406,0.332122,1.341482,0.813887,0.27263,0.153923,1.21448,0.39171
9,321,Mother (1996),0.364889,0.597097,0.098801,0.235464,0.859981,0.734622,0.412576,0.245794,0.818899,0.842961


These are **learned features**. We cannot attribute them to anything specific, but they usually have some real-world correlation

# Find similar movies using Cosine Similarity

Usually, there isn't a straightforward way to pinpoint what a latent feature may be a strong indicator of. Even though we don't know exactly what these features correlate to, we can still compare vectors together. The latent feature at same index of every vector will relate to the same attribute.

To find how similar 2 movies are, all we need to do is compare their vectors

In [47]:
from scipy.spatial.distance import cosine


def compute_similarity(movie_a: str, movie_b: str) -> float:
    try:
        movie_a_vectors: np.array = mapping_matrix_with_title_and_features[
            mapping_matrix_with_title_and_features['title'] == movie_a
        ].iloc[0, 2:].as_matrix()
        movie_b_vectors: np.array = mapping_matrix_with_title_and_features[
            mapping_matrix_with_title_and_features['title'] == movie_b
        ].iloc[0, 2:].as_matrix()
    except IndexError:
        # SVD may sometimes remove users or products that do not contain
        # a minimum number of ratings to/from them. This helps improve the
        # quality of recommendations
        return -1
    
    return 1 - cosine(movie_a_vectors, movie_b_vectors)


# compute_similarity('Evita (1996)', 'Evita (1996)')
# compute_similarity('Toy Story (1995)', 'Evita (1996)')
compute_similarity('They Made Me a Criminal (1939)', 'Toy Story (1995)')

0.8925997371062081

In [48]:
def generate_similar_movies_for_movie(movie_title: str) -> pd.DataFrame:
    all_movies = movie_data[['title']]
    all_movies['similarity'] = all_movies['title'].map(lambda title: compute_similarity(title, movie_title))
    return all_movies


similarity_table = generate_similar_movies_for_movie('Postino, Il (1994)')

In [11]:
similarity_table.sort_values('similarity', ascending=False).head(10)

Unnamed: 0_level_0,title,similarity
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
14,"Postino, Il (1994)",1.0
847,Looking for Richard (1996),0.980683
9,Dead Man Walking (1995),0.977411
249,Austin Powers: International Man of Mystery (1...,0.975899
492,East of Eden (1955),0.974961
498,"African Queen, The (1951)",0.97443
232,Young Guns (1988),0.972017
746,Real Genius (1985),0.971593
123,"Frighteners, The (1996)",0.970966
1636,Brothers in Trouble (1995),0.968086
