In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
import pickle

#### Let's collect some recommendations for a new users that loves Disney Movies! 

In [83]:
# for calculating recommendations
query = {
    # movieId, rating
    1:3,
    4470:5, 
    48:5,
    594:5,
    27619:5,
    152081:5,
    595:5,
    616:5,
    1029:5
}


# for testing the recommender after getting some recommendations
relevant_items = [
    596, 4016, 1033, 134853, 
    2018, 588, 364, 26999, 75395,2085, 
    1907, 2078, 1032, 177765   
]

In [94]:
#ratings[ratings.movieId == list(query.keys())]
ratings.set_index('movieId').drop(index=query.keys())
#ratings.head()

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,1,4.0,964981247
6,1,4.0,964982224
47,1,5.0,964983815
50,1,5.0,964982931
70,1,3.0,964982400
...,...,...,...
166534,610,4.0,1493848402
168248,610,5.0,1493850091
168250,610,5.0,1494273047
168252,610,5.0,1493846352


# Non Negative Matrix Factorization for Recommender Systems
---



In [78]:
movies.loc[4470]

title     Ariel (1988)
genres           Drama
Name: 4470, dtype: object

In [96]:
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
movies = pd.read_csv('../data/ml-latest-small/movies.csv', index_col=0)


In [56]:
movies.head(
    
)
movies[movies.title.str.contains('Jumanji')].index[0]

2

In [66]:
ratings[ratings.movieId == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
516,5,1,4.0,847434962
874,7,1,4.5,1106635946
1434,15,1,2.5,1510577970
1667,17,1,4.5,1305696483
...,...,...,...,...
97364,606,1,2.5,1349082950
98479,607,1,4.0,964744033
98666,608,1,2.5,1117408267
99497,609,1,3.0,847221025


In [35]:
# which movies are in the query?
movies.set_index('movieId').loc[query.keys()]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4470,Ariel (1988),Drama
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
616,"Aristocats, The (1970)",Animation|Children
1029,Dumbo (1941),Animation|Children|Drama|Musical


---
## 1. Model Development

### Preprocessing

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [36]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [97]:
#filter out movies with an average rating lower than 2
avg_rating_movie = ratings.groupby('movieId')['rating'].mean()
avg_rating_movie_good = avg_rating_movie.loc[avg_rating_movie > 2]

In [98]:
# calculate the number of ratings per movie
ratings_per_movie = ratings.groupby('movieId')['userId'].count()

In [115]:
# filter for movies with more than 20 ratings and extract the index
popular_movies = ratings_per_movie.loc[ratings_per_movie > 20]
popular_movies.sort_values(ascending=False)[:10]

movieId
356     329
318     317
296     307
593     279
2571    278
260     251
480     238
110     237
589     224
527     220
Name: userId, dtype: int64

In [116]:
filtered_movie_ids = list(set((list(popular_movies.index) + list(avg_rating_movie_good.index))))
filtered_movie_ids


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 32799,
 34,
 131098,
 36,
 131104,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 65585,
 52,
 53,
 54,
 55,
 65588,
 57,
 58,
 98361,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 65596,
 68,
 69,
 70,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 86,
 87,
 88,
 89,
 163925,
 92,
 93,
 94,
 95,
 96,
 97,
 32862,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 32875,
 110,
 111,
 112,
 113,
 65642,
 116,
 117,
 118,
 119,
 121,
 122,
 123,
 32892,
 125,
 128,
 32898,
 132,
 135,
 137,
 32906,
 140,
 141,
 163981,
 144,
 145,
 146,
 147,
 148,
 32914,
 150,
 151,
 32917,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 131237,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 180,
 181,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 98491,
 19

In [117]:
# filter the ratings matrix and only keep the popular movies
ratings = ratings.set_index('movieId').loc[filtered_movie_ids]
ratings = ratings.reset_index()
ratings


Unnamed: 0,movieId,userId,rating,timestamp
0,1,1,4.0,964982703
1,1,5,4.0,847434962
2,1,7,4.5,1106635946
3,1,15,2.5,1510577970
4,1,17,4.5,1305696483
...,...,...,...,...
98766,65514,332,3.5,1352674106
98767,65514,380,5.0,1494696245
98768,65514,408,4.0,1468361145
98769,65514,534,4.0,1459787995


In [118]:
ratings.sort_values('rating', ascending=False)

Unnamed: 0,movieId,userId,rating,timestamp
49385,2791,448,5.0,1019125935
20470,858,319,5.0,1461351651
20487,858,357,5.0,1348610443
64563,4450,160,5.0,1065983846
20484,858,348,5.0,1378850181
...,...,...,...,...
46149,2571,111,0.5,1516140656
83071,108689,567,0.5,1525289685
48574,2720,182,0.5,1063284777
69137,5296,182,0.5,1055156342


In [119]:
# Initialize a sparse user-item rating matrix 
# (data, (row_ind, col_ind)
R = csr_matrix((ratings['rating'], (ratings['userId'], ratings['movieId'])))
R

<611x193610 sparse matrix of type '<class 'numpy.float64'>'
	with 98771 stored elements in Compressed Sparse Row format>

### Training

- initialize the model
- fit it on the user item matrix
- optionally, tune the number of components (hidden features): what happens if you set the number of components to a really low number?
- decrease the `tol` to train for a longer time

In [120]:
# initialize the unsupervised model
# 55 hidden features, F=55
model = NMF(n_components=55, init='nndsvd', max_iter=10000, tol=0.01, verbose=2)

# fit it to the user-item rating matrix
model.fit(R)

# initialzed P, Q matrix with random values
# iterate and optimize the values stored in P and Q

violation: 1.0
violation: 0.3061598906947986
violation: 0.20826250162825022
violation: 0.1522992172724846
violation: 0.1136351493204533
violation: 0.08922544386584705
violation: 0.06942951112071515
violation: 0.05475681255983771
violation: 0.045120667652462636
violation: 0.03909041242279949
violation: 0.0347993221754289
violation: 0.030608380746605508
violation: 0.026762471138002068
violation: 0.023370699435926574
violation: 0.020522081972538558
violation: 0.018280159456408155
violation: 0.016426390334388922
violation: 0.015215971852648804
violation: 0.014262205288807837
violation: 0.013448914841033601
violation: 0.012835591931658579
violation: 0.0123108495528994
violation: 0.011879999823143427
violation: 0.011397883974091758
violation: 0.010930774520299358
violation: 0.010513303172070282
violation: 0.010052393675030804
violation: 0.009565209047519924
Converged at iteration 29


NMF(init='nndsvd', max_iter=10000, n_components=55, tol=0.01, verbose=2)

### Model inspection

In [121]:
R

<611x193610 sparse matrix of type '<class 'numpy.float64'>'
	with 98771 stored elements in Compressed Sparse Row format>

#### the hidden features

In [122]:
model.components_.shape

(55, 193610)

In [None]:
# user-'genre' matrix [611x55]
P = model.transform(R)

# movie-'genre' matrix [55x168253]
Q = model.components_

P.shape, Q.shape

In [None]:
# user with id 1: sparse format
R[1,:]

In [None]:
# user with id 1: dense embedding
P[1, :]

In [None]:
# dense embedding for movie with id 1
Q[:, 1]

In [None]:
# reconstructed matrix Rhat
# R_hat = P.dot(Q)

In [123]:
# R -> encoding -> P -> decoding -> Rhat
R_hat = model.inverse_transform(model.transform(R))

violation: 1.0
violation: 0.7893342579565776
violation: 0.24395040234114868
violation: 0.09423178803894806
violation: 0.0414006221089833
violation: 0.020981032095723863
violation: 0.01252936354148839
violation: 0.008135529593664235
Converged at iteration 9


In [124]:
R_hat

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.85128017e+00, 1.27661800e+00, ...,
        0.00000000e+00, 0.00000000e+00, 3.25706919e-03],
       [0.00000000e+00, 2.20255670e-01, 1.15043527e-01, ...,
        0.00000000e+00, 0.00000000e+00, 2.17679945e-02],
       ...,
       [0.00000000e+00, 1.31980898e+00, 2.74404834e+00, ...,
        0.00000000e+00, 0.00000000e+00, 8.12758353e-04],
       [0.00000000e+00, 5.22296412e-01, 7.56723839e-01, ...,
        0.00000000e+00, 0.00000000e+00, 3.71568708e-04],
       [0.00000000e+00, 5.01054767e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.01533359e-02]])

#### the reconstruction error

$$
L(R, \hat{R}) = \sqrt{\sum_i\sum_j(R_{ij}-\hat{R}_{ij})^2} = \sqrt{\sum_i\sum_j(R_{ij}-PQ_{ij})^2}
$$

In [125]:
R.shape, R_hat.shape

((611, 193610), (611, 193610))

In [126]:
# reconstruction error
np.sqrt(np.sum(np.square(R - R_hat)))

762.8631328981511

In [127]:
model.reconstruction_err_

762.8643017340211

---
## 2. Model deployment: Make recommendations for a new user

### Save the trained model on your hard drive

In [128]:
with open('./nmf_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
!ls

### Read the model from hard drive

In [129]:
with open('./nmf_recommender.pkl', 'rb') as file:
    model_oo = pickle.load(file)

print(model_oo)

NMF(init='nndsvd', max_iter=10000, n_components=55, tol=0.01, verbose=2)


In [None]:
model.reconstruction_err_

### Receive a user query

In [None]:
query

In [None]:
R[1,:]

### Construct a user vector

we need the same input as was used during training!

In [None]:
list(query.values())

In [131]:
data = list(query.values())             # the ratings of the new user
row_ind = [0]*len(data)              # we use just a single row 0 for this user 
col_ind = list(query.keys())                           # the columns (=movieId) of the ratings
data, row_ind, col_ind

([3, 5, 5, 5, 5, 5, 5, 5, 5],
 [0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 4470, 48, 594, 27619, 152081, 595, 616, 1029])

In [132]:
# new user vector: needs to have the same format as the training data

user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
user_vec

<1x193610 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [133]:
R

<611x193610 sparse matrix of type '<class 'numpy.float64'>'
	with 98771 stored elements in Compressed Sparse Row format>

In [71]:
movies.drop(index=[1,2,3])
# movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


### Calculate the score

1. transform the user vector to its dense representation (encoding) 
2. inverse transform the dense vector into the sparse representation (decoding)

$$
\hat{r}_{ij} = p_i' \cdot q_j 
$$

In [134]:
# user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat


scores = model.inverse_transform(model.transform(user_vec))

# convert to a pandas series
scores = pd.Series(scores[0])
scores

violation: 1.0
violation: 1.1972012331592705
violation: 0.0646316959592654
violation: 0.022824150719394032
violation: 0.005071941395282126
Converged at iteration 6


0         0.000000
1         0.412206
2         0.304914
3         0.077833
4         0.005282
            ...   
193605    0.000000
193606    0.000000
193607    0.000000
193608    0.000000
193609    0.000493
Length: 193610, dtype: float64

### Give recommendations

In [135]:
query.keys()

dict_keys([1, 4470, 48, 594, 27619, 152081, 595, 616, 1029])

In [136]:
# give a zero score to movies the user has allready seen
scores[query.keys()] = 0

In [137]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

364      0.638796
588      0.595460
1073     0.503287
2078     0.461625
1028     0.455598
           ...   
66639    0.000000
66640    0.000000
66641    0.000000
66642    0.000000
96805    0.000000
Length: 193610, dtype: float64

In [138]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([364, 588, 1073, 2078, 1028, 2081, 3114, 1035, 2087, 919], dtype='int64')

In [140]:
movies.loc[recommendations]

Unnamed: 0,title,genres
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical
2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance
3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
1035,"Sound of Music, The (1965)",Musical|Romance
2087,Peter Pan (1953),Animation|Children|Fantasy|Musical
919,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical


In [None]:
# recall@10: fraction of relevant items in the top 10 recommendations


In [None]:
# precision@10: fraction of recommendations that are relevant


---
## 3. Project Task: NMF recommender function

1. Collect different user queries for "typical" users (e.g. a horror movie buff) and evaluate the algorithm
2. Set the number of components to a very low number (e.g. 2). What happens to the recommendations?
3. Implement a recommender function that recommends movies to a new user based on the NMF model!

Note: Training of the model happens outside of the function! Don't retrain the model every time you want to calculate recommendations for a user.


In [None]:
# collaborative filtering = look at ratings only!
def recommend_nmf(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    
    # construct a user vector
    create_user_vec = 
   
    # 2. scoring
    
    # calculate the score with the NMF model
    
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]

In [None]:
# recommender.py
# from recommender import recommend_nmf