In [46]:
import numpy as np
from sklearn.decomposition import NMF 
import pandas as pd
import pickle

## 1. Load small MovieLens-dataset

In [47]:
user_rating_init = pd.read_csv("./data/ml-latest-small/ratings.csv",index_col=1,)
user_rating_init.reset_index(inplace=True)
user_rating_init.drop(['timestamp'], axis=1 , inplace=True)
user_rating_init

Unnamed: 0,movieId,userId,rating
0,1,1,4.0
1,3,1,4.0
2,6,1,4.0
3,47,1,5.0
4,50,1,5.0
...,...,...,...
100831,166534,610,4.0
100832,168248,610,5.0
100833,168250,610,5.0
100834,168252,610,5.0


## 2. Edit data

### 2.1 Add movie titles

In [48]:
# pivot the table
user_item_init = pd.pivot(data=user_rating_init,
                index='userId',
                columns='movieId',
                values='rating')
user_item_init

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [49]:
# load the movie titles and genres into a dataframe
movie_genre = pd.read_csv("./data/ml-latest-small/movies.csv",index_col=1,)
# change index from title to movieId
movie_genre.reset_index(inplace=True)
movie_genre.set_index('movieId',inplace=True)

In [50]:
# Merge the dataframes on movieId and subsitute movieId with movie title
user_item = pd.merge(movie_genre, user_item_init.T, left_index=True, right_index=True)
user_item.drop('genres', inplace=True, axis=1)
user_item.reset_index(inplace=True, drop=True)
user_item = user_item.set_index('title').T
user_item.index.name = 'userId'
user_item.head(5)

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### 2.2 delete all movies with less than 20 votes

In [51]:
# define mask with all movieIds having at least 20 votes
mask_at_least_20_evals = user_item.notna().sum() >= 20
# drop these user Ids (within the transposed dataframe (with movieId as index))
user_item = user_item.T.drop(user_item.T.loc[~mask_at_least_20_evals].index).T
user_item

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,2.5,,2.5,,4.0,...,,,,,,,,,,
607,4.0,,,,,,,3.0,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,4.0,,4.5,,...,,,,,,,,,,
609,3.0,,,,,,4.0,,,,...,,,,,,,,,,


### 2.3 Fill the NaN by average of movie

In [52]:
user_item_f = user_item.fillna(user_item.mean())
user_item_f.head(5)

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.431818,4.0,3.071429,4.0,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
2,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
3,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
4,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
5,4.0,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28


## 3. NMF Model (non negative matrix factorization)

### 3.1 instantiate and fit the nmf model

In [53]:
# create a model with 5 hidden features
comp = 5
nmf_model = NMF(n_components=comp, init='nndsvd',max_iter=1000)
nmf_model.fit(user_item_f)



NMF(init='nndsvd', max_iter=1000, n_components=5)

### 3.2 get Q and P matrices

In [54]:
# Q is the upper part (hidden features, movies)
comp_names = ['nmf_' + str(i + 1) for i in range(comp)]
Q_df = pd.DataFrame(data=nmf_model.components_,
            columns=nmf_model.feature_names_in_,
            index=comp_names)
Q_df

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
nmf_1,1.115864,1.045576,1.187293,1.200603,1.395424,1.32536,1.172093,1.36296,1.738081,1.723023,...,1.367437,1.459187,1.653286,1.426664,1.535709,1.635227,1.44743,1.661789,1.526482,1.755222
nmf_2,1.948087,1.737447,1.266723,0.940808,1.08288,0.503647,1.7907,1.009188,0.012528,0.15459,...,0.446524,0.397025,0.174274,0.496493,0.256155,0.208502,0.490056,0.296245,0.4314,0.305398
nmf_3,1.100492,0.558567,0.285312,0.0,0.54317,0.074504,0.127497,0.158252,0.144101,0.057991,...,0.43482,0.322479,0.2396,0.535813,0.588186,0.319937,0.647712,0.177033,0.311842,0.316759
nmf_4,0.057465,0.144908,0.0,0.099286,0.168901,0.077618,0.218451,0.217756,0.117131,0.095909,...,0.234454,0.213194,0.239199,0.227884,0.16287,0.178532,0.208665,0.184012,0.275636,0.195224
nmf_5,0.200326,0.104358,0.197219,0.126608,0.163216,0.159202,0.0,0.12793,0.166181,0.012772,...,0.112701,0.129364,0.129766,0.126309,0.149758,0.153447,0.123902,0.154971,0.123673,0.17123


In [55]:
# P is the left part (users, hidden features)
P = nmf_model.transform(user_item_f)
P.shape


(610, 5)

### 3.3 reconstruct the R matrix

In [56]:
# the R matrix is the dot product of P x Q
print(Q_df.shape, P.shape) # check shape for the order of the dot product
R_hat_mat = np.dot(P, Q_df)
# put R into a dataframe
R_hat_df = pd.DataFrame(data=R_hat_mat, columns=Q_df.columns, index=user_item_f.index)
R_hat_df


(5, 1295) (610, 5)


Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.310252,3.633759,3.551461,3.202146,4.126291,3.327446,3.555005,3.752819,4.032919,3.810671,...,3.659978,3.762872,4.023191,3.888723,4.025917,4.048768,3.988775,4.058465,3.943867,4.378476
2,3.908560,3.431106,3.258528,3.076697,3.944379,3.187741,3.504125,3.678165,3.928116,3.776355,...,3.615017,3.707077,4.003912,3.813012,3.898964,3.962348,3.887039,3.984377,3.930655,4.281573
3,3.883391,3.410982,3.211593,3.039420,3.922482,3.144653,3.463670,3.664168,3.865624,3.659759,...,3.603085,3.685978,3.975220,3.797805,3.861966,3.923265,3.863092,3.944991,3.921520,4.242509
4,3.745718,3.372343,3.275548,3.139247,3.886982,3.231780,3.540688,3.698896,3.958689,3.831316,...,3.558351,3.681525,3.997525,3.738027,3.819589,3.945260,3.789807,4.002209,3.903542,4.270695
5,3.880008,3.404407,3.244957,3.066721,3.929847,3.187144,3.480236,3.662418,3.941014,3.798913,...,3.610137,3.704671,4.007059,3.808125,3.903221,3.967165,3.884482,3.987220,3.925070,4.284496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.870102,3.277825,3.224671,2.990410,3.912328,3.200415,3.217995,3.571947,4.024213,3.779618,...,3.619410,3.718987,4.034073,3.833506,3.995220,4.027173,3.929608,4.015747,3.915908,4.337016
607,4.025934,3.558814,3.370001,3.162993,3.991562,3.222631,3.676437,3.736780,3.906264,3.818991,...,3.593857,3.691831,3.969001,3.791219,3.861712,3.935117,3.863683,3.974924,3.907562,4.260466
608,3.186689,2.998627,2.402751,2.628132,3.730358,2.822281,3.199926,3.570850,3.805887,3.615626,...,3.877089,3.860561,4.261410,4.038911,3.997137,4.029019,4.084097,3.985141,4.247454,4.318644
609,3.843733,3.383623,3.216619,3.055637,3.921788,3.179073,3.462402,3.664857,3.938364,3.776701,...,3.619566,3.712257,4.019232,3.814714,3.902275,3.970651,3.886142,3.991349,3.940562,4.288485


### 3.4 calculate the reconstructed error between R und R-hat

In [57]:
# model knows the original R matrix as it was input to the model
nmf_model.reconstruction_err_

224.010944599133

In [58]:
# different reconstruction error by manual calculation
np.sqrt(((np.square(user_item_f - R_hat_mat)).sum()).sum())

224.0119214246648

### 3.5 compare the 2 R matrices and save the model

In [59]:
user_item_f.head(2)

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.431818,4.0,3.071429,4.0,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28
2,3.92093,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28


In [60]:
R_hat_df.head(2)

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.310252,3.633759,3.551461,3.202146,4.126291,3.327446,3.555005,3.752819,4.032919,3.810671,...,3.659978,3.762872,4.023191,3.888723,4.025917,4.048768,3.988775,4.058465,3.943867,4.378476
2,3.90856,3.431106,3.258528,3.076697,3.944379,3.187741,3.504125,3.678165,3.928116,3.776355,...,3.615017,3.707077,4.003912,3.813012,3.898964,3.962348,3.887039,3.984377,3.930655,4.281573


In [61]:
# save the model with pickle
with open('nmf_model.pkl', mode='wb') as file:
    pickle.dump(nmf_model, file)

## 4. Create recommandations for a new user

### 4.1 new user query

In [62]:
# New user has to enter some ratings to enable a prediction
new_user_query = {"Toy Story (1995)": 5,
                 "Ace Ventura: When Nature Calls (1995)":5,
                 "Get Shorty (1995)":5}


### 4.2 buidling R matrix for new user

In [63]:
# get a list containing all movies
movies = user_item_f.columns.to_list()

In [64]:
# create dataframe for this user
df_user = pd.DataFrame(data=new_user_query,
                            index=['new_user'],
                            columns=movies)
df_user

                    

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
new_user,5,,,,,,,,,,...,,,,,,,,,,


In [65]:
# fill the NaN with the original mean
df_user_imp = df_user.fillna(user_item_f.mean())
df_user_imp

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
new_user,5,3.431818,3.259615,3.071429,3.946078,3.185185,3.496212,3.671429,3.926829,3.776119,...,3.613636,3.704545,4.0,3.813953,3.903226,3.961538,3.890625,3.980769,3.925926,4.28


### 4.3 get the user's P matrix

In [66]:
new_user_matrix = nmf_model.transform(df_user_imp)
new_user_matrix

array([[2.06098612, 0.41859773, 0.56920256, 1.21639739, 0.68233803]])

### 4.4 calculate R hat for the user

In [67]:
# The Q matrix doesn't depend on the user but only on the movies
R_hat_new_user = np.dot(new_user_matrix, Q_df)
R_hat_new_user_df = pd.DataFrame(data=R_hat_new_user, index=['new_user'], columns=movies) 
R_hat_new_user_df

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
new_user,3.948237,3.44762,3.274211,3.075408,3.955233,3.187825,3.503545,3.673732,3.925296,3.774225,...,3.614771,3.704713,3.996236,3.816533,3.907399,3.961437,3.895311,3.979273,3.923814,4.279934


### 4.5 create recommendations for new user

In [69]:
# create a list of movies the user has already seen
seen = list(new_user_query.keys())
# set these movies to 0 (to avoid there recommendation)
R_hat_trans = R_hat_new_user_df.T.reset_index()
mask = R_hat_trans['index'].isin(seen)
R_hat_trans['new_user'].loc[mask] = 0
R_hat_trans.set_index('index', inplace=True)
# sort the movies by rating and show the top 10
R_hat_trans.sort_values(by=['new_user'], ascending=False).head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  R_hat_trans['new_user'].loc[mask] = 0


Unnamed: 0_level_0,new_user
index,Unnamed: 1_level_1
"Streetcar Named Desire, A (1951)",4.471072
"Shawshank Redemption, The (1994)",4.454232
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.329663
"Godfather, The (1972)",4.31074
"Philadelphia Story, The (1940)",4.305761
Fight Club (1999),4.302067
Lawrence of Arabia (1962),4.300344
In the Name of the Father (1993),4.299219
Hoop Dreams (1994),4.292765
Harold and Maude (1971),4.287868


## 5. create recommendation with a NMF recommender function

In [25]:
def recommend_nmf(query, model, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
    # 1. candidate generation (construct new_user-item dataframe given the query)
    # create dataframe for this user
    df_user = pd.DataFrame(data=query,
                                index=['new_user'],
                                columns=movies)
    # fill the dataframe with the mean of filled ratings (orig)
    df_user_imp = df_user.fillna(user_item_f.mean())

    # 2. scoring (calculate the score with the NMF model)
    # get the users P matrix
    new_user_mat = nmf_model.transform(df_user_imp) 
    # create the names of the features from the dimension of the user P-matrix
    comp_names = ['lmn_' + str(i) for i in range(new_user_mat.shape[1])]
    Q_df = pd.DataFrame(data=model.components_,
            columns=model.feature_names_in_,
            index= comp_names)
    R_hat_new_user_df = pd.DataFrame(data=np.dot(new_user_mat, Q_df), index=['new_user'], columns=movies)
    
    # 3. ranking (get the sorting of films for the user but exclude the ones he has already seen)
    seen = list(query.keys())
    # set movies all ready seen by the user to 0 
    R_hat_trans = R_hat_new_user_df.T.reset_index()
    mask = R_hat_trans['index'].isin(seen)
    R_hat_trans['new_user'].loc[mask] = 0
    R_hat_trans.set_index('index', inplace=True)
    
    # return the top-k highest rated movie ids or titles
    return R_hat_trans.sort_values(by=['new_user'], ascending=False).head(k)

In [26]:
# query
new_user_query = {"Toy Story (1995)": 5,
                 "Ace Ventura: When Nature Calls (1995)":5,
                 "Get Shorty (1995)":5}
recommend_nmf(new_user_query, nmf_model, k=10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  R_hat_trans['new_user'].loc[mask] = 0


Unnamed: 0_level_0,new_user
index,Unnamed: 1_level_1
"Streetcar Named Desire, A (1951)",4.471071
"Shawshank Redemption, The (1994)",4.454238
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.329662
"Godfather, The (1972)",4.310746
"Philadelphia Story, The (1940)",4.305759
Fight Club (1999),4.302077
Lawrence of Arabia (1962),4.300341
In the Name of the Father (1993),4.299218
Hoop Dreams (1994),4.292764
Harold and Maude (1971),4.287867


In [27]:
# save csv file for user_item_matrix for latter usage
user_item_f.to_csv('user_item.csv')