In [70]:
#%pip install scikit-learn

In [71]:
import pandas as pd
import numpy as np
from numpy.linalg import svd

### Dataset

[MovieLens Small: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.](https://grouplens.org/datasets/movielens/)

### Referências

1. https://web.stanford.edu/class/cs168/l/l9.pdf
2. https://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf
3. https://github.com/nickvandewiele/collaborative-filtering/blob/master/notebooks/user-based-collaborative-filtering.ipynb

### Preparando os dados

In [72]:
df = pd.read_csv('../data/ml-latest-small/ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [73]:
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [74]:
known_movies = movies_df['movieId'].unique()
# remove unknown movies
df = df[df['movieId'].isin(known_movies)]

In [75]:
df['userId'].nunique(), df['movieId'].nunique()

(610, 9724)

Vamos formar a matriz fazendo o pivoteamento de ``userId`` com ``movieId``, ``rating`` sendo o valor.

Minha ideia é trabalhar com os dados na sua forma mais real possível. Portanto, a matriz será bastante "larga"
($A \in \mathbb{R}^{n \times m}$, $m \gg n$),
considerando que há muito mais filmes do que usuários.

In [76]:
# m is a pivot of userId and movieId, rating is the value
data = df.pivot(index='userId', columns='movieId', values='rating')
data

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [77]:
m = np.array(data)
m

array([[4. , nan, 4. , ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, nan, nan]])

### SVD

A escolha de como preencher os dados faltantes inicialmente é muito importante [1]. Por isso, vamos tentar duas abordagens: preencher com 0s, e preencher com a média da coluna (filme).

In [78]:
# pegando a máscara de dados faltantes
mask = np.isnan(m)

In [79]:
def recommend_movie(userId, m, n=10):
    user_idx = userId - 1
    user_ratings = m[user_idx]
    unseen_mask = mask[user_idx]
    unseen_ratings = user_ratings[unseen_mask]
    sorted_unseen_ratings = np.sort(unseen_ratings)[::-1][:n]
    unseen_movie_idx = np.argsort(unseen_ratings)[::-1][:n]
    unseen_movie_ids = data.columns[unseen_mask][unseen_movie_idx]
    unseen_movie_titles = movies_df[movies_df['movieId'].isin(unseen_movie_ids)]['title']
    return pd.DataFrame({'title': unseen_movie_titles, 'pred_rating': sorted_unseen_ratings}).reset_index(drop=True)

# get the top 10 movies for user from the original df
def top_user_movies(userId, n=10):
    movies = df[df['userId'] == userId].sort_values('rating', ascending=False).head(n)
    return movies.merge(movies_df, on='movieId')[['title', 'rating']].reset_index(drop=True)

#### 0s

In [80]:
# fillna with 0
m0 = np.nan_to_num(m)
m0

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [81]:
r0 = 20
u0, s0, vt0 = svd(m0, full_matrices=False)
m0r = u0[:, :r0] @ np.diag(s0[:r0]) @ vt0[:r0, :]
# elimina negativos
m0r = np.clip(m0r, 0, 5)
# colocando os dados em 0.0, 0.5, 1.0, 1.5..
m0r = np.abs(np.round(m0r * 2) / 2)
m0r

array([[2.5, 1.5, 1. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 1.5, ..., 0. , 0. , 0. ],
       [1. , 0.5, 0.5, ..., 0. , 0. , 0. ],
       [5. , 0.5, 0. , ..., 0. , 0. , 0. ]])

In [82]:
mfinal_0 = np.where(mask, m0r, m)
mfinal_0

array([[4. , 1.5, 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0.5, 0.5, ..., 0. , 0. , 0. ],
       [5. , 0.5, 0. , ..., 0. , 0. , 0. ]])

**Ao usar o notebook, vá mudando o userId**

In [83]:
recommend_movie(1, mfinal_0)

Unnamed: 0,title,pred_rating
0,Terminator 2: Judgment Day (1991),4.0
1,"Godfather, The (1972)",3.5
2,Die Hard (1988),3.5
3,Aliens (1986),3.5
4,Stand by Me (1986),3.0
5,Jaws (1975),3.0
6,"Breakfast Club, The (1985)",3.0
7,"Sixth Sense, The (1999)",3.0
8,"Christmas Story, A (1983)",3.0
9,Ferris Bueller's Day Off (1986),2.5


In [84]:
top_user_movies(1)

Unnamed: 0,title,rating
0,Seven (a.k.a. Se7en) (1995),5.0
1,"Usual Suspects, The (1995)",5.0
2,Bottle Rocket (1996),5.0
3,Dumb & Dumber (Dumb and Dumber) (1994),5.0
4,Billy Madison (1995),5.0
5,Desperado (1995),5.0
6,Canadian Bacon (1995),5.0
7,Rob Roy (1995),5.0
8,Pinocchio (1940),5.0
9,Tombstone (1993),5.0


#### Média da coluna

In [85]:
m

array([[4. , nan, 4. , ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, nan, nan]])

In [86]:
col_means = np.nanmean(m, axis=0)
inds = np.where(np.isnan(m))
m1 = m.copy()
m1[inds] = np.take(col_means, inds[1])
# fix the values to 0.0, 0.5, 1.0, 1.5...
m1 = np.abs(np.round(m1 * 2) / 2)
m1

array([[4. , 3.5, 4. , ..., 3.5, 3.5, 4. ],
       [4. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       [4. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       ...,
       [2.5, 2. , 2. , ..., 3.5, 3.5, 4. ],
       [3. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       [5. , 3.5, 3.5, ..., 3.5, 3.5, 4. ]])

In [92]:
r1 = 20
u1, s1, vt1 = svd(m1, full_matrices=False)
m1r = u1[:, :r1] @ np.diag(s1[:r1]) @ vt1[:r1, :]
# elimina negativos
m1r = np.clip(m1r, 0, 5)
# colocando os dados em 0.0, 0.5, 1.0, 1.5..
m1r = np.abs(np.round(m1r * 2) / 2)
m1r

array([[4. , 3.5, 4. , ..., 3.5, 3.5, 4. ],
       [4. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       [4. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       ...,
       [2.5, 2. , 2.5, ..., 3.5, 3.5, 4. ],
       [4. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       [4. , 4. , 3.5, ..., 3.5, 3.5, 4. ]])

In [94]:
mfinal_1 = np.where(mask, m1r, m)
mfinal_1

array([[4. , 3.5, 4. , ..., 3.5, 3.5, 4. ],
       [4. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       [4. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       ...,
       [2.5, 2. , 2. , ..., 3.5, 3.5, 4. ],
       [3. , 3.5, 3.5, ..., 3.5, 3.5, 4. ],
       [5. , 4. , 3.5, ..., 3.5, 3.5, 4. ]])

Ficou muito parecida com a original...

In [95]:
np.linalg.norm(m1r - m1)

np.float64(245.27025910207703)

Ok...

**Possível caminho:** eliminar filmes muito pouco vistos.

In [96]:
recommend_movie(1, mfinal_1)

Unnamed: 0,title,pred_rating
0,Lamerica (1994),5.0
1,Heidi Fleiss: Hollywood Madam (1995),5.0
2,"Breed, The (2006)",5.0
3,Sun Alley (Sonnenallee) (1999),5.0
4,"Valet, The (La doublure) (2006)",5.0
5,Tom Segura: Completely Normal (2014),5.0
6,What Love Is (2007),5.0
7,My Love (2006),5.0
8,Sherlock - A Study in Pink (2010),5.0
9,Won't You Be My Neighbor? (2018),5.0


Problema (ao menos na minha compreensão): a variância é tão baixa que $r=1$ já a capturou.

Ou seja, a _low-rank approximation_ é próxima demais a matriz original para conseguirmos gerar um bom preenchimento de valores.