In [4]:
#%pip install scikit-learn

In [5]:
import pandas as pd
import numpy as np
from numpy.linalg import svd

### Links úteis

* https://www.reddit.com/r/MachineLearning/comments/2id9h8/svd_to_fill_in_missing_values_of_a_matrix/
* https://en.wikipedia.org/wiki/Matrix_completion
* https://datajobs.com/data-science-repo/Recommender-Systems-%5BNetflix%5D.pdf

### Preparando os dados

In [12]:
# Load the ratings table (columns: userId, movieId, rating, timestamp)
# from a path relative to the notebook's working directory.
RATINGS_CSV = '../data/ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(RATINGS_CSV)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Count the movies per user: number of non-null movieId entries
# in each userId group, returned as a Series indexed by userId.
ratings_by_user = ratings_df.groupby('userId')
movie_user_count = ratings_by_user['movieId'].count()
movie_user_count

userId
1         141
2          52
3         147
4          27
5          33
         ... 
200944    298
200945    108
200946     23
200947     61
200948    236
Name: movieId, Length: 200948, dtype: int64

Vamos selecionar os 100 usuários com mais filmes avaliados. Assim, há maior chance de sobreposição, isto é, de um mesmo filme ter sido avaliado por vários usuários.

In [8]:
# Pick the 100 userIds with the highest movie counts.
top_users = (
    movie_user_count
    .sort_values(ascending=False)
    .head(100)
    .index
)

# Keep only the ratings made by those users and renumber the rows from 0.
is_top_user = ratings_df['userId'].isin(top_users)
filtered_df = ratings_df[is_top_user].reset_index(drop=True)
filtered_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5029,1,3.5,1437305519
1,5029,2,4.5,1504224863
2,5029,3,4.0,1500231179
3,5029,10,3.0,1500229573
4,5029,13,3.0,1500355573


In [9]:
# Build the user x movie table: one row per userId, one column per movieId,
# cells holding the rating (NaN where the user has no rating for the movie).
# Only the first 100 movie columns are kept.
pivot_table = (
    filtered_df
    .pivot(index='userId', columns='movieId', values='rating')
    .iloc[:, :100]
)
pivot_table.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,92,93,94,95,96,97,98,99,100,101
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5029,3.5,4.5,4.0,,,,,,,3.0,...,,,,3.5,,,,,,
7858,5.0,0.5,0.5,0.5,1.0,4.0,1.0,0.5,0.5,3.0,...,1.0,1.0,3.5,2.0,,,,,1.0,1.0
8307,4.0,3.5,,3.0,2.5,3.5,3.5,,3.5,4.0,...,3.5,3.0,2.5,3.0,,,,,,
8359,4.0,3.0,3.0,,3.0,4.0,,3.0,3.5,4.0,...,,3.0,3.0,3.0,,,,,3.5,2.5
8588,4.0,3.5,3.0,,3.5,5.0,,,2.0,4.5,...,0.5,1.5,,1.5,,,,,3.0,3.0


### Preparando a matriz

In [10]:
# Replace missing ratings (NaN) with 0 and copy the table into a NumPy array.
# NOTE: after this step an unrated cell is indistinguishable from the value 0.
filled_ratings = pivot_table.fillna(0)
M = np.array(filled_ratings.values)
M

array([[3.5, 4.5, 4. , ..., 0. , 0. , 0. ],
       [5. , 0.5, 0.5, ..., 0. , 1. , 1. ],
       [4. , 3.5, 0. , ..., 0. , 0. , 0. ],
       ...,
       [4.5, 3. , 3.5, ..., 0. , 3. , 4. ],
       [3. , 3. , 2. , ..., 0. , 3. , 2.5],
       [4. , 2. , 2.5, ..., 0. , 3. , 0. ]])

### Fazendo o SVD

In [11]:
# Rank-r approximation of M through a truncated SVD.
r = 2  # number of singular values / vectors to keep

# Reduced SVD: M == u @ diag(s) @ vt
u, s, vt = np.linalg.svd(M, full_matrices=False)

# Truncate each factor to its first r components.
_u = u[:, :r]          # first r left singular vectors (columns)
_s = np.diag(s[:r])    # first r singular values as a diagonal matrix
_vt = vt[:r, :]        # first r right singular vectors (rows)

# Rebuild the matrix from the truncated factors.
_m = _u @ _s @ _vt
_m

array([[ 2.49166534,  2.48925201,  1.17225678, ..., -0.36659727,
         0.07698739,  1.17718274],
       [ 3.53700105,  2.61527545,  1.70358697, ...,  0.31444351,
         1.27762856,  2.12796199],
       [ 4.62215061,  3.90882609,  2.20510235, ..., -0.03563003,
         1.04467393,  2.53642323],
       ...,
       [ 4.1329539 ,  3.56908161,  1.96853635, ..., -0.09909336,
         0.8400152 ,  2.23117694],
       [ 3.31664543,  1.59476896,  1.6343691 , ...,  1.07448775,
         2.28911444,  2.42208731],
       [ 4.14203861,  2.78431083,  2.00698298, ...,  0.62126819,
         1.85029803,  2.630458  ]])