In [48]:
import pandas as pd
import numpy as np
import implicit
from scipy.sparse import csr_matrix

### Preparando os dados

In [49]:
df = pd.read_csv('../data/ml-latest-small/ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [50]:
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [60]:
def top_user_movies(userId, n=10):
    movies = df[df['userId'] == userId].sort_values('rating', ascending=False).head(n)
    return movies.merge(movies_df, on='movieId')[['title', 'rating']].reset_index(drop=True)

In [51]:
known_movies = movies_df['movieId'].unique()
# remove unknown movies
df = df[df['movieId'].isin(known_movies)]

In [52]:
df['userId'].nunique(), df['movieId'].nunique()

(610, 9724)

In [53]:
# m is a pivot of userId and movieId, rating is the value
data = df.pivot(index='userId', columns='movieId', values='rating')
data

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [54]:
m = np.array(data)
# replace nan with 0
m[np.isnan(m)] = 0
m = csr_matrix(m)
m

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 100836 stored elements and shape (610, 9724)>

### Construindo o sistema

**Alternating least squares**

In [55]:
model = implicit.als.AlternatingLeastSquares(factors=64)

In [56]:
model.fit(m)

100%|██████████| 15/15 [00:00<00:00, 49.55it/s]


In [57]:
recommended = model.recommend(0, m[0])
recommended

(array([ 793, 1543,  957, 1052, 2034, 1291, 2391,  337, 1705, 3358],
       dtype=int32),
 array([1.1815284 , 0.94623363, 0.94286114, 0.8703433 , 0.85463434,
        0.8057953 , 0.7871276 , 0.77562404, 0.77498543, 0.77022314],
       dtype=float32))

In [59]:
movies_df['title'][recommended[0]]

793               Die Hard (1988)
1543      Jungle Book, The (1967)
957           Shining, The (1980)
1052        101 Dalmatians (1996)
2034    Muppets From Space (1999)
1291               Titanic (1997)
2391      Any Given Sunday (1999)
337              True Lies (1994)
1705    Overnight Delivery (1998)
3358        Without a Clue (1988)
Name: title, dtype: object

In [65]:
top_user_movies(1, 10)

Unnamed: 0,title,rating
0,Seven (a.k.a. Se7en) (1995),5.0
1,"Usual Suspects, The (1995)",5.0
2,Bottle Rocket (1996),5.0
3,Dumb & Dumber (Dumb and Dumber) (1994),5.0
4,Billy Madison (1995),5.0
5,Desperado (1995),5.0
6,Canadian Bacon (1995),5.0
7,Rob Roy (1995),5.0
8,Pinocchio (1940),5.0
9,Tombstone (1993),5.0


In [64]:
# count how many movies with title "Jungle Book, The (1994)"
movies_df[movies_df['title'].str.contains('Jungle Book, The')]

Unnamed: 0,movieId,title,genres
320,362,"Jungle Book, The (1994)",Adventure|Children|Romance
1543,2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
