Código disponível no livro: https://link.springer.com/chapter/10.1007%2F978-1-4842-6513-0_10

In [None]:
! pip install tensorflow

In [1]:
import tensorflow as tf
from zipfile import ZipFile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import get_file

In [2]:
URL = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_path = get_file("movielens.zip", URL, extract=True)

In [3]:
with ZipFile(movielens_path) as z:
    with z.open("ml-latest-small/ratings.csv") as f:
        df = pd.read_csv(f)
print(df.shape)
df.head(3)

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [4]:
#Processando os ids dos usuários - padronização
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user_encoded2user = {i: x for i, x in enumerate(user_ids)}
df["user"] = df["userId"].map(user2user_encoded)
num_users = len(user_encoded2user)

In [5]:
#Processando os ids dos Filmes - padronização
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["movie"] = df["movieId"].map(movie2movie_encoded)
num_movies = len(movie_encoded2movie)

In [6]:
print("Number of users: ", num_users,
      "\nNumber of Movies: ", num_movies)

Number of users:  610 
Number of Movies:  9724


In [7]:
#normalizar as avaliações com minmax (eficiência)
min, max  = df["rating"].min(), df["rating"].max()
df["rating"] = df["rating"].apply(lambda x:(x-min)/(max-min))
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
0,1,1,0.777778,964982703,0,0
1,1,3,0.777778,964981247,0,1
2,1,6,0.777778,964982224,0,2


In [8]:
#Definição do X e y para treino e teste
X = df[["user", "movie"]].values
y = df["rating"].values

(x_train, x_val, y_train, y_val) = train_test_split(
          X, y,
          test_size=0.1,
          random_state=42)

print("Shape of the x_train: ", x_train.shape)
print("Shape of the y_train: ", y_train.shape)
print("Shape of the x_val: ", x_val.shape)
print("Shape of the x_val: ", y_val.shape)

Shape of the x_train:  (90752, 2)
Shape of the y_train:  (90752,)
Shape of the x_val:  (10084, 2)
Shape of the x_val:  (10084,)


In [9]:
class RecommenderNet(tf.keras.Model):
    # __init function is to initialize the values of
    # instance members for the new object
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        # Variable for embedding size
        self.embedding_size = embedding_size
        # Variables for user count, and related weights and biases
        self.num_users = num_users
        self.user_embedding = Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=tf.keras.regularizers.l2(1e-6),
        )
        self.user_bias = Embedding(num_users, 1)
        # Variables for movie count, and related weights and biases
        self.num_movies = num_movies
        self.movie_embedding = Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=tf.keras.regularizers.l2(1e-6),
        )
        self.movie_bias = Embedding(num_movies, 1)
        
    
    def call(self, inputs):
        # call function is for the dot products
        # of user and movie vectors
        # It also accepts the inputs, feeds them into the layers,
        # and feed into the final sigmoid layer
        # User vector and bias values with input values
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        # Movie vector and bias values with input values
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        # tf.tensordot calculcates the dot product
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)


In [10]:

model = RecommenderNet(num_users, num_movies, embedding_size=50)


In [11]:
model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.Adam(lr=0.001)
)

In [12]:
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
#obter usuário qualquer
user_id = df.userId.sample(1).iloc[0]
print("The selected user ID is: ", user_id)

The selected user ID is:  443


In [14]:
#filtrar filmes que usuário já assistiu
movies_watched = df[df.userId == user_id]
not_watched = df[~df['movieId'].isin(movies_watched.movieId.values)]['movieId'].unique()
not_watched = [[movie2movie_encoded.get(x)] for x in not_watched]
print('The number of movies the user has not seen before: ', len(not_watched))

The number of movies the user has not seen before:  9687


In [15]:
#obtem id atualizado do usuário e prediz os filmes para ele
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(
        ([[user_encoder]] * len(not_watched), not_watched )
        )
ratings = model.predict(user_movie_array).flatten()

In [16]:
#ordena itens e retona índices dos filmes
top10_indices = ratings.argsort()[-10:][::-1]

In [17]:
#pega o id original do filme 
recommended_movie_ids = [movie_encoded2movie.get(not_watched[x][0]) for x in top10_indices]

In [18]:
with ZipFile(movielens_path) as z:
    with z.open("ml-latest-small/movies.csv") as f:
        movie_df = pd.read_csv(f)
movie_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [19]:
#resgatar os nomes dos filmes que usuário assistiu com valores altos de avaliação
top_movies_user = (
    movies_watched.sort_values(by="rating", ascending=False)
    .head(10)
    .movieId.values
)
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]

print("Movies with high ratings from user")
movie_df_rows[['title','genres']]

Movies with high ratings from user


Unnamed: 0,title,genres
277,"Shawshank Redemption, The (1994)",Crime|Drama
314,Forrest Gump (1994),Comedy|Drama|Romance|War
520,Fargo (1996),Comedy|Crime|Drama|Thriller
4137,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
7767,The Hunger Games (2012),Action|Adventure|Drama|Sci-Fi|Thriller
7966,"Odd Life of Timothy Green, The (2012)",Comedy|Drama|Fantasy
8023,Silver Linings Playbook (2012),Comedy|Drama
8295,The Hunger Games: Catching Fire (2013),Action|Adventure|Sci-Fi|IMAX
8312,"Secret Life of Walter Mitty, The (2013)",Adventure|Comedy|Drama
8572,The Hunger Games: Mockingjay - Part 1 (2014),Adventure|Sci-Fi|Thriller


In [20]:
#top 10 que a filtragem colaborativa recomendaria
recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)]
print("Top 10 movie recommendations")
recommended_movies[['title','genres']]

Top 10 movie recommendations


Unnamed: 0,title,genres
46,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
461,Schindler's List (1993),Drama|War
474,Blade Runner (1982),Action|Sci-Fi|Thriller
659,"Godfather, The (1972)",Crime|Drama
922,"Godfather: Part II, The (1974)",Crime|Drama
951,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
975,Cool Hand Luke (1967),Drama
2462,"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller
3622,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
7355,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
