In [None]:
# Instalando biblioteca fuzzywuzzy para otimizações
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
from google.colab import files
import pandas as pd
import io
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process



In [None]:
# Upload the locally stored datasets through the Colab file picker.
# Both movies.csv and ratings.csv must be provided.
uploaded = files.upload()

Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv


In [None]:
# Wrap the uploaded movies.csv bytes in an in-memory binary buffer.
movies_uploaded_file = io.BytesIO(uploaded['movies.csv'])

# Parse the buffer with pandas, keeping only the columns used later,
# and preview the first rows as a quick sanity check.
movies = pd.read_csv(
    movies_uploaded_file,
    usecols=['movieId', 'title'],
)
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [None]:
# Wrap the uploaded ratings.csv bytes in an in-memory binary buffer.
ratings_uploaded_file = io.BytesIO(uploaded['ratings.csv'])

# Parse the buffer with pandas, keeping only the columns used later,
# and preview the first rows as a quick sanity check.
ratings = pd.read_csv(
    ratings_uploaded_file,
    usecols=['userId', 'movieId', 'rating'],
)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
# Check the shape (rows, columns) of the movies dataset.
movies.shape
# Alternative check for the ratings dataset: ratings.shape

(9742, 2)

Criar um dataset chamado ***movies x users*** que conterá como linhas os IDs de filmes (*movieId*) e como colunas os IDs de usuários (*userId*), e que deverá ser preenchido com as avaliações dos usuários (*ratings*).

In [None]:
# Build the movies x users dataframe: one row per movieId, one column per userId,
# each cell holding the rating; missing (movie, user) pairs are filled with 0.
movies_x_users_dataframe = (
    ratings
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
movies_x_users_dataframe.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Build a SciPy CSR sparse matrix from the values of the movies x users dataframe
movies_matrix = csr_matrix(movies_x_users_dataframe.values)

Utilização da similaridade de cosseno como métrica entre os vetores da matriz esparsa. As linhas são os IDs de filmes e as colunas são os IDs de usuários. Cada linha será considerada como um vetor (x1, x2, x3, ..., xN) e será calculada a similaridade de cosseno entre esses vetores. A recomendação será, de fato, baseada em quão similares (em %) são os filmes, de acordo com as avaliações dadas pelos usuários.

In [None]:
# Unsupervised nearest-neighbours search (KNN) over the movie rating vectors,
# using brute-force cosine distance and 20 neighbours by default.
model = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=20,
)
# Kept as a separate last statement so the cell still displays the fitted estimator.
model.fit(movies_matrix)

In [None]:
def get_recommended_movies(movie_name: str, data, neighbors_number):
  """Print the movies most similar to the one whose title best matches `movie_name`.

  The title is resolved with fuzzy matching (`fuzzywuzzy.process.extractOne`) against
  `movies['title']`, then the fitted `model` is queried for the nearest rating vectors.

  Args:
    movie_name: Free-text (possibly partial or misspelled) movie title.
    data: Matrix of rating vectors, one row per movie, in the same row order as
      `movies_x_users_dataframe` (expected to be the matrix `model` was fitted on).
    neighbors_number: Number of neighbours requested from the model. The queried
      movie itself is normally among them and is left out of the printed result.

  Returns:
    None. The matched title and the recommended titles are printed.
  """
  # With a Series as choices, extractOne yields (matched title, score, index label),
  # so element [2] is the row label of the match in `movies`, not the title.
  best_match = process.extractOne(movie_name, movies['title'])
  if best_match is None:
    print('Nenhum filme encontrado para: ', movie_name)
    return
  idx = best_match[2]
  print('Filme encontrado: ', movies['title'][idx], 'Index: ', idx)

  # The pivot is built from `ratings` only, so its rows are the movieIds that have
  # ratings, not the rows of `movies`. Translate through movieId instead of reusing
  # `idx` as a matrix row, otherwise the wrong vector is queried as soon as one
  # movie has no ratings.
  rated_movie_ids = movies_x_users_dataframe.index
  movie_id = movies['movieId'][idx]
  if movie_id not in rated_movie_ids:
    print('Filme sem avaliações; não é possível gerar recomendações.')
    return
  row = rated_movie_ids.get_loc(movie_id)

  # Cosine distances are not needed for the listing; a distance of 0 means 100%
  # similarity, which is normally the queried movie itself.
  _, indexes = model.kneighbors(data[row], n_neighbors=neighbors_number)

  # kneighbors returns one row of neighbour positions per query vector; drop the
  # queried movie and map matrix rows -> movieId -> rows of `movies`.
  neighbor_rows = indexes[0][indexes[0] != row]
  neighbor_ids = rated_movie_ids[neighbor_rows]
  positions = pd.Index(movies['movieId']).get_indexer(neighbor_ids)
  # get_indexer marks ids missing from `movies` with -1; skip those.
  print(movies['title'].iloc[positions[positions >= 0]])


In [None]:
# Example query: ask for the 10 nearest neighbours of the best match for "lion king".
movies_to_recommend_quantity = 10
get_recommended_movies(
    'lion king',
    data=movies_matrix,
    neighbors_number=movies_to_recommend_quantity,
)

Filme encontrado:  Lion King, The (1994) Index:  322
322                            NaN
506                 Aladdin (1992)
512    Beauty and the Beast (1991)
436          Mrs. Doubtfire (1993)
325               Mask, The (1994)
418           Jurassic Park (1993)
1                   Jumanji (1995)
314            Forrest Gump (1994)
32                     Babe (1995)
504              Home Alone (1990)
Name: title, dtype: object
