## Importando Bibliotecas
  - Diferente do exemplo anterior, agora vamos aplicar o algoritmo KNN para fazermos um sistema de recomendação

In [1]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np

## Carregando e Tratando base
  - estas bases contêm nomes de filmes, seus gêneros, ids de usuários e avaliações que os próprios usuários fizeram dos filmes
  - cada avaliação é uma nota entre 0.0 e 5.0
  - precisamos ajustar essas bases, pois nem todos os usuários avaliam todos os filmes

In [2]:
# Load the two CSV datasets (movie catalogue and user ratings) with pandas.
uri_filmes = 'https://raw.githubusercontent.com/RafaelBernardo18/aprendizado-de-maquina/main/movies.csv'
uri_notas = 'https://raw.githubusercontent.com/RafaelBernardo18/aprendizado-de-maquina/main/ratings.csv'

# One read_csv call per URL, unpacked in the same order as the URLs above.
filmes_df, notas_df = map(pd.read_csv, (uri_filmes, uri_notas))

In [3]:
# Preview the first rows of the movies table
filmes_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table
notas_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
filmes_df.info() # inspect the columns, non-null counts and dtypes of the movies table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  45843 non-null  int64 
 1   title    45843 non-null  object
 2   genres   45843 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [6]:
notas_df.info() # inspect the columns, non-null counts and dtypes of the ratings table
# Note that this table is much larger than the previous one because the same user can rate several movies;
# consequently every single rating is one row (instance) of the table.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
# Join ratings and movies on `movieId`, the column present in both tables.
# This is an inner join: only ratings whose movieId also exists in `filmes_df` are kept,
# and each kept rating row gains the movie's `title` and `genres`.
base = notas_df.merge(filmes_df, on='movieId', how='inner')
base.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [8]:
# Inspect the columns of the merged table
base.info()

# Note: had we used concat instead, we would probably have ended up with many unfilled fields

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100513 entries, 0 to 100512
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100513 non-null  int64  
 1   movieId    100513 non-null  int64  
 2   rating     100513 non-null  float64
 3   timestamp  100513 non-null  int64  
 4   title      100513 non-null  object 
 5   genres     100513 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [9]:
# Count how many ratings each title received.
# The counted Series is renamed before the index is turned back into a column,
# so the result has exactly two columns: `title` and `Cont_notas_totais`.
filmes_contagem_notas = (
    base.groupby('title')['rating']
    .count()
    .rename('Cont_notas_totais')
    .reset_index()
)
filmes_contagem_notas.head()

Unnamed: 0,title,Cont_notas_totais
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [10]:
# Finally, attach the per-title rating count to every rating row.
# `title` is the only column the two tables share, so it is the join key.
notas_com_contagem_total = pd.merge(base, filmes_contagem_notas, on='title')
notas_com_contagem_total.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Cont_notas_totais
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [11]:
notas_com_contagem_total.info() # check whether there are null / unfilled fields

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100513 entries, 0 to 100512
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   userId             100513 non-null  int64  
 1   movieId            100513 non-null  int64  
 2   rating             100513 non-null  float64
 3   timestamp          100513 non-null  int64  
 4   title              100513 non-null  object 
 5   genres             100513 non-null  object 
 6   Cont_notas_totais  100513 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 6.1+ MB


In [12]:
# To help the recommendation step, keep only movies that received at least `limite` ratings.
limite = 50
filmes_populares = notas_com_contagem_total.loc[
    lambda tabela: tabela['Cont_notas_totais'] >= limite
]
filmes_populares.tail()
# The end of the filtered table shows a movie with 53 ratings; no movie below the threshold of 50 remains.

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Cont_notas_totais
79168,603,1997,4.0,953925513,"Exorcist, The (1973)",Horror|Mystery,53
79169,606,1997,3.0,1178911117,"Exorcist, The (1973)",Horror|Mystery,53
79170,607,1997,5.0,963079420,"Exorcist, The (1973)",Horror|Mystery,53
79171,608,1997,4.5,1117502891,"Exorcist, The (1973)",Horror|Mystery,53
79172,610,1997,4.0,1479543021,"Exorcist, The (1973)",Horror|Mystery,53


In [13]:
filmes_populares.info()
# Note that the filter also reduced the number of rows (instances) of the table significantly

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41362 entries, 0 to 79172
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   userId             41362 non-null  int64  
 1   movieId            41362 non-null  int64  
 2   rating             41362 non-null  float64
 3   timestamp          41362 non-null  int64  
 4   title              41362 non-null  object 
 5   genres             41362 non-null  object 
 6   Cont_notas_totais  41362 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 2.5+ MB


## Extraindo Valores de Interesse
  - dessa vez vamos utilizar o método de tabela pivot e a matriz CSR

In [14]:
# Build the title x user rating table: one row per movie title, one column per userId,
# each cell holding the rating that user gave to that movie (the mean, should a pair repeat).
# Movies a user did not rate come out as NaN and are replaced by 0.
valores_filmes = (
    filmes_populares
    .pivot_table(index='title', columns='userId', values='rating', aggfunc='mean')
    .fillna(0)
)

valores_filmes.head()

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,570,571,572,573,574,575,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,3.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0,4.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,3.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,3.5,0.0,0.0,4.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [15]:
# Convert the dense rating table into a sparse CSR matrix, which stores only the
# non-zero ratings together with their (row, column) coordinates.
valores_filmes_matrix = csr_matrix(valores_filmes.to_numpy())

In [16]:
# Printing the sparse matrix lists its stored entries as "(row, column) value"
print(valores_filmes_matrix)

  (0, 11)	5.0
  (0, 18)	3.0
  (0, 66)	4.5
  (0, 90)	5.0
  (0, 102)	5.0
  (0, 109)	4.0
  (0, 130)	4.0
  (0, 151)	1.0
  (0, 156)	3.5
  (0, 174)	4.5
  (0, 179)	4.0
  (0, 195)	1.0
  (0, 197)	5.0
  (0, 216)	2.0
  (0, 233)	5.0
  (0, 246)	3.5
  (0, 257)	4.5
  (0, 271)	3.0
  (0, 272)	4.0
  (0, 277)	4.0
  (0, 283)	3.0
  (0, 295)	2.5
  (0, 304)	0.5
  (0, 313)	3.0
  (0, 322)	3.0
  :	:
  (449, 378)	2.0
  (449, 384)	3.0
  (449, 410)	0.5
  (449, 411)	4.0
  (449, 416)	4.0
  (449, 417)	4.0
  (449, 421)	3.0
  (449, 423)	2.0
  (449, 435)	2.5
  (449, 444)	3.0
  (449, 476)	4.5
  (449, 479)	5.0
  (449, 480)	4.5
  (449, 485)	1.5
  (449, 491)	5.0
  (449, 521)	4.0
  (449, 530)	4.0
  (449, 538)	1.5
  (449, 556)	4.0
  (449, 557)	3.0
  (449, 569)	5.0
  (449, 585)	2.5
  (449, 595)	4.5
  (449, 603)	3.0
  (449, 605)	4.0


## Treinando Modelo e Testando Recomendação
  - utilizando o algoritmo KNN

In [17]:
# NearestNeighbors is an unsupervised neighbour search (not a classifier): fitting it
# just indexes the movie rows so the closest ones can be queried later.
# We start with cosine distance and brute-force search; you can also try the Euclidean metric.
modelo = NearestNeighbors(algorithm = 'brute', metric = 'cosine')
modelo.fit(valores_filmes_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [18]:
# Testing the recommendations

# Positional row index of the query movie inside `valores_filmes` (one row per title).
# Note: this is NOT a movieId; to look a movie up by name use
# valores_filmes.index.get_loc('<title>').
valor_index = 133

# kneighbors returns the distances (cosine metric here) and the row positions of the
# instances closest to the query row. In the output below the query movie itself comes
# back first with distance 0, so 6 neighbours give 5 actual recommendations.
consulta = valores_filmes.iloc[valor_index, :].values.reshape(1, -1)
distancias, indices = modelo.kneighbors(consulta, n_neighbors = 6)

# Showing the recommendations.
# kneighbors keeps its results inside 2-D ndarrays, so they are flattened first.
# flatten returns a copy, hence it is called once here instead of on every loop iteration.
distancias_planas = distancias.flatten()
indices_planos = indices.flatten()

for i, (indice, distancia) in enumerate(zip(indices_planos, distancias_planas)):
    # Map the row position back to the movie title through the pivot table's index.
    print('{0} : {1}, with distance of {2}:'.format(i, valores_filmes.index[indice], distancia))

0 : Donnie Darko (2001), with distance of 0.0:
1 : Eternal Sunshine of the Spotless Mind (2004), with distance of 0.39966069309466756:
2 : Kill Bill: Vol. 1 (2003), with distance of 0.4030251926867888:
3 : Fight Club (1999), with distance of 0.43322443692634793:
4 : Kill Bill: Vol. 2 (2004), with distance of 0.43418969688772413:
5 : Memento (2000), with distance of 0.43439523146729386:
