Neste projeto, será criado um sistema de recomendação de filmes utilizando os dados do MovieLens.

# Imports

In [39]:
import pandas as pd

# Get Data

In [40]:
# lendo os dados
filmes = pd.read_csv('movies.csv')
notas = pd.read_csv('ratings.csv')

# alterando os nomes das colunas
filmes.columns = ['filmeId', 'titulo', 'generos']
notas.columns = ['usuarioId', 'filmeId', 'nota', 'momento']

# EDA

In [41]:
filmes.head()

Unnamed: 0,filmeId,titulo,generos
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
notas.head()

Unnamed: 0,usuarioId,filmeId,nota,momento
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [43]:
notas.describe()

Unnamed: 0,usuarioId,filmeId,nota,momento
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


As notas vão de 0.5 a 5.

# Primeira abordagem: total de votos

Para a primeira abordagem, pode-se olhar as notas que os usuários atribuíram ao filme.

In [44]:
filmes.head()

Unnamed: 0,filmeId,titulo,generos
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
notas.head()

Unnamed: 0,usuarioId,filmeId,nota,momento
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [46]:
# criando tabela única para melhor análise
df = pd.merge(notas, filmes, how = 'left', right_on = 'filmeId',
left_on = 'filmeId')

In [47]:
df.head()

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [48]:
# encontrando o filme com mais números de votos
df['filmeId'].value_counts().head()

356     329
318     317
296     307
593     279
2571    278
Name: filmeId, dtype: int64

In [49]:
## filtrando o filme com mais número de votos
df[df['filmeId'] == df['filmeId'].value_counts().index[0]]

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos
20,1,356,4.0,964980962,Forrest Gump (1994),Comedy|Drama|Romance|War
725,6,356,5.0,845553200,Forrest Gump (1994),Comedy|Drama|Romance|War
880,7,356,5.0,1106635915,Forrest Gump (1994),Comedy|Drama|Romance|War
1053,8,356,3.0,839463527,Forrest Gump (1994),Comedy|Drama|Romance|War
1120,10,356,3.5,1455301685,Forrest Gump (1994),Comedy|Drama|Romance|War
...,...,...,...,...,...,...
97154,605,356,3.0,1277097509,Forrest Gump (1994),Comedy|Drama|Romance|War
97428,606,356,4.0,1171231370,Forrest Gump (1994),Comedy|Drama|Romance|War
98735,608,356,3.0,1117162603,Forrest Gump (1994),Comedy|Drama|Romance|War
99514,609,356,4.0,847220869,Forrest Gump (1994),Comedy|Drama|Romance|War


Sendo o mais votado, poderia-se recomendar para os outros usuários.

In [50]:
# criando coluna com a quantidade de notas atribuídas aos filmes
df_aux = df.groupby(by = 'filmeId').agg({'nota':'count'}).reset_index()
df_aux.head()

Unnamed: 0,filmeId,nota
0,1,215
1,2,110
2,3,52
3,4,7
4,5,49


In [51]:
# juntando o dataframe inicial com a coluna de quantidade de notas
df = pd.merge(df, df_aux, how = 'left',
right_on = 'filmeId', left_on = 'filmeId')

In [52]:
# renomeando colunas
df.rename(columns = {'nota_x':'nota',
                    'nota_y':'total_de_votos'}, inplace = True)

In [53]:
# obtendo os filmes mais votados pelas pessoas
df.groupby(by = ['titulo']).agg({'total_de_votos':'count'}).reset_index().sort_values(by = 'total_de_votos', ascending = False).head(10)

Unnamed: 0,titulo,total_de_votos
3158,Forrest Gump (1994),329
7593,"Shawshank Redemption, The (1994)",317
6865,Pulp Fiction (1994),307
7680,"Silence of the Lambs, The (1991)",279
5512,"Matrix, The (1999)",278
8001,Star Wars: Episode IV - A New Hope (1977),251
4662,Jurassic Park (1993),238
1337,Braveheart (1995),237
8363,Terminator 2: Judgment Day (1991),224
7421,Schindler's List (1993),220


Para a primeira abordagem, estes seriam os filmes recomendados para os usuários.

In [54]:
# obtendo os filmes mais votados pelas pessoas e suas notas médias
df_aux2 = df.groupby(by = ['titulo', 'filmeId']).agg({'total_de_votos':'count', 'nota':'mean'}).reset_index().sort_values(by = 'total_de_votos', ascending = False)
df_aux2.rename(columns = {'nota':'nota_media'}, inplace = True)
df_aux2

Unnamed: 0,titulo,filmeId,total_de_votos,nota_media
3161,Forrest Gump (1994),356,329,4.164134
7597,"Shawshank Redemption, The (1994)",318,317,4.429022
6868,Pulp Fiction (1994),296,307,4.197068
7684,"Silence of the Lambs, The (1991)",593,279,4.161290
5515,"Matrix, The (1999)",2571,278,4.192446
...,...,...,...,...
4774,King Ralph (1991),7005,1,1.500000
4772,King Kong Lives (1986),2368,1,2.000000
4767,"Kindred, The (1986)",2740,1,1.000000
4766,Kindergarten Cop 2 (2016),158254,1,1.500000


As notas médias não são necessariamente as melhores para os filmes mais votados.

In [55]:
df_aux3 = df_aux2[['filmeId', 'nota_media']]
df_aux3.head()

Unnamed: 0,filmeId,nota_media
3161,356,4.164134
7597,318,4.429022
6868,296,4.197068
7684,593,4.16129
5515,2571,4.192446


In [56]:
df = pd.merge(df, df_aux3, how = 'left', right_on = 'filmeId', left_on = 'filmeId')

In [59]:
df.head()

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos,total_de_votos,nota_media
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,52,3.259615
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,102,3.946078
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,203,3.975369
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204,4.237745


# Segunda abordagem - ordenar pela nota média

In [60]:
df.sort_values(by = 'nota_media', ascending = False).head(10)

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos,total_de_votos,nota_media
66992,432,45503,5.0,1316391388,Peaceful Warrior (2006),Drama,1,5.0
16936,105,179133,5.0,1526207351,Loving Vincent (2017),Animation|Crime|Drama,1,5.0
16885,105,147196,5.0,1526207040,The Girls (1961),Comedy|Romance,1,5.0
64142,414,6442,5.0,1055265907,Belle époque (1992),Comedy|Romance,2,5.0
17249,110,7096,5.0,1175330140,Rivers and Tides (2001),Documentary,1,5.0
28004,191,496,5.0,829760898,What Happened Was... (1994),Comedy|Drama|Romance|Thriller,1,5.0
68366,443,96430,5.0,1501722543,"Odd Life of Timothy Green, The (2012)",Comedy|Drama|Fantasy,1,5.0
78207,484,71268,5.0,1342300642,Tyler Perry's I Can Do Bad All by Myself (2009),Comedy|Drama,1,5.0
16886,105,147250,5.0,1526207354,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),1,5.0
16917,105,172587,5.0,1526207610,Vacations in Prostokvashino (1980),Animation,1,5.0


Estes filmes possuem altas notas médias, porém baixas notas 1. Os filmes não devem ser recomendados somente pelas maiores notas médias, pois estes filmes podem ser filmes de nicho ou votados somente por poucos usuários.

# Terceira abordagem: recomendação por gêneros

Pode-se recomendar ao usuário filmes similares em gênero aos que este assistiu.

Peguemos um número qualquer de usuarioId e vejamos os primeiros 10 filmes assistidos por ele.

In [72]:
df[df['usuarioId'] == 5].head(10)

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos,total_de_votos,nota_media
516,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093
517,5,21,4.0,847435238,Get Shorty (1995),Comedy|Crime|Thriller,89,3.494382
518,5,34,4.0,847434881,Babe (1995),Children|Drama,128,3.652344
519,5,36,4.0,847435292,Dead Man Walking (1995),Crime|Drama,67,3.835821
520,5,39,3.0,847434961,Clueless (1995),Comedy|Romance,104,3.293269
521,5,50,4.0,847434881,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204,4.237745
522,5,58,5.0,847435238,"Postman, The (Postino, Il) (1994)",Comedy|Drama|Romance,37,4.027027
523,5,110,4.0,847434880,Braveheart (1995),Action|Drama|War,237,4.031646
524,5,150,3.0,847434748,Apollo 13 (1995),Adventure|Drama|IMAX,201,3.845771
525,5,153,3.0,847434802,Batman Forever (1995),Action|Adventure|Comedy|Crime,137,2.916058


Baseado nestes filmes que o usuário assistiu, pode-se recomendar filmes baseados nos últimos filmes assistidos por ele na plataforma.

In [106]:
df

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos,total_de_votos,nota_media
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.920930
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,52,3.259615
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,102,3.946078
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,203,3.975369
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204,4.237745
...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller,6,3.333333
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller,7,4.142857
100833,610,168250,5.0,1494273047,Get Out (2017),Horror,15,3.633333
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi,25,4.280000


In [105]:
df[df['generos'] == 'Action|Adventure|Comedy|Crime'].sort_values(by = 'nota_media', ascending = False)

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos,total_de_votos,nota_media
100732,610,119145,4.5,1493846044,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,36,3.986111
7902,52,119145,4.0,1468051301,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,36,3.986111
39168,272,119145,4.0,1532266319,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,36,3.986111
41224,279,119145,4.0,1506394589,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,36,3.986111
43926,292,119145,4.0,1483193792,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,36,3.986111
...,...,...,...,...,...,...,...,...
40007,274,5803,1.5,1197189341,I Spy (2002),Action|Adventure|Comedy|Crime,7,2.500000
40964,275,5803,2.0,1049077559,I Spy (2002),Action|Adventure|Comedy|Crime,7,2.500000
69485,448,5803,2.0,1087149734,I Spy (2002),Action|Adventure|Comedy|Crime,7,2.500000
67997,438,5803,3.0,1105654270,I Spy (2002),Action|Adventure|Comedy|Crime,7,2.500000


In [99]:
df[df['generos'] == 'Action|Adventure|Comedy|Crime'].sort_values(by = 'nota_media', ascending = False)['titulo'].unique()

array(['Kingsman: The Secret Service (2015)',
       'Rumble in the Bronx (Hont faan kui) (1995)',
       'Nothing to Lose (1997)', "It's a Mad, Mad, Mad, Mad World (1963)",
       'Batman Forever (1995)', 'Crime Busters (1977)', 'I Spy (2002)'],
      dtype=object)

Estes são os filmes com maiores notas médias dentro do gênero do último filme assistido pelo usuário.

In [131]:
df[['titulo', 'generos', 'total_de_votos','nota_media']].set_index(['generos','titulo']).sort_values(by='nota_media', ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_de_votos,nota_media
generos,titulo,Unnamed: 2_level_1,Unnamed: 3_level_1
Drama,Peaceful Warrior (2006),1,5.0
Animation|Crime|Drama,Loving Vincent (2017),1,5.0
Comedy|Romance,The Girls (1961),1,5.0
Comedy|Romance,Belle époque (1992),2,5.0
Documentary,Rivers and Tides (2001),1,5.0
...,...,...,...
Horror|Thriller,Films to Keep You Awake: The Christmas Tale (Películas para no dormir: Cuento de navidad) (2005),1,0.5
Children|Comedy,Tooth Fairy 2 (2012),1,0.5
Drama,"Cincinnati Kid, The (1965)",1,0.5
(no genres listed),Ben-hur (2016),1,0.5


# Usuários similares

In [134]:
df.head()

Unnamed: 0,usuarioId,filmeId,nota,momento,titulo,generos,total_de_votos,nota_media
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,52,3.259615
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,102,3.946078
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,203,3.975369
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204,4.237745
