# Importa as bibliotecas e carrega os dados

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import TruncatedSVD

In [2]:
#Carrega as avaliações
ratings_file = 'ratings.csv'
df_ratings = pd.read_csv(ratings_file)
df_ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [3]:
#Carrega os filmes
movies_file = 'movies.csv'
df_movies = pd.read_csv(movies_file)
df_movies.dtypes

movieId     int64
title      object
genres     object
dtype: object

# Pré processamento dos dados, onde removemos os outliers

In [4]:
movie_data = pd.merge(df_ratings, df_movies, on='movieId') #Junta as duas tabelas ligadas pelo "MovieId"
colunas = ['timestamp', 'genres'] # Remover essas colunas que não usaremos
movie_data = movie_data.drop(colunas, axis=1)
movie_data = movie_data.dropna(axis = 0, subset = ['title']) #Remove todas as linhas que estao com titulo vazio
movie_data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,16,4.0,Casino (1995)
1,9,16,4.0,Casino (1995)
2,12,16,1.5,Casino (1995)
3,24,16,4.0,Casino (1995)
4,29,16,3.0,Casino (1995)


In [5]:
movie_ratingTotal = (movie_data.
     groupby(by = ['title'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['title', 'totalRatingCount']]
    )
movie_ratingTotal.head(5) #Soma todas as análises que um filme recebeu e adiciona em uma coluna

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),1
3,'Til There Was You (1997),3
4,"'burbs, The (1989)",20


In [6]:
movie_dataC = movie_data.merge(movie_ratingTotal, left_on = 'title', right_on = 'title', how = 'left') #Coloca o total junto com as análises
movie_dataC.head(10)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,16,4.0,Casino (1995),84
1,9,16,4.0,Casino (1995),84
2,12,16,1.5,Casino (1995),84
3,24,16,4.0,Casino (1995),84
4,29,16,3.0,Casino (1995),84
5,31,16,4.0,Casino (1995),84
6,47,16,4.0,Casino (1995),84
7,60,16,4.0,Casino (1995),84
8,62,16,4.5,Casino (1995),84
9,88,16,3.5,Casino (1995),84


In [7]:
Data_Final = movie_dataC.drop_duplicates(['userId','title']) #Remove os duplicados
Data_Final.head(10)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,16,4.0,Casino (1995),84
1,9,16,4.0,Casino (1995),84
2,12,16,1.5,Casino (1995),84
3,24,16,4.0,Casino (1995),84
4,29,16,3.0,Casino (1995),84
5,31,16,4.0,Casino (1995),84
6,47,16,4.0,Casino (1995),84
7,60,16,4.0,Casino (1995),84
8,62,16,4.5,Casino (1995),84
9,88,16,3.5,Casino (1995),84


In [8]:
Data_Final = Data_Final.pivot(index = 'userId', columns = 'title', values = 'rating') #Faz com que as linhas sejam os usuários e as colunas sejam os filmes, com as intercecções a nota dada
Data_Final = Data_Final.fillna(0) #Preenche os valores vazios
Data_Final.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
X = Data_Final.values #Retorna apenas os valores, sem os nomes e IDs
X = X.T #Retorna a Matriz Transposta
X.shape

(10323, 668)

In [10]:
SVD = TruncatedSVD(n_components=100) #Faz uma redução linear de uma matriz, Parecido com o PCA mas é um método diferente
matrix = SVD.fit_transform(X) #Utiliza a transposta

corr = np.corrcoef(matrix) #Calcula o coeficiente de correlação entre os filmes
Titulos = Data_Final.columns
lista_titulos = list(Titulos)

#Função que pega o nome do filme e acha os semelhantes

In [11]:
def IndicaFilme(nomefilme):
    for movie in lista_titulos:
        if(nomefilme in movie): #Acha o filme
            index=lista_titulos.index(movie) 
            print("Filme escolhido: ", movie) #Para ter certeza que é o filme certo
            nomefilme=movie
            break;
    ListaCorrelacao = corr[index] #Pega a lista da correção de todos os filmes com aquele filme
    count=0
    Correlacao = 0.99
    while(1):
        if(len(Titulos[(ListaCorrelacao >= Correlacao)])<10): #Tenta a maior correlação
            Correlacao-=0.01
        else:
          break
    for movie in Titulos[(ListaCorrelacao >= Correlacao)]: #Filtra as correlações maiores(Mais próximas)
        if count>10:
          break;
        if(nomefilme!=movie):
            print(movie)
            count+=1

# Testes

In [12]:
IndicaFilme("Minions")

Filme escolhido:  Minions (2015)
Dead Man Down (2013)
Epic (2013)
Expendables 2, The (2012)
Frozen Ground, The (2013)
Goon (2011)
Grown Ups 2 (2013)
Hitman: Agent 47 (2015)
Hotel Transylvania (2012)
Kick-Ass 2 (2013)
Pain & Gain (2013)


In [13]:
IndicaFilme("Batman Begins")

Filme escolhido:  Batman Begins (2005)
300 (2007)
Bourne Identity, The (2002)
Bourne Ultimatum, The (2007)
Casino Royale (2006)
Dark Knight, The (2008)
Incredibles, The (2004)
Iron Man (2008)
Lord of the Rings: The Return of the King, The (2003)
Lord of the Rings: The Two Towers, The (2002)
Sin City (2005)
Star Wars: Episode III - Revenge of the Sith (2005)


In [14]:
IndicaFilme("Pirates of the Caribbean")

Filme escolhido:  Pirates of the Caribbean: At World's End (2007)
300 (2007)
Avatar (2009)
Iron Man (2008)
Live Free or Die Hard (2007)
Pirates of the Caribbean: Dead Man's Chest (2006)
Quantum of Solace (2008)
Sherlock Holmes (2009)
Transformers (2007)
WALL·E (2008)
X-Men Origins: Wolverine (2009)
Yes Man (2008)


In [15]:
IndicaFilme("Interstellar")

Filme escolhido:  Interstellar (2014)
Dark Knight Rises, The (2012)
Django Unchained (2012)
Edge of Tomorrow (2014)
Ex Machina (2015)
Gone Girl (2014)
Gravity (2013)
Nightcrawler (2014)
The Imitation Game (2014)
Wolf of Wall Street, The (2013)
