## **Importando as bibliotecas necessárias**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

## **Importando o dataset**

In [2]:
movies = pd.read_csv('archive/tmdb_5000_movies.csv', usecols=['id','original_title', 'genres', 'vote_average', 'keywords'], dtype={'id':'int64','original_title':'object', 'genres':'object', 'vote_average': 'float64', 'keywords': 'object'})
credits = pd.read_csv('archive/tmdb_5000_credits.csv')

## **Limpando os dados**

##### Convertendo array de objetos em strings

In [3]:
# mudando o tipo da coluna de gêneros do tipo array de objetos para  o tipo string
movies['genres'] = movies['genres'].apply(json.loads)
for index,i in zip(movies.index,movies['genres']):
    list1 = []
    for j in range(len(i)):
        list1.append((i[j]['name']))
    movies.loc[index,'genres'] = str(list1)

# mudando o tipo da coluna de palavras chave do tipo array de objetos para  o tipo string
movies['keywords'] = movies['keywords'].apply(json.loads)
for index,i in zip(movies.index,movies['keywords']):
    list1 = []
    for j in range(len(i)):
        list1.append((i[j]['name']))
    movies.loc[index,'keywords'] = str(list1)

# mudando o tipo da coluna de elenco do tipo array de objetos para  o tipo string
credits['cast'] = credits['cast'].apply(json.loads)
for index,i in zip(credits.index,credits['cast']):
    list1 = []
    for j in range(len(i)):
        list1.append((i[j]['name']))
    credits.loc[index,'cast'] = str(list1)

# mudando o tipo da coluna de equipe técnica do tipo array de objetos para  o tipo string    
credits['crew'] = credits['crew'].apply(json.loads)
def director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
credits['crew'] = credits['crew'].apply(director)
credits.rename(columns={'crew':'director'},inplace=True)

In [4]:
movies.head()

Unnamed: 0,genres,id,keywords,original_title,vote_average
0,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",19995,"['culture clash', 'future', 'space war', 'spac...",Avatar,7.2
1,"['Adventure', 'Fantasy', 'Action']",285,"['ocean', 'drug abuse', 'exotic island', 'east...",Pirates of the Caribbean: At World's End,6.9
2,"['Action', 'Adventure', 'Crime']",206647,"['spy', 'based on novel', 'secret agent', 'seq...",Spectre,6.3
3,"['Action', 'Crime', 'Drama', 'Thriller']",49026,"['dc comics', 'crime fighter', 'terrorist', 's...",The Dark Knight Rises,7.6
4,"['Action', 'Adventure', 'Science Fiction']",49529,"['based on novel', 'mars', 'medallion', 'space...",John Carter,6.1


##### Mergeando os dois datasets

In [5]:
movies = movies.merge(credits,left_on='id',right_on='movie_id',how='left')
movies = movies[['id','original_title','genres','cast','vote_average','director','keywords']]

In [6]:
movies.head()

Unnamed: 0,id,original_title,genres,cast,vote_average,director,keywords
0,19995,Avatar,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['Sam Worthington', 'Zoe Saldana', 'Sigourney ...",7.2,James Cameron,"['culture clash', 'future', 'space war', 'spac..."
1,285,Pirates of the Caribbean: At World's End,"['Adventure', 'Fantasy', 'Action']","['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",6.9,Gore Verbinski,"['ocean', 'drug abuse', 'exotic island', 'east..."
2,206647,Spectre,"['Action', 'Adventure', 'Crime']","['Daniel Craig', 'Christoph Waltz', 'Léa Seydo...",6.3,Sam Mendes,"['spy', 'based on novel', 'secret agent', 'seq..."
3,49026,The Dark Knight Rises,"['Action', 'Crime', 'Drama', 'Thriller']","['Christian Bale', 'Michael Caine', 'Gary Oldm...",7.6,Christopher Nolan,"['dc comics', 'crime fighter', 'terrorist', 's..."
4,49529,John Carter,"['Action', 'Adventure', 'Science Fiction']","['Taylor Kitsch', 'Lynn Collins', 'Samantha Mo...",6.1,Andrew Stanton,"['based on novel', 'mars', 'medallion', 'space..."


## **Gêneros**

#### Ordenando alfabeticamente lista de gêneros

In [7]:
movies['genres'] = movies['genres'].str.strip('[]').str.replace(' ','').str.replace("'",'') # transformando em array
movies['genres'] = movies['genres'].str.split(',')
for i,j in zip(movies['genres'],movies.index): # ordenando os gêneros por ordem alfabética
    list2=[]
    list2=i
    list2.sort()
    movies.loc[j,'genres']=str(list2)
movies['genres'] = movies['genres'].str.strip('[]').str.replace(' ','').str.replace("'",'') # transformando em array novamente
movies['genres'] = movies['genres'].str.split(',')

In [8]:
movies.head()

Unnamed: 0,id,original_title,genres,cast,vote_average,director,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","['Sam Worthington', 'Zoe Saldana', 'Sigourney ...",7.2,James Cameron,"['culture clash', 'future', 'space war', 'spac..."
1,285,Pirates of the Caribbean: At World's End,"[Action, Adventure, Fantasy]","['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",6.9,Gore Verbinski,"['ocean', 'drug abuse', 'exotic island', 'east..."
2,206647,Spectre,"[Action, Adventure, Crime]","['Daniel Craig', 'Christoph Waltz', 'Léa Seydo...",6.3,Sam Mendes,"['spy', 'based on novel', 'secret agent', 'seq..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","['Christian Bale', 'Michael Caine', 'Gary Oldm...",7.6,Christopher Nolan,"['dc comics', 'crime fighter', 'terrorist', 's..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","['Taylor Kitsch', 'Lynn Collins', 'Samantha Mo...",6.1,Andrew Stanton,"['based on novel', 'mars', 'medallion', 'space..."


#### Lista única de gêneros dos filmes

In [9]:
genreList = []
for index, row in movies.iterrows():
    genres = row["genres"]
    
    for genre in genres:
        if genre not in genreList:
            genreList.append(genre)
genreList[:10] 

['Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'Crime',
 'Drama',
 'Thriller',
 'Animation',
 'Family',
 'Western']

#### Adicionando coluna de gêneros como array binário

In [10]:
def binary(genre_list):
    binaryList = []
    
    for genre in genreList:
        if genre in genre_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [11]:
movies['genres_bin'] = movies['genres'].apply(lambda x: binary(x))

## **Elenco**
 

#### Tranformando o elenco em uma string

In [12]:
movies['cast'] = movies['cast'].str.strip('[]').str.replace(' ','').str.replace("'",'').str.replace('"','')
movies['cast'] = movies['cast'].str.split(',')
for i,j in zip(movies['cast'],movies.index):
    list2 = []
    list2 = i[:4]
    movies.loc[j,'cast'] = str(list2)
movies['cast'] = movies['cast'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['cast'] = movies['cast'].str.split(',')
for i,j in zip(movies['cast'],movies.index):
    list2 = []
    list2 = i
    list2.sort()
    movies.loc[j,'cast'] = str(list2)
movies['cast']=movies['cast'].str.strip('[]').str.replace(' ','').str.replace("'",'')

In [13]:
castList = []
for index, row in movies.iterrows():
    cast = row["cast"]
    
    for i in cast:
        if i not in castList:
            castList.append(i)

In [14]:
movies.head()

Unnamed: 0,id,original_title,genres,cast,vote_average,director,keywords,genres_bin
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","SamWorthington,SigourneyWeaver,StephenLang,Zoe...",7.2,James Cameron,"['culture clash', 'future', 'space war', 'spac...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,285,Pirates of the Caribbean: At World's End,"[Action, Adventure, Fantasy]","JohnnyDepp,KeiraKnightley,OrlandoBloom,Stellan...",6.9,Gore Verbinski,"['ocean', 'drug abuse', 'exotic island', 'east...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,206647,Spectre,"[Action, Adventure, Crime]","ChristophWaltz,DanielCraig,LéaSeydoux,RalphFie...",6.3,Sam Mendes,"['spy', 'based on novel', 'secret agent', 'seq...","[1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","AnneHathaway,ChristianBale,GaryOldman,MichaelC...",7.6,Christopher Nolan,"['dc comics', 'crime fighter', 'terrorist', 's...","[1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","LynnCollins,SamanthaMorton,TaylorKitsch,Willem...",6.1,Andrew Stanton,"['based on novel', 'mars', 'medallion', 'space...","[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#### Adicionando coluna de elenco como array binário

In [15]:
def binary(cast_list):
    binaryList = []
    
    for genre in castList:
        if genre in cast_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [16]:
movies['cast_bin'] = movies['cast'].apply(lambda x: binary(x))

## **Diretor**

#### Removendo entradas nulas

In [17]:
def nullToEmptyString(input):
    if input is None:
        return ''
    return str(input)
movies['director'] = movies['director'].apply(nullToEmptyString)
movies = movies[movies['director']!='']

In [18]:
directorList=[]
for i in movies['director']:
    if i not in directorList:
        directorList.append(i)

#### Adicionando coluna de diretor como array binário

In [19]:
def binary(director_list):
    binaryList = []  
    for direct in directorList:
        if direct in director_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

In [20]:
movies['director_bin'] = movies['director'].apply(lambda x: binary(x))

## **Palavras chave**

#### Limpando e ordenando as palavras chave

In [21]:
movies['keywords'] = movies['keywords'].str.strip('[]').str.replace(' ','').str.replace("'",'').str.replace('"','')
movies['keywords'] = movies['keywords'].str.split(',')
for i,j in zip(movies['keywords'],movies.index):
    list2 = []
    list2 = i
    movies.loc[j,'keywords'] = str(list2)
movies['keywords'] = movies['keywords'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['keywords'] = movies['keywords'].str.split(',')
for i,j in zip(movies['keywords'],movies.index):
    list2 = []
    list2 = i
    list2.sort()
    movies.loc[j,'keywords'] = str(list2)
movies['keywords'] = movies['keywords'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['keywords'] = movies['keywords'].str.split(',')

In [22]:
words_list = []
for index, row in movies.iterrows():
    keywords = row["keywords"]
    
    for keyword in keywords:
        if keyword not in words_list:
            words_list.append(keyword)

#### Adicionando coluna de palavras chave como array binário

In [23]:
def binary(words):
    binaryList = []
    for genre in words_list:
        if genre in words:
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

In [24]:
movies['words_bin'] = movies['keywords'].apply(lambda x: binary(x))
movies = movies[(movies['vote_average']!=0)]

In [25]:
movies.head()

Unnamed: 0,id,original_title,genres,cast,vote_average,director,keywords,genres_bin,cast_bin,director_bin,words_bin
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","SamWorthington,SigourneyWeaver,StephenLang,Zoe...",7.2,James Cameron,"[3d, alien, alienplanet, antiwar, battle, cgi,...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,285,Pirates of the Caribbean: At World's End,"[Action, Adventure, Fantasy]","JohnnyDepp,KeiraKnightley,OrlandoBloom,Stellan...",6.9,Gore Verbinski,"[aftercreditsstinger, afterlife, alliance, cal...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,206647,Spectre,"[Action, Adventure, Crime]","ChristophWaltz,DanielCraig,LéaSeydoux,RalphFie...",6.3,Sam Mendes,"[basedonnovel, britishsecretservice, mi6, secr...","[1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","AnneHathaway,ChristianBale,GaryOldman,MichaelC...",7.6,Christopher Nolan,"[batman, burglar, catburglar, catwoman, cover-...","[1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","LynnCollins,SamanthaMorton,TaylorKitsch,Willem...",6.1,Andrew Stanton,"[19thcentury, 3d, alien, alienrace, basedonnov...","[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## **Similariades entre filmes**

In [26]:
from math import sqrt

def dist(x, y):
    a = np.array(x)
    b = np.array(y)
    return sqrt((a-b) @ (a-b))

In [27]:
def Similarity(movieId1, movieId2):
    movie1 = movies.iloc[movieId1]
    movie2 = movies.iloc[movieId2]
    
    genresA, scoreA, directA, wordsA = movie1['genres_bin'], movie1['cast_bin'], movie1['director_bin'], movie1['words_bin']
    genresB, scoreB, directB, wordsB = movie2['genres_bin'], movie2['cast_bin'], movie2['director_bin'], movie2['words_bin']
    
    genreDistance = dist(genresA, genresB)
    scoreDistance = dist(scoreA, scoreB)
    directDistance = dist(directA, directB)
    wordsDistance = dist(wordsA, wordsB)
    
    return genreDistance + directDistance + scoreDistance + wordsDistance

In [28]:
id = list(range(0,movies.shape[0]))
movies['id']=id
movies=movies[['id', 'original_title','genres','vote_average','genres_bin','cast_bin','director','director_bin','words_bin']]

## **Indicação de filmes**

In [29]:
import operator
def getNeighbors(selected_movie, number_of_neighbors):
        distances = []
    
        for index, movie in movies.iterrows():
            if movie['id'] != selected_movie['id'].values[0]:
                dist = Similarity(selected_movie['id'].values[0], movie['id'])
                distances.append((movie['id'], dist))
    
        distances.sort(key=operator.itemgetter(1))
        neighbors = []
    
        for x in range(number_of_neighbors):
            neighbors.append(distances[x])
        return neighbors

In [30]:
def film_recommendation(name, number_of_recommendations):
    selected_movie = movies[movies['original_title'].str.contains(name)].iloc[0].to_frame().T
    print('Filme escolhido: %s | Pontuação: %.2f' %(selected_movie['original_title'].values[0],selected_movie['vote_average']))

    neighbors = getNeighbors(selected_movie, number_of_recommendations)
    
    print('\nFilmes recomendados: \n')
    for neighbor in neighbors:
        print('%s | Gêneros: %s | Pontuação: %.2f' %(movies.iloc[neighbor[0]][1],str(movies.iloc[neighbor[0]][2]).strip('[]').replace(' ','').replace("'","").replace(",", ", "), movies.iloc[neighbor[0]][3]))
    

## **Input**

Use a função "filme_recommendation(param A, param B)" tal que *param A* é o nome do filme e *param B* é a quantidade de recomendações

In [31]:
film_recommendation('Despicable Me 2', 10)

Filme escolhido: Despicable Me 2 | Pontuação: 7.00

Filmes recomendados: 

Cloudy with a Chance of Meatballs 2 | Gêneros: Animation, Comedy, Family | Pontuação: 6.40
Madagascar 3: Europe's Most Wanted | Gêneros: Animation, Family | Pontuação: 6.40
Cloudy with a Chance of Meatballs | Gêneros: Animation, Comedy, Family | Pontuação: 6.50
The Nut Job | Gêneros: Adventure, Animation, Comedy, Family | Pontuação: 5.50
Barnyard | Gêneros: Animation, Comedy, Family | Pontuação: 5.30
Despicable Me | Gêneros: Animation, Family | Pontuação: 7.10
Free Birds | Gêneros: Animation, Comedy, Family | Pontuação: 5.70
Cats & Dogs 2 : The Revenge of Kitty Galore | Gêneros: Comedy, Family | Pontuação: 4.90
Penguins of Madagascar | Gêneros: Adventure, Animation, Comedy, Family | Pontuação: 6.50
The Lion of Judah | Gêneros: Adventure, Animation, Comedy, Family | Pontuação: 5.80
