# Movie Recommendation

## Call Data

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
movies = pd.read_csv('C:/Users/maste/Documents/대학교/대학교 3학년 2학기/기계학습/Final Project/tmdb_5000_movies.csv')
credits = pd.read_csv('C:/Users/maste/Documents/대학교/대학교 3학년 2학기/기계학습/Final Project/tmdb_5000_credits.csv')

## Data Cleaning

In [3]:
movies['genres'] = movies['genres'].apply(json.loads)
for index, row in movies.iterrows():
    genre_list = [genre['name'] for genre in row['genres']]
    movies.at[index, 'genres'] = str(genre_list)

In [4]:
movies['keywords'] = movies['keywords'].apply(json.loads)
for index, row in movies.iterrows():
    keywords_list = [keywords['name'] for keywords in row['keywords']]
    movies.at[index, 'keywords'] = str(keywords_list)

In [5]:
credits['cast'] = credits['cast'].apply(json.loads)
for index, row in credits.iterrows():
    cast_list = [cast['name'] for cast in row['cast']]
    credits.at[index, 'cast'] = str(cast_list)

In [6]:
credits['crew'] = credits['crew'].apply(json.loads)
def get_directors(x):
    directors = [crew['name'] for crew in x if crew['job'] == 'Director']
    return directors
credits['crew'] = credits['crew'].apply(get_directors)
credits.rename(columns={'crew':'director'},inplace=True)


In [7]:
movies = movies.merge(credits,left_on='id',right_on='movie_id',how='left')
movies = movies[['id','original_title','genres','cast','vote_average','director','keywords']]

In [8]:
new_id = list(range(0,movies.shape[0]))
movies['new_id'] = new_id
movies = movies[['new_id','original_title','genres','cast','vote_average','director','keywords']]

### Genre Column

In [9]:
movies['genres'] = movies['genres'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['genres'] = movies['genres'].str.split(',')

In [10]:
for i,j in zip(movies['genres'],movies.index):
    list2=[]
    list2=i
    list2.sort()
    movies.loc[j,'genres'] = str(list2)
movies['genres'] = movies['genres'].str.strip('[]').str.replace(' ','').str.replace("'",'')
movies['genres'] = movies['genres'].str.split(',')

In [11]:
genreList = []
for index, row in movies.iterrows():
    genres = row["genres"]
    
    for genre in genres:
        if genre not in genreList:
            genreList.append(genre)

In [12]:
def binary(genre_list):
    binaryList = []
    
    for genre in genreList:
        if genre in genre_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [13]:
movies['binary_genres'] = movies['genres'].apply(lambda x: binary(x))

### Director Column

In [14]:
def xstr(s):
    if s is None:
        return ''
    return str(s)
movies['director'] = movies['director'].apply(xstr)

In [15]:
directorList=[]
for i in movies['director']:
    if i not in directorList:
        directorList.append(i)

In [16]:
def binary(director_list):
    binaryList = []  
    for direct in directorList:
        if direct in director_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

In [17]:
movies['binary_director'] = movies['director'].apply(lambda x: binary(x))

### Cast Column

In [18]:
movies['cast'] = movies['cast'].str.strip('[]').str.replace(' ','').str.replace("'",'').str.replace('"','')
movies['cast'] = movies['cast'].str.split(',')

In [19]:
movies['cast'] = movies['cast'].apply(lambda x: sorted(x[:4]))
movies['cast'] = movies['cast'].apply(lambda x: ','.join(x))

In [20]:
castList = []
for index, row in movies.iterrows():
    cast = row["cast"]
    
    for i in cast:
        if i not in castList:
            castList.append(i)

In [21]:
def binary(cast_list):
    binaryList = []
    
    for genre in castList:
        if genre in cast_list:
            binaryList.append(1)
        else:
            binaryList.append(0)
    
    return binaryList

In [22]:
movies['binary_cast'] = movies['cast'].apply(lambda x: binary(x))

### Keywords Column

In [23]:
movies['keywords'] = (
    movies['keywords']
    .str.strip('[]')
    .str.replace(' ', '')
    .str.replace("'", '')
    .str.replace('"', '')
)
movies['keywords'] = movies['keywords'].str.split(',')

In [24]:
words_list = []
for index, row in movies.iterrows():
    genres = row["keywords"]
    
    for genre in genres:
        if genre not in words_list:
            words_list.append(genre)

In [25]:
def binary(words):
    binaryList = []
    for genre in words_list:
        if genre in words:
            binaryList.append(1)
        else:
            binaryList.append(0)
    return binaryList

In [26]:
movies['binary_words'] = movies['keywords'].apply(lambda x: binary(x))

In [27]:
movies = movies[(movies['vote_average']!=0)]
movies = movies[movies['director']!='']

In [28]:
movies['new_id']= range(movies.shape[0])

## Similarity Between Movies

### Cosine Similarity

In [29]:
from scipy import spatial

def Similarity(movieId1, movieId2):
    a = movies.iloc[movieId1]
    b = movies.iloc[movieId2]
    
    genresA = a['binary_genres']
    genresB = b['binary_genres']
    
    genreDistance = spatial.distance.cosine(genresA, genresB)
    
    castA = a['binary_cast']
    castB = b['binary_cast']
    castDistance = spatial.distance.cosine(castA, castB)
    
    directA = a['binary_director']
    directB = b['binary_director']
    directDistance = spatial.distance.cosine(directA, directB)
    
    wordsA = a['binary_words']
    wordsB = b['binary_words']
    wordsDistance = spatial.distance.cosine(directA, directB)
    
    return genreDistance + directDistance + castDistance + wordsDistance

### Recommend Movie

In [30]:
import operator

def recommend_movie(name):
    new_movie = movies[movies['original_title'].str.contains(name)].iloc[0].to_frame().T
    
    print('Selected Movie: ', new_movie.original_title.values[0])
    
    def getNeighbors(baseMovie, K):
        distances = []

        for index, movie in movies.iterrows():
            if movie['new_id'] != baseMovie['new_id'].values[0].astype(int):
                dist = Similarity(baseMovie['new_id'].values[0], movie['new_id'])
                distances.append((movie['new_id'], dist))

        distances.sort(key=operator.itemgetter(1))
        neighbors = []

        for x in range(min(K, len(distances))):
            neighbors.append(distances[x])

        return neighbors
    
    K = 10
    neighbors = getNeighbors(new_movie, K)
    
    print('\nRecommended Movies: \n')
    for neighbor in neighbors:
        print(
        str(movies.iloc[neighbor[0]][1]) +
        " | Genres: " +
        str(movies.iloc[neighbor[0]][2]).strip('[]').replace(' ', '') +
        " | Rating: " +
        str(movies.iloc[neighbor[0]][4]))

In [31]:
recommend_movie('The Dark Knight')

Selected Movie:  The Dark Knight Rises


  dist = 1.0 - uv / np.sqrt(uu * vv)



Recommended Movies: 

The Dark Knight | Genres: 'Action','Crime','Drama','Thriller' | Rating: 8.2
Batman Begins | Genres: 'Action','Crime','Drama' | Rating: 7.5
The Prestige | Genres: 'Drama','Mystery','Thriller' | Rating: 8.0
Insomnia | Genres: 'Crime','Mystery','Thriller' | Rating: 6.8
Inception | Genres: 'Action','Adventure','Mystery','ScienceFiction','Thriller' | Rating: 8.1
Memento | Genres: 'Mystery','Thriller' | Rating: 8.1
Interstellar | Genres: 'Adventure','Drama','ScienceFiction' | Rating: 8.1
Takers | Genres: 'Action','Crime','Drama','Thriller' | Rating: 6.0
Mercury Rising | Genres: 'Action','Crime','Drama','Thriller' | Rating: 6.0
Harry Brown | Genres: 'Action','Crime','Drama','Thriller' | Rating: 6.7
