In [130]:
import pandas as pd

In [131]:
# Load the movie dataset from the CSV that sits next to the notebook, then
# show the available column names so later cells can pick the ones they need.
df = pd.read_csv(filepath_or_buffer="movie_dataset.csv")
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [132]:
# Columns used to describe each movie. Kept as a list so the order is
# deterministic and the same object can be reused for selection below.
features = ['original_title','director','genres','cast']

# Select with a list, not a set: pandas deprecated set indexers in 1.5 and
# raises TypeError for them in 2.x, and a set gives no guaranteed column order.
req = df[features]

req

Unnamed: 0,original_title,director,genres,cast
0,Avatar,James Cameron,Action Adventure Fantasy Science Fiction,Sam Worthington Zoe Saldana Sigourney Weaver S...
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Adventure Fantasy Action,Johnny Depp Orlando Bloom Keira Knightley Stel...
2,Spectre,Sam Mendes,Action Adventure Crime,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...
3,The Dark Knight Rises,Christopher Nolan,Action Crime Drama Thriller,Christian Bale Michael Caine Gary Oldman Anne ...
4,John Carter,Andrew Stanton,Action Adventure Science Fiction,Taylor Kitsch Lynn Collins Samantha Morton Wil...
...,...,...,...,...
4798,El Mariachi,Robert Rodriguez,Action Crime Thriller,Carlos Gallardo Jaime de Hoyos Peter Marquardt...
4799,Newlyweds,Edward Burns,Comedy Romance,Edward Burns Kerry Bish\u00e9 Marsha Dietlein ...
4800,"Signed, Sealed, Delivered",Scott Smith,Comedy Drama Romance TV Movie,Eric Mabius Kristin Booth Crystal Lowe Geoff G...
4801,Shanghai Calling,Daniel Hsia,,Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...


In [133]:
# Convert the selected features into a single string
def combine_features(row):
    """Join a row's cast and director text into one space-separated string.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding
            string values under the 'cast' and 'director' keys.

    Returns:
        The cast text, a single space, then the director text.

    Raises:
        TypeError: If either value is not a string (e.g. an unfilled NaN).
    """
    parts = (row['cast'], row['director'])
    return ' '.join(parts)

In [134]:
# Preprocessing: replace missing values in every feature column with an empty
# string so the string concatenation in combine_features does not hit NaN.
df[features] = df[features].fillna('')

# Build the per-movie text that the vectorizer will consume, then display it.
df['combine_features'] = df.apply(combine_features, axis=1)
df['combine_features']

0       Sam Worthington Zoe Saldana Sigourney Weaver S...
1       Johnny Depp Orlando Bloom Keira Knightley Stel...
2       Daniel Craig Christoph Waltz L\u00e9a Seydoux ...
3       Christian Bale Michael Caine Gary Oldman Anne ...
4       Taylor Kitsch Lynn Collins Samantha Morton Wil...
                              ...                        
4798    Carlos Gallardo Jaime de Hoyos Peter Marquardt...
4799    Edward Burns Kerry Bish\u00e9 Marsha Dietlein ...
4800    Eric Mabius Kristin Booth Crystal Lowe Geoff G...
4801    Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...
4802    Drew Barrymore Brian Herzlinger Corey Feldman ...
Name: combine_features, Length: 4803, dtype: object

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn each movie's combined text into a token-count vector.
cv = CountVectorizer()
matrix = cv.fit_transform(df['combine_features'])

# Square matrix of pairwise cosine similarities: row i holds the similarity
# of movie i to every movie, in the same row order as df.
cosine_sim = cosine_similarity(matrix)


In [136]:
def get_title_from_index(index):
    """Look up a movie's original title by its DataFrame index label.

    Args:
        index: Label to compare against ``df.index``.

    Returns:
        The 'original_title' value of the first row whose index label
        equals ``index``.

    Raises:
        IndexError: If no row carries that index label.
    """
    row_matches = df.index == index
    titles = df.loc[row_matches, 'original_title']
    return titles.values[0]
def get_index_from_title(title):
    """Return the DataFrame index label of the movie with the given title.

    The 'title' column is searched first; if nothing matches there, the
    'original_title' column is tried, so a name printed by
    get_title_from_index (which reads 'original_title') can be passed back in.

    The returned value is the label from ``df.index`` rather than the CSV's
    'index' column, so it round-trips with get_title_from_index.
    NOTE(review): indexing cosine_sim with this value assumes df still has
    its default 0..n-1 index, so label equals row position — confirm if df
    is ever filtered or re-indexed.

    Args:
        title: Exact movie title to look for.

    Returns:
        The index label of the first matching row.

    Raises:
        IndexError: If no movie has that title in either column.
    """
    for column in ('title', 'original_title'):
        matches = df.index[df[column] == title]
        if len(matches) > 0:
            return matches[0]
    # Same exception type the original raised via ``.values[0]``, with a
    # message that names the missing title.
    raise IndexError("no movie titled {!r} found in the dataset".format(title))

In [139]:
# Movie to base the recommendations on, and its row in the similarity matrix.
movie_user_likes = "The Dark Knight Rises"
movie_index = get_index_from_title(movie_user_likes)

# (row index, similarity score) pairs for every movie, including the query.
similar_movies = list(enumerate(cosine_sim[movie_index]))

# Remove the query movie by its index instead of dropping the first sorted
# entry: when another movie ties with it (identical cast/director text, or an
# all-zero vector), the stable sort may not place the query movie first and
# slicing with [1:] would discard a real recommendation.
sorted_similar_movies = sorted(
    (pair for pair in similar_movies if pair[0] != movie_index),
    key=lambda pair: pair[1],
    reverse=True,
)

movie_index

3

In [140]:
# Number of recommendations to show. The heading is built from the same value
# so it cannot disagree with how many titles are printed (the original heading
# said "Top 5" while the loop printed 10).
top_n = 10
print("Top {} similar movies to {} are:\n".format(top_n, movie_user_likes))
# Slicing replaces the manual counter and also copes with fewer than top_n
# candidates.
for movie_idx, _score in sorted_similar_movies[:top_n]:
    print(get_title_from_index(movie_idx))

Top 5 similar movies to The Dark Knight Rises are:

Batman Begins
The Dark Knight
Interstellar
The Prestige
Child 44
True Romance
Inception
The Other Side of Heaven
Public Enemies
Atlantis: The Lost Empire
