In [1]:
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In this project, I have built a movie recommendation system. It takes one movie name as user input and recommends 10 movies that the user might like based on that input.

**Dataset** - The dataset used contains around 5000 movies along with details like cast, crew, genre, budget, etc.

**Modules:**

difflib - to get the closest match of the user input (In case they make spelling mistakes)

TfidfVectorizer - to convert the text data into machine-understandable vectors (numbers)

cosine_similarity - to get similarity score of the movies

**Limitations**

As the dataset contains only 4803 movies, your favourite movie may not be in it, in which case the system cannot recommend movies for you. Also, the limited amount of data may lead to some irrelevant recommendations.


# Data and pre-processing

In [2]:
# Load the movie metadata; the path is relative to the notebook's working directory.
MOVIES_CSV_PATH = "../input/movies-data/movies.csv"
movies_df = pd.read_csv(MOVIES_CSV_PATH)

In [3]:
# Preview the first five rows to get a feel for the available columns.
movies_df.head(n=5)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [5]:
# Count the missing entries in every column
movies_df.isna().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [6]:
# Text columns used to describe each movie when measuring similarity
features = [
    "keywords",
    "tagline",
    "cast",
    "genres",
    "director",
]

In [7]:
# Replace missing values in the selected text columns with an empty string,
# handling all of the columns in a single assignment.
movies_df[features] = movies_df[features].fillna('')

# Building the recommendation system

In [8]:
# Merge the selected text columns into one space-separated string per movie.
# The columns are taken from `features` (rather than being listed a second
# time) so this cell always combines exactly the columns that had their
# missing values filled in above.
combined_features = movies_df[features[0]]
for column_name in features[1:]:
    combined_features = combined_features + " " + movies_df[column_name]
combined_features

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 A P...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     A newlywed couple's honeymoon is upended by t...
4800    date love at first sight narration investigati...
4801     A New Yorker in Shanghai Daniel Henney Eliza ...
4802    obsession camcorder crush dream girl  Drew Bar...
Length: 4803, dtype: object

In [9]:
# Create the TF-IDF vectorizer that will turn the combined text of each movie
# into a numeric vector
vectorizer = TfidfVectorizer()


In [10]:
# Fit the vectorizer on the combined text and transform that same text into
# feature vectors
feature_vectors = vectorizer.fit_transform(combined_features)

In [11]:
# Pairwise cosine similarity between the movie feature vectors; the result is a
# square matrix with one row and one column per movie
similarity = cosine_similarity(feature_vectors)

In [12]:
# Sanity check: the matrix should be square, with one row/column per movie
similarity.shape

(4803, 4803)

In [13]:
# Ask the user for a movie title. Surrounding whitespace is stripped so that a
# stray leading/trailing space does not weaken the fuzzy title match that is
# performed on this value afterwards.
movie_name = input("Enter your favourite movie name:").strip()

Enter your favourite movie name: Pulp Fiction


In [14]:
# Collect every movie title in the dataset into a plain Python list
title_column = movies_df['title']
list_of_all_titles = title_column.to_list()
len(list_of_all_titles)

4803

In [15]:
# Fuzzy-match the user's input against the known titles to tolerate typos.
# n and cutoff are difflib's defaults, written out here for visibility.
find_close_match = difflib.get_close_matches(
    movie_name,
    list_of_all_titles,
    n=3,
    cutoff=0.6,
)
find_close_match

['Pulp Fiction', 'Election']

In [16]:
# Take the best candidate (the candidates are ordered best match first).
# When no title is close enough the candidate list is empty; fail with a
# clear message instead of an opaque IndexError.
if not find_close_match:
    raise ValueError(
        f"No movie title resembling {movie_name!r} was found in the dataset."
    )
close_match = find_close_match[0]
close_match

'Pulp Fiction'

In [17]:
# Locate the row position of the matched title. The similarity matrix is
# addressed by row position, so use a positional lookup rather than an index
# label (the two only coincide while movies_df keeps its default index).
# If several rows share the title, the first one is used.
index_of_movie = int(np.flatnonzero(movies_df["title"].to_numpy() == close_match)[0])
index_of_movie

3232

In [18]:
# Pair every movie's row position with its similarity to the chosen movie
scores_for_selected_movie = similarity[index_of_movie]
similarity_score = [
    (row_position, score)
    for row_position, score in enumerate(scores_for_selected_movie)
]

In [19]:
# Confirm that the scores are held in a plain Python list
type(similarity_score)

list

In [20]:
# Order the (position, score) pairs from most to least similar. Work on a copy
# so that similarity_score itself keeps its original order.
sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

In [21]:
# Print the ten highest-scoring titles, best match first.
# NOTE(review): the first entry is normally the query movie itself, since it is
# the closest match to its own features - skip it if that is not wanted.
print("Recommended Movies: \n")

# Only the first ten entries are ever printed, so slice them off up front
# instead of walking (and looking up) every movie in the dataset.
for rank, (row_position, _score) in enumerate(sorted_similar_movies[:10], start=1):
    # The positions come from enumerate() over a similarity row, so they are
    # row positions: fetch the title by position rather than by boolean mask.
    title_of_movies = movies_df["title"].iloc[row_position]
    print(rank, ".", title_of_movies)

Recommended Movies: 

1 . Pulp Fiction
2 . Kill Bill: Vol. 1
3 . Clerks
4 . Surrogates
5 . Kill Bill: Vol. 2
6 . Grown Ups 2
7 . Django Unchained
8 . Identity
9 . Bringing Out the Dead
10 . The Hateful Eight
