In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movies dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another machine.
MOVIES_CSV = 'E:/Projects/movies.csv'
data = pd.read_csv(MOVIES_CSV)

In [3]:
# Preview the first two rows to sanity-check that the file loaded as expected.
data.head(2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [4]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

(4803, 24)

In [5]:
# List the available column names to choose the features used for similarity.
data.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [6]:
# Text columns that are combined to describe each movie for the similarity model.
features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]

In [8]:
# Count the missing values in each selected text column before cleaning.
selected_columns = ['genres', 'keywords', 'tagline', 'cast', 'director']
data[selected_columns].isna().sum()

genres       28
keywords    412
tagline     844
cast         43
director     30
dtype: int64

In [10]:
# Replace the missing values of the selected feature columns with empty strings
# so that concatenating the columns later never propagates NaN.
#
# The loop variable is deliberately NOT called `features`: reusing that name
# would rebind the list to the last column name ('director'), so re-running
# this cell (or any later use of `features`) would iterate over the characters
# of a string and fail with a KeyError.
for feature in features:
    data[feature] = data[feature].fillna('')

In [11]:
# Re-count the missing values in the selected text columns to confirm that the
# fill step left none behind.
cleaned_columns = ['genres', 'keywords', 'tagline', 'cast', 'director']
data[cleaned_columns].isna().sum()

genres      0
keywords    0
tagline     0
cast        0
director    0
dtype: int64

In [12]:
# Build one space-separated text string per movie from the five selected
# feature columns, in the order: genres, keywords, tagline, cast, director.
combined_features = data['genres']
for column_name in ('keywords', 'tagline', 'cast', 'director'):
    combined_features = combined_features + ' ' + data[column_name]

In [13]:
# Convert the combined text of every movie into numerical TF-IDF vectors.
# `vec` learns the vocabulary and term weights from `combined_features`, and
# `feature_vec` holds the resulting document-term matrix (one row per movie).
vec = TfidfVectorizer()
feature_vec = vec.fit_transform(combined_features)

In [14]:
# Pairwise cosine similarity between all movie vectors: similarity[i][j] is the
# score between the movie in row i and the movie in row j of `feature_vec`.
# NOTE(review): the result has one row and one column per movie, so its memory
# footprint grows quadratically with the number of movies.
similarity = cosine_similarity(feature_vec)

In [None]:
# Movie recommendation system

In [None]:
# Movie Recommendation System
# Ask the user for a movie title, fuzzy-match it against the titles in `data`,
# and print the titles whose feature vectors are most similar to the match.
movie_name = input('Enter your favourite movie name : ')
list_of_movies = data['title'].tolist()

# difflib returns the candidates ordered best match first; the list is empty
# when no title is close enough to what the user typed.
close_match = difflib.get_close_matches(movie_name, list_of_movies)

if not close_match:
    # Without this guard, close_match[0] would raise an IndexError.
    print('No movie similar to "' + movie_name + '" was found. Please check the spelling and try again.')
else:
    close = close_match[0]

    # Row position of the matched title. Rows of `similarity` follow the row
    # order of `data`, so the position in the title list is the correct index
    # (this does not rely on the separate 'index' column of the dataset).
    index_of_the_movie = list_of_movies.index(close)

    # Pair every movie's row position with its similarity to the matched movie
    # and order the pairs from most to least similar.
    similarity_score = list(enumerate(similarity[index_of_the_movie]))
    sorted_similarity_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    print('Movies suggested for you : \n')

    # Only the top entries are printed, so only those are looked up. The
    # matched movie itself normally ranks first, because no other movie can be
    # more similar to it than it is to itself.
    number_of_suggestions = 15
    for i, movie in enumerate(sorted_similarity_movies[:number_of_suggestions], start=1):
        index = movie[0]  # row position of the suggested movie in `data`
        title_from_index = data['title'].iloc[index]  # direct positional lookup
        print(i, '.', title_from_index)