Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is read from
# a path under /content/drive in a later cell.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the movies dataset from the CSV stored on the mounted Google Drive.
# The path is under /content/drive, so the Drive mount must have succeeded first.
movies_data = pd.read_csv('/content/drive/MyDrive/Movies/Movies.csv')

In [None]:
# Preview the first row to inspect the columns available in the dataset.
movies_data.head(1)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron


In [None]:
movies_data.shape  # (rows, columns); this dataset has 4803 movies

(4803, 24)

In [None]:
# Columns whose text is used to describe each movie for the recommender.

selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [None]:
# Replace missing values in the selected columns with an empty string so the
# columns can be concatenated as text afterwards.

movies_data[selected_features] = movies_data[selected_features].fillna('')

In [None]:
# Combine the selected text columns into one space-separated string per movie.
# The columns are taken from `selected_features` instead of being listed by
# hand a second time, so this step always covers exactly the columns whose
# nulls were replaced above and the two lists cannot drift apart.

combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
  # Left-to-right concatenation: "<so far> <next column>"
  combined_features = combined_features + ' ' + movies_data[feature]

In [None]:
# Show the combined text (pandas abbreviates long Series output).
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [None]:
# Create a TF-IDF vectorizer with its default settings; it is used in the
# next cell to convert the combined text into numeric feature vectors.

vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the combined text and transform it in one step,
# giving one feature vector per entry of combined_features.
feature_vectors = vectorizer.fit_transform(combined_features)

In [None]:
# Inspect the TF-IDF output returned by fit_transform.
print(feature_vectors)

Cosine Similarity

In [None]:
# Compute the pairwise cosine similarity between all movie feature vectors.
# With a single argument, every row is compared against every other row,
# so similarity[i][j] is the score between movie i and movie j.

similarity = cosine_similarity(feature_vectors)

In [None]:
# Show the similarity matrix (NumPy abbreviates large arrays when printing).
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [None]:
# Sanity check: the matrix should be square, one row and one column per movie.
print(similarity.shape)

(4803, 4803)


Getting the movie name from the user

In [None]:
# Ask the user for a movie title. The text is kept exactly as typed; it is
# matched approximately against the dataset titles in a later cell.

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : Top Gun


In [None]:
# Collect every movie title in the dataset; the user's input is matched
# against this list.

list_of_all_titles = movies_data['title'].tolist()

# Print the count and a short preview instead of dumping every title into
# the notebook output.
print(len(list_of_all_titles), 'titles, e.g.', list_of_all_titles[:10])

In [None]:
# Find the dataset titles that most resemble what the user typed.
# n=3 and cutoff=0.6 are difflib's defaults, written out for readability.

find_close_match = difflib.get_close_matches(
    movie_name,
    list_of_all_titles,
    n=3,
    cutoff=0.6,
)
print(find_close_match)

['Top Gun', 'Top Spin', '2 Guns']


In [None]:
# difflib.get_close_matches returns its candidates ordered most-similar
# first, so the best match is the FIRST element. Taking element [1] would
# select the runner-up and fail with IndexError when only one title matches.
if not find_close_match:
  raise ValueError(f'No title in the dataset is close to {movie_name!r}')

close_match = find_close_match[0]
print(close_match)

Top Spin


In [None]:
# Look up the value of the 'index' column for the first row whose title
# equals the matched title.

index_of_the_movie = movies_data.loc[movies_data['title'] == close_match, 'index'].values[0]
print(index_of_the_movie)

4712


In [None]:
# Pair each movie's row position with its similarity to the chosen movie:
# a list of (position, score) tuples, one per row of the similarity matrix.

similarity_score = list(enumerate(similarity[index_of_the_movie]))

# Preview only the first few pairs; the full list has one entry per movie.
print(similarity_score[:10])

In [None]:
# Number of (position, score) pairs; one per row of the similarity matrix.
len(similarity_score)

In [None]:
# Order the (position, score) pairs from most to least similar.
# sorted() is stable, so movies with equal scores keep their original order.

sorted_similar_movies = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)

# Preview only the best matches instead of printing the whole list.
print(sorted_similar_movies[:10])

[(4712, 1.0000000000000002), (4322, 0.4552712253937112), (4698, 0.34085976070910157), (4616, 0.30117059978119487), (4689, 0.2923790169533819), (4517, 0.26717289623772944), (4710, 0.2643178373124384), (4491, 0.20276745703626145), (4685, 0.19974149489021933), (4520, 0.19373210043850292), (4755, 0.1847087349880994), (4597, 0.17095516537526567), (4431, 0.16912526222204072), (4737, 0.16233191522629664), (4387, 0.1580380565139152), (4606, 0.14175929698202092), (4534, 0.14174718750931545), (4561, 0.140570272819846), (3855, 0.13635149507483674), (4468, 0.13498780295923313), (4731, 0.13448690546770456), (4593, 0.13338338704678396), (3768, 0.1310252372017128), (2837, 0.13029640139626333), (4581, 0.128025824681614), (4757, 0.12080315461235486), (4617, 0.12020947488116619), (2846, 0.11887395481476959), (4306, 0.117684101662692), (4352, 0.11204292540965351), (3938, 0.1114290425950583), (4679, 0.11005481778247683), (4666, 0.10665695568734421), (1748, 0.10595466228714347), (4009, 0.09894739935846707)

In [None]:
# Print the titles of the most similar movies, best match first.

print('Movies suggested for you : \n')

# The original loop printed while i < 30, i.e. ranks 1..29; keep that count.
top_n = 29

# Only the leading entries are ever printed, so slice them off up front
# instead of walking the full list and building a boolean mask per movie.
for i, (index, _score) in enumerate(sorted_similar_movies[:top_n], start=1):
  # `index` is a row position in the similarity matrix, which was computed
  # row-for-row from movies_data, so a positional lookup finds the title.
  title_from_index = movies_data['title'].iloc[index]
  print(i, '.',title_from_index)

Movie Recommendation System

In [None]:
# End-to-end recommendation: read a title from the user, resolve it to the
# closest title in the dataset, then list the movies most similar to it.

movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

# Candidates come back ordered most-similar first; the list is empty when
# nothing in the dataset is close enough to the input.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  # Report the miss clearly instead of failing with IndexError on [0].
  print('No movie title close to', repr(movie_name), 'was found in the dataset.')
else:
  close_match = find_close_match[0]

  # Value of the 'index' column for the matched title; it selects the row
  # of the similarity matrix below.
  index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

  # (row position, score) pairs, ordered from most to least similar.
  similarity_score = list(enumerate(similarity[index_of_the_movie]))

  sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

  print('Movies suggested for you : \n')

  # The original loop printed while i < 30, i.e. ranks 1..29; keep that count.
  top_n = 29

  # Slice the top entries up front rather than scanning every movie and
  # building a boolean mask for each one.
  for i, (index, _score) in enumerate(sorted_similar_movies[:top_n], start=1):
    # `index` is a row position in the similarity matrix, so look the
    # title up positionally.
    title_from_index = movies_data['title'].iloc[index]
    print(i, '.',title_from_index)

 Enter your favourite movie name : The Revenant
Movies suggested for you : 

1 . The Revenant
2 . Birdman
3 . Babel
4 . 21 Grams
5 . Amores perros
6 . Biutiful
7 . About Time
8 . Inception
9 . Brooklyn
10 . Body of Lies
11 . Shutter Island
12 . He Got Game
13 . Big Fish
14 . Charlie and the Chocolate Factory
15 . Never Let Me Go
16 . Quinceañera
17 . True Romance
18 . Gangs of New York
19 . Child 44
20 . The Great Gatsby
21 . The Lego Movie
22 . Snitch
23 . Mr. Peabody & Sherman
24 . Red River
25 . The Ring Two
26 . Django Unchained
27 . Ex Machina
28 . The Man in the Iron Mask
29 . Escape from Alcatraz
