Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies_data = pd.read_csv('movies.csv')

Data collection and preprocessing

In [3]:
# find the structure of data
movies_data.shape

(4803, 24)

In [4]:
# printing first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [5]:
# replace the null values with null string
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']
for feature in selected_features:
    movies_data[feature] = movies_data[feature].fillna('')
    



In [6]:
# combining the selected feaures
combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['tagline']+' '+movies_data['cast']+' '+movies_data['director']


In [7]:
# Convert the combined_features Series to strings and replace NaN with empty strings
combined_features = combined_features.astype(str).replace('nan', '')
print(combined_features)


0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [8]:
# converting text data into feature vector
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 124266 stored elements and shape (4803, 17318)>
  Coords	Values
  (0, 201)	0.07860022416510505
  (0, 274)	0.09021200873707368
  (0, 5274)	0.11108562744414445
  (0, 13599)	0.1036413987316636
  (0, 5437)	0.1036413987316636
  (0, 3678)	0.21392179219912877
  (0, 3065)	0.22208377802661425
  (0, 5836)	0.1646750903586285
  (0, 14378)	0.33962752210959823
  (0, 16587)	0.12549432354918996
  (0, 3225)	0.24960162956997736
  (0, 14271)	0.21392179219912877
  (0, 4945)	0.24025852494110758
  (0, 15261)	0.07095833561276566
  (0, 16998)	0.1282126322850579
  (0, 11192)	0.09049319826481456
  (0, 11503)	0.27211310056983656
  (0, 13349)	0.15021264094167086
  (0, 17007)	0.23643326319898797
  (0, 17290)	0.20197912553916567
  (0, 13319)	0.2177470539412484
  (0, 14064)	0.20596090415084142
  (0, 16668)	0.19843263965100372
  (0, 14608)	0.15150672398763912
  (0, 8756)	0.22709015857011816
  :	:
  (4801, 403)	0.17727585190343229
  (4801, 4835)	0.247137650

In [9]:
# getting the similarity scores using consine similarity
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [10]:
print(similarity.shape)

(4803, 4803)


In [11]:
# getting the movie name from user 
movie_name = input('Enter your favourite movie name: ')

Enter your favourite movie name:  terminator


In [12]:
# creating a list that contains all the movies name present in dataset
list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [13]:
# finding the close match of the movie name given by the user
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
close_match = find_close_match[0]  # Assign the value here
print(close_match)

The Terminator


In [14]:
# finding the index of the movie based on title
index_of_the_movie = movies_data[movies_data['title'] == close_match].index[0]
print(index_of_the_movie)

3439


In [15]:
# list of similar movies based of index value
similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, np.float64(0.12480340331169379)), (1, np.float64(0.03458421489863167)), (2, np.float64(0.0066396941531620055)), (3, np.float64(0.03251013237509891)), (4, np.float64(0.06836145403893923)), (5, np.float64(0.02604835979288664)), (6, np.float64(0.0)), (7, np.float64(0.02932851757880184)), (8, np.float64(0.011654119000802855)), (9, np.float64(0.006399755354855166)), (10, np.float64(0.11413977237022747)), (11, np.float64(0.011143131400476743)), (12, np.float64(0.0175371951316192)), (13, np.float64(0.02318044886614199)), (14, np.float64(0.09874753799596403)), (15, np.float64(0.0)), (16, np.float64(0.027809669026105526)), (17, np.float64(0.0063387447199471795)), (18, np.float64(0.08221837559639683)), (19, np.float64(0.014801088234824386)), (20, np.float64(0.011729762911338384)), (21, np.float64(0.005446368118487067)), (22, np.float64(0.004776871009758755)), (23, np.float64(0.009146954603137549)), (24, np.float64(0.0323920692024984)), (25, np.float64(0.05110596284613303)), (26, np.float64(

In [16]:
#sorting movies based on similarity score
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar_movies)

[(3439, np.float64(1.0)), (279, np.float64(0.38860434541925903)), (108, np.float64(0.3634233834168681)), (2403, np.float64(0.2984370656233166)), (93, np.float64(0.29565948810730547)), (43, np.float64(0.23256770382028566)), (1725, np.float64(0.19418186280151034)), (286, np.float64(0.18450601523001736)), (2433, np.float64(0.18028960709200473)), (634, np.float64(0.17890200928001915)), (125, np.float64(0.17294328544120308)), (2444, np.float64(0.17123139598779907)), (3014, np.float64(0.16308572661113557)), (4401, np.float64(0.16249149799075474)), (322, np.float64(0.16101208066006156)), (123, np.float64(0.15829193667829297)), (510, np.float64(0.15750439394443252)), (95, np.float64(0.15542093215931804)), (282, np.float64(0.15421114508572226)), (2047, np.float64(0.1538340157808858)), (1959, np.float64(0.15167377243231847)), (224, np.float64(0.14927762749024487)), (422, np.float64(0.14884972659453485)), (4266, np.float64(0.1487413724104435)), (1608, np.float64(0.14586438323608736)), (2654, np.f

In [17]:
# print the name of similar movies based on the index
print("Movies that suggested for you : \n")
i = 1 
for movies in sorted_similar_movies:
  index = movies[0]
  title_from_index = movies_data[movies_data.index == index]['title'].values[0]
  if (i<30):
    print(i, '.', title_from_index)
    i+=1


Movies that suggested for you : 

1 . The Terminator
2 . Terminator 2: Judgment Day
3 . Terminator Genisys
4 . Aliens
5 . Terminator 3: Rise of the Machines
6 . Terminator Salvation
7 . Blade Runner
8 . Eraser
9 . Superman IV: The Quest for Peace
10 . The Matrix
11 . The Matrix Reloaded
12 . Damnation Alley
13 . The Dead Zone
14 . The Helix... Loaded
15 . The Fifth Element
16 . The Matrix Revolutions
17 . Children of Men
18 . Interstellar
19 . True Lies
20 . Megiddo: The Omega Code 2
21 . Star Trek IV: The Voyage Home
22 . RoboCop
23 . The 6th Day
24 . House at the End of the Drive
25 . The Running Man
26 . Automata
27 . Total Recall
28 . Shadow Conspiracy
29 . Roadside


Movies Recommendation System

In [18]:
movie_name = input('Enter your favourite movie name: ')
list_of_all_titles = movies_data['title'].tolist()
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
close_match = find_close_match[0]
index_of_the_movie = movies_data[movies_data['title'] == close_match].index[0]
similarity_score = list(enumerate(similarity[index_of_the_movie]))
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print("Movies that suggested for you : \n")
i = 1 
for movies in sorted_similar_movies:
  index = movies[0]
  title_from_index = movies_data[movies_data.index == index]['title'].values[0]
  if (i<30):
    print(i, '.', title_from_index)
    i+=1



Enter your favourite movie name:  toy story


Movies that suggested for you : 

1 . Toy Story
2 . Toy Story 2
3 . Toy Story 3
4 . Cars 2
5 . Cloud Atlas
6 . Home
7 . Larry Crowne
8 . Quest for Camelot
9 . Big
10 . That Thing You Do!
11 . Jingle All the Way
12 . A Guy Thing
13 . Vamps
14 . Admission
15 . Running Forever
16 . Dirty Work
17 . Cars
18 . Splash
19 . Saving Mr. Banks
20 . Valiant
21 . Small Soldiers
22 . My Favorite Martian
23 . Hachi: A Dog's Tale
24 . Pokémon: Spell of the Unknown
25 . Casino
26 . Sleeper
27 . Joe Somebody
28 . Monster House
29 . Wendy and Lucy
