Importing the dependencies

In [235]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [236]:
# loading the data from the csv file to apandas dataframe
movies_data = pd.read_csv('movies.csv')

In [237]:
# printing the first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [238]:
# number of rows and columns in the data frame

movies_data.shape

(4803, 24)

In [239]:
# selecting the relevant features for recommendation

selected_features = ['genres','keywords','tagline','cast','director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [240]:
# replacing the null valuess with null string

for feature in selected_features:
  movies_data[feature] = movies_data[feature].fillna('')

In [241]:
# combining all the 5 selected features

combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['tagline']+' '+movies_data['cast']+' '+movies_data['director']

In [242]:
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [243]:
combined_features

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [244]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [245]:
feature_vectors = vectorizer.fit_transform(combined_features).toarray()

In [246]:
print(feature_vectors)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Cosine Similarity

In [247]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [248]:
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [249]:
print(similarity.shape)

(4803, 4803)


Getting the movie name from the user

In [250]:
# getting the movie name from the user

movie_name = input(' Enter your favourite movie name : ')

In [251]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [252]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['The Other End of the Line', 'The Other Side of Heaven']


In [255]:
close_match = find_close_match[0]
print(close_match)

The Other End of the Line


In [256]:
# finding the index of the movie with title

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
print(index_of_the_movie)

4030


In [257]:
similarity[3]

array([0.0125202 , 0.0207415 , 0.05179498, ..., 0.00363016, 0.        ,
       0.        ])

In [258]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.012936640250764396), (1, 0.0), (2, 0.0), (3, 0.012193512105915382), (4, 0.0), (5, 0.013401346472746047), (6, 0.0), (7, 0.0), (8, 0.011279366280457294), (9, 0.03251923185162693), (10, 0.014550742023765986), (11, 0.0), (12, 0.0), (13, 0.011925865164599543), (14, 0.010849335714901119), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.018815284519452677), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.020743314099906394), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.013038186784595637), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.011499311435026936), (36, 0.01036179726885876), (37, 0.013602373026621461), (38, 0.0), (39, 0.0), (40, 0.07410562746270741), (41, 0.0), (42, 0.01967789752700877), (43, 0.0), (44, 0.013944459353308633), (45, 0.011735484816057762), (46, 0.022932947225138528), (47, 0.0), (48, 0.0), (49, 0.0064148457042510435), (50, 0.0074109911819240076), (51, 0.0), (52, 0.00986511754662881), (53, 0.0), (54, 0.0), (55, 0.009229069219131402), (56, 

In [259]:
len(similarity_score)

4803

In [260]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar_movies)

[(4030, 1.0), (4020, 0.14292224371441173), (4342, 0.13635140211626023), (4242, 0.13190856555430755), (2949, 0.10592715672950483), (4637, 0.09715215937941038), (3132, 0.09653610910985908), (2144, 0.09171959199953222), (3217, 0.09076296137447729), (2851, 0.09044069656135095), (2024, 0.08951468864591951), (1669, 0.08939324829387915), (3520, 0.08912908974287549), (814, 0.08896797017960598), (4239, 0.08841728988065367), (413, 0.08713700058943938), (1225, 0.08621839617671065), (3369, 0.08572203665430006), (2025, 0.0830142990821725), (2621, 0.08221714167406369), (2666, 0.08164547096394667), (3362, 0.08064517231774945), (815, 0.07963603067585795), (1067, 0.07951215181769493), (4645, 0.07939503110243817), (4279, 0.07860790189114676), (3389, 0.07857621536072798), (3377, 0.07809369827656455), (3032, 0.07604531659947969), (4599, 0.07520209460163484), (1806, 0.07412701227803123), (40, 0.07410562746270741), (3791, 0.07299790367627028), (3510, 0.07034475818423402), (1115, 0.0695935056268122), (412, 0

In [261]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . The Other End of the Line
2 . Saving Face
3 . God's Not Dead 2
4 . Road Hard
5 . The Guru
6 . American Desi
7 . It's a Wonderful Afterlife
8 . A Very Harold & Kumar Christmas
9 . The Namesake
10 . Max Keeble's Big Move
11 . Gandhi
12 . The Promise
13 . Bend It Like Beckham
14 . The Nutty Professor
15 . Stolen Summer
16 . Nutty Professor II: The Klumps
17 . Mickey Blue Eyes
18 . Bride & Prejudice
19 . The Hundred-Foot Journey
20 . Whatever Works
21 . Slumdog Millionaire
22 . A Serious Man
23 . Hitch
24 . Kicking & Screaming
25 . Sugar Town
26 . The 41–Year–Old Virgin Who Knocked Up Sarah Marshall and Felt Superbad About It
27 . Chairman of the Board
28 . A Dog Of Flanders
29 . The Long Riders


Movie Recommendation Sytem

In [262]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Honey
2 . 8 Mile
3 . London
4 . B-Girl
5 . Footloose
6 . The Skeleton Key
7 . Once Upon a Time in the West
8 . Glitter
9 . Beauty Shop
10 . Hustle & Flow
11 . Rize
12 . Tupac: Resurrection
13 . You Got Served
14 . Dance Flick
15 . Becoming Jane
16 . Soul Food
17 . The Perfect Match
18 . The Good Girl
19 . O
20 . Flashdance
21 . Four Christmases
22 . Slow Burn
23 . The Nutcracker
24 . Straight Outta Compton
25 . Antwone Fisher
26 . In Her Line of Fire
27 . Hairspray
28 . That Thing You Do!
29 . Addicted
