In [3]:
import numpy as np
import pandas as pd
import difflib #this used for correcting the movie name
from sklearn.feature_extraction.text import TfidfVectorizer # it is used to convert the textual data to numerical data
from sklearn.metrics.pairwise import cosine_similarity # give some similarity score for all the different ways compared to other movies

In [4]:
movies_data = pd.read_csv('movies.csv', on_bad_lines='skip')

In [5]:
# printing the first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [6]:
# number of rows and columns in the data frame

movies_data.shape

(4803, 24)

In [7]:
# selecting the relevant features for recommendation

selected_features = ['genres','keywords','tagline','cast','director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [8]:
# replacing the null valuess with null string

selected_features = ['genres','keywords','tagline','cast','director'] #remove 'keywords'
# Check if the columns exist in the dataframe before processing them
for feature in selected_features:
  if feature in movies_data.columns:  # Check if the column exists
    movies_data[feature] = movies_data[feature].fillna('')  # Fill NaN values only if the column exists
  else:
    print(f"Warning: Column '{feature}' not found in the dataframe.") # Print a warning if the column is not found

In [9]:
# combining all the 5 selected features

combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['tagline']+' '+movies_data['cast']+' '+movies_data['director']

In [10]:
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [11]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [12]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [13]:
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 124266 stored elements and shape (4803, 17318)>
  Coords	Values
  (0, 201)	0.07860022416510505
  (0, 274)	0.09021200873707368
  (0, 5274)	0.11108562744414445
  (0, 13599)	0.1036413987316636
  (0, 5437)	0.1036413987316636
  (0, 3678)	0.21392179219912877
  (0, 3065)	0.22208377802661425
  (0, 5836)	0.1646750903586285
  (0, 14378)	0.33962752210959823
  (0, 16587)	0.12549432354918996
  (0, 3225)	0.24960162956997736
  (0, 14271)	0.21392179219912877
  (0, 4945)	0.24025852494110758
  (0, 15261)	0.07095833561276566
  (0, 16998)	0.1282126322850579
  (0, 11192)	0.09049319826481456
  (0, 11503)	0.27211310056983656
  (0, 13349)	0.15021264094167086
  (0, 17007)	0.23643326319898797
  (0, 17290)	0.20197912553916567
  (0, 13319)	0.2177470539412484
  (0, 14064)	0.20596090415084142
  (0, 16668)	0.19843263965100372
  (0, 14608)	0.15150672398763912
  (0, 8756)	0.22709015857011816
  :	:
  (4801, 403)	0.17727585190343229
  (4801, 4835)	0.247137650

In [14]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [15]:
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [16]:
print(similarity.shape)

(4803, 4803)


In [17]:
# getting the movie name from the user

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : heart of stone


In [18]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [19]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['The Heart of Me', 'Hands of Stone', 'Sea of Love']


In [20]:
close_match = find_close_match[0]
print(close_match)

The Heart of Me


In [21]:
# finding the index of the movie with title

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
print(index_of_the_movie)

3393


In [22]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, np.float64(0.0)), (1, np.float64(0.0)), (2, np.float64(0.0)), (3, np.float64(0.02949357333798932)), (4, np.float64(0.0)), (5, np.float64(0.0)), (6, np.float64(0.0)), (7, np.float64(0.0)), (8, np.float64(0.023012160804872454)), (9, np.float64(0.0)), (10, np.float64(0.0)), (11, np.float64(0.0)), (12, np.float64(0.0)), (13, np.float64(0.15573600241677274)), (14, np.float64(0.0)), (15, np.float64(0.0)), (16, np.float64(0.0)), (17, np.float64(0.0)), (18, np.float64(0.0)), (19, np.float64(0.0)), (20, np.float64(0.0)), (21, np.float64(0.0)), (22, np.float64(0.0)), (23, np.float64(0.0)), (24, np.float64(0.0039004269548147956)), (25, np.float64(0.01405541743209826)), (26, np.float64(0.0)), (27, np.float64(0.0)), (28, np.float64(0.0)), (29, np.float64(0.0)), (30, np.float64(0.0)), (31, np.float64(0.0)), (32, np.float64(0.15629546849831544)), (33, np.float64(0.0)), (34, np.float64(0.0)), (35, np.float64(0.0)), (36, np.float64(0.0)), (37, np.float64(0.03212875393364804)), (38, np.float64(0.0)

In [23]:
len(similarity_score)

4803

In [24]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar_movies)

[(3393, np.float64(1.0)), (3894, np.float64(0.199536423938765)), (1509, np.float64(0.1870168375893473)), (2517, np.float64(0.18410623145318913)), (326, np.float64(0.1833145424537966)), (1594, np.float64(0.18190584384943403)), (3270, np.float64(0.1796035014757541)), (3076, np.float64(0.17849447664089513)), (105, np.float64(0.17060242368851783)), (133, np.float64(0.16641853183578173)), (1587, np.float64(0.16524514467635487)), (583, np.float64(0.16450341233546448)), (4567, np.float64(0.15819070739025196)), (32, np.float64(0.15629546849831544)), (13, np.float64(0.15573600241677274)), (662, np.float64(0.15510505687415801)), (278, np.float64(0.148553177611705)), (2379, np.float64(0.12147084700460992)), (783, np.float64(0.12127427374843983)), (201, np.float64(0.11175778375950006)), (3500, np.float64(0.1114264696814149)), (3862, np.float64(0.10965012812595984)), (1444, np.float64(0.10384768590940183)), (444, np.float64(0.09837255374141743)), (1143, np.float64(0.0980363349307175)), (493, np.flo

In [25]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . The Heart of Me
2 . A Room with a View
3 . The Young and Prodigious T.S. Spivet
4 . The King's Speech
5 . Cinderella
6 . Corpse Bride
7 . Howards End
8 . The House of Mirth
9 . Alice Through the Looking Glass
10 . Dark Shadows
11 . The Curse of the Were-Rabbit
12 . Big Fish
13 . Conversations with Other Women
14 . Alice in Wonderland
15 . The Lone Ranger
16 . Fight Club
17 . Planet of the Apes
18 . Restoration
19 . Mortdecai
20 . The Da Vinci Code
21 . Lucky Break
22 . Margin Call
23 . The Young Victoria
24 . Road to Perdition
25 . A Knight's Tale
26 . A Beautiful Mind
27 . The Sound of Music
28 . The Tourist
29 . Tremors


In [26]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite movie name : babylon
Movies suggested for you : 

1 . Babylon A.D.
2 . Silent Trigger
3 . Restless
4 . The Helix... Loaded
5 . Friday the 13th Part VI: Jason Lives
6 . Pitch Black
7 . Furious 7
8 . Austin Powers: The Spy Who Shagged Me
9 . Melancholia
10 . xXx
11 . The Fast and the Furious
12 . Guardians of the Galaxy
13 . Riddick
14 . Birthday Girl
15 . The Iron Giant
16 . Sunshine
17 . The Chronicles of Riddick
18 . The Book of Eli
19 . Dolphins and Whales: Tribes of the Ocean
20 . The Pacifier
21 . Mad Max
22 . Eraserhead
23 . Fast Five
24 . Sheena
25 . The Last Witch Hunter
26 . The Fifth Element
27 . Quantum of Solace
28 . Amélie
29 . Find Me Guilty
