Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [4]:
# loading the data from the csv file to apandas dataframe
movies_data = pd.read_csv(r'F:\miniproject\Top Indian Movies.csv')

In [5]:
# printing the first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,Title,Year of Release,Rating,Number of Reviews,Popularity Index,Movie Cast,Director,Description
0,Anbe Sivam,2003,8.7,21K,11629,"['Kamal Haasan', 'Madhavan', 'Kiran Rathod', '...",Sundar C.,"Two men, one young and arrogant, the other dam..."
1,Golmaal,1979,8.5,19K,39634,"['Amol Palekar', 'Bindiya Goswami', 'Deven Ver...",Hrishikesh Mukherjee,A man's simple lie to secure his job escalates...
2,Jai Bhim,2021,8.9,193K,898,"['Suriya', 'Lijo Mol Jose', 'Manikandan K.', '...",T.J. Gnanavel,When a tribal man is arrested for a case of al...
3,Nayakan,1987,8.6,20K,17539,"['Kamal Haasan', 'Saranya Ponvannan', 'Delhi G...",Mani Ratnam,A common man's struggles against a corrupt pol...
4,Pariyerum Perumal,2018,8.7,16K,17434,"['Kathir', 'Anandhi', 'Yogi Babu', 'Lijeesh', ...",Mari Selvaraj,A law student from a lower caste begins a frie...


In [6]:
# number of rows and columns in the data frame

movies_data.shape

(250, 8)

In [39]:
# selecting the relevant features for recommendation

selected_features = ['Title','Movie Cast','Director','Description']
print(selected_features)

['Title', 'Movie Cast', 'Director', 'Description']


In [40]:
# replacing the null valuess with null string

for feature in selected_features:
  movies_data[feature] = movies_data[feature].fillna('')

In [41]:
# combining all the 5 selected features

combined_features = movies_data['Title']+'  '+movies_data['Movie Cast']+' '+movies_data['Director']+' '+movies_data['Description']

In [42]:
print(combined_features)

0      Anbe Sivam  ['Kamal Haasan', 'Madhavan', 'Kira...
1      Golmaal  ['Amol Palekar', 'Bindiya Goswami', '...
2      Jai Bhim  ['Suriya', 'Lijo Mol Jose', 'Manikan...
3      Nayakan  ['Kamal Haasan', 'Saranya Ponvannan',...
4      Pariyerum Perumal  ['Kathir', 'Anandhi', 'Yogi...
                             ...                        
245    24  ['Suriya', 'Samantha Ruth Prabhu', 'Nithya...
246    Kapoor and Sons  ['Rishi Kapoor', 'Rajat Kapoo...
247    Velaiilla Pattadhari  ['Dhanush', 'Saranya Pon...
248    Colour Photo  ['Suhas', 'Chandini Chowdary', '...
249    Rockstar  ['Ranbir Kapoor', 'Nargis Fakhri', '...
Length: 250, dtype: object


In [24]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [25]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [11]:
print(feature_vectors)

  (0, 201)	0.07860022416510505
  (0, 274)	0.09021200873707368
  (0, 5274)	0.11108562744414445
  (0, 13599)	0.1036413987316636
  (0, 5437)	0.1036413987316636
  (0, 3678)	0.21392179219912877
  (0, 3065)	0.22208377802661425
  (0, 5836)	0.1646750903586285
  (0, 14378)	0.33962752210959823
  (0, 16587)	0.12549432354918996
  (0, 3225)	0.24960162956997736
  (0, 14271)	0.21392179219912877
  (0, 4945)	0.24025852494110758
  (0, 15261)	0.07095833561276566
  (0, 16998)	0.1282126322850579
  (0, 11192)	0.09049319826481456
  (0, 11503)	0.27211310056983656
  (0, 13349)	0.15021264094167086
  (0, 17007)	0.23643326319898797
  (0, 17290)	0.20197912553916567
  (0, 13319)	0.2177470539412484
  (0, 14064)	0.20596090415084142
  (0, 16668)	0.19843263965100372
  (0, 14608)	0.15150672398763912
  (0, 8756)	0.22709015857011816
  :	:
  (4801, 403)	0.17727585190343229
  (4801, 4835)	0.24713765026964
  (4801, 17266)	0.28860981849329476
  (4801, 13835)	0.27870029291200094
  (4801, 13175)	0.28860981849329476
  (4801, 171

Cosine Similarity

In [43]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [27]:
print(similarity)

[[1.         0.         0.01927236 ... 0.02128514 0.03082508 0.00206493]
 [0.         1.         0.02314387 ... 0.04687896 0.         0.00920606]
 [0.01927236 0.02314387 1.         ... 0.0300872  0.02011913 0.01125063]
 ...
 [0.02128514 0.04687896 0.0300872  ... 1.         0.         0.        ]
 [0.03082508 0.         0.02011913 ... 0.         1.         0.02136623]
 [0.00206493 0.00920606 0.01125063 ... 0.         0.02136623 1.        ]]


In [28]:
print(similarity.shape)

(250, 250)


Getting the movie name from the user

In [29]:
# getting the movie name from the user

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : 3 Idiots


In [31]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['Title'].tolist()
print(list_of_all_titles)

['Anbe Sivam', 'Golmaal', 'Jai Bhim', 'Nayakan', 'Pariyerum Perumal', 'Rocketry: The Nambi Effect', '3 Idiots', 'Apur Sansar', 'Manichitrathazhu', 'Kumbalangi Nights', 'Black Friday', 'Like Stars on Earth', 'C/o Kancharapalem', '#Home', 'Soorarai Pottru', 'Dangal', 'Kireedam', 'Kaithi', 'Jersey', 'Thevar Magan', 'Asuran', '96', 'Visaaranai', 'Pather Panchali', 'Thalapathi', 'Natsamrat', 'Sarpatta Parambarai', 'Drishyam 2', 'Thani Oruvan', 'Sardar Udham', 'Aparajito', 'Vada Chennai', 'Jaane Bhi Do Yaaro', 'Khosla Ka Ghosla!', 'Vikram', 'Drishyam', 'Chupke Chupke', 'Peranbu', 'Agent Sai Srinivasa Athreya', 'Anniyan', 'Mahanati', 'Bangalore Days', 'Satya', 'Super Deluxe', 'Premam', 'Ratsasan', 'Gangs of Wasseypur', 'Devasuram', 'Bhaag Milkha Bhaag', 'Andhadhun', 'Drishyam', 'Aruvi', 'Kannathil Muthamittal', 'Guide', 'Chithram', 'Shahid', 'Iruvar', 'Vikram Vedha', 'Sairat', 'Zindage na milegi dobara', 'Paan Singh Tomar', 'Tumbbad', 'Mudhalvan', 'Chhichhore', '777 Charlie', 'Dhuruvangal Pat

In [44]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['3 Idiots']


In [33]:
close_match = find_close_match[0]
print(close_match)

3 Idiots


In [36]:
# finding the index of the movie with title

index_of_the_movie = movies_data[movies_data.Title == close_match]['Rating'].values[0]
print(index_of_the_movie)

8.4


In [46]:
print(type(index_of_the_movie))
print(index_of_the_movie)


<class 'numpy.float64'>
8.4


In [47]:
print(len(similarity))
print(index_of_the_movie)


250
8.4


In [48]:
index_of_the_movie = int(index_of_the_movie)


In [49]:
if isinstance(index_of_the_movie, int) and 0 <= index_of_the_movie < len(similarity):
    similarity_score = list(enumerate(similarity[index_of_the_movie]))
    print(similarity_score)
else:
    print("Invalid index_of_the_movie:", index_of_the_movie)


[(0, 0.0042504420700198295), (1, 0.007700230289385753), (2, 0.02077104003314457), (3, 0.02907289739465101), (4, 0.004739255408475557), (5, 0.020084728887122063), (6, 0.012168955173591022), (7, 0.005820625667660371), (8, 0.9999999999999997), (9, 0.007960454215015391), (10, 0.008713426392057542), (11, 0.051091149190830076), (12, 0.011788531281424148), (13, 0.017437715491617947), (14, 0.017886325245575665), (15, 0.007596639068439659), (16, 0.05627305803275403), (17, 0.006284124130861831), (18, 0.004874703432205887), (19, 0.007664019445457961), (20, 0.02310479631450565), (21, 0.0191568712281221), (22, 0.006497687985416422), (23, 0.0057808473467190185), (24, 0.04962929089486259), (25, 0.00444604083492725), (26, 0.02253204287891527), (27, 0.04827579393236098), (28, 0.03251532015504278), (29, 0.00779343974209098), (30, 0.002462273274141402), (31, 0.01003816320366419), (32, 0.0016250744403867176), (33, 0.00368059774608561), (34, 0.05562070368296792), (35, 0.02257299973306777), (36, 0.0), (37, 

In [50]:
len(similarity_score)

250

In [51]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

[(8, 0.9999999999999997), (54, 0.10622176128958066), (66, 0.09998273604640008), (47, 0.0917864713926435), (129, 0.08919438137037351), (115, 0.06280158047728548), (193, 0.06111354702577266), (177, 0.060463231006139984), (41, 0.05910180043114077), (122, 0.05717658575791902), (16, 0.05627305803275403), (34, 0.05562070368296792), (11, 0.051091149190830076), (24, 0.04962929089486259), (230, 0.048481882156928585), (27, 0.04827579393236098), (208, 0.04633729957448158), (124, 0.04467203197352431), (105, 0.04453228132141177), (56, 0.04431532357003235), (172, 0.042699520247100026), (126, 0.04158246550086091), (38, 0.04080070306799415), (238, 0.04060138115389711), (176, 0.04045130027260478), (65, 0.037366545044074995), (42, 0.036848798194324545), (210, 0.03661715338540502), (204, 0.0365012049476346), (168, 0.036148582240519096), (40, 0.03460801476655388), (61, 0.034256638248738355), (236, 0.03416050225173834), (39, 0.033661641957962556), (196, 0.032692415256425626), (28, 0.03251532015504278), (16

In [53]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['Title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Manichitrathazhu
2 . Chithram
3 . Spadikam
4 . Devasuram
5 . Mughal-E-Azam
6 . Alai Payuthey
7 . Memories
8 . Charlie
9 . Bangalore Days
10 . Oru Vadakkan Veeragatha
11 . Kireedam
12 . Vikram
13 . Like Stars on Earth
14 . Thalapathi
15 . Minnal Murali
16 . Drishyam 2
17 . Unnaipol Oruvan
18 . Indian
19 . Iqbal
20 . Iruvar
21 . Deiva Thirumagal
22 . The Great Indian Kitchen
23 . Agent Sai Srinivasa Athreya
24 . Joji
25 . Vedam
26 . Dhuruvangal Pathinaaru
27 . Satya
28 . Pokiri
29 . Mumbai Police


In [57]:
if not find_close_match:
    print("No close match found!")
    exit()


In [58]:
print("Close Match:", close_match)
print("Index of Movie:", index_of_the_movie)


Close Match: 3 Idiots
Index of Movie: 8.4


In [59]:
if index_of_the_movie < 0 or index_of_the_movie >= len(similarity):
    print("Invalid index for similarity:", index_of_the_movie)
    exit()


In [60]:
if close_match not in movies_data.Title.values:
    print("Movie not found in dataset!")
    exit()


In [61]:
movie_name = input("Enter your favourite movie name: ")

list_of_all_titles = movies_data['Title'].tolist()
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
    print("No close matches found! Please try again.")
    exit()

close_match = find_close_match[0]
print("Close Match Found:", close_match)

if close_match not in movies_data['Title'].values:
    print("Movie not found in dataset!")
    exit()

index_of_the_movie = movies_data[movies_data.Title == close_match].index[0]

# Ensure index_of_the_movie is valid for the similarity array
if index_of_the_movie < 0 or index_of_the_movie >= len(similarity):
    print("Invalid index for similarity:", index_of_the_movie)
    exit()

similarity_score = list(enumerate(similarity[index_of_the_movie]))
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

print("Movies suggested for you:\n")
i = 1
for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movies_data.iloc[index]['Title']
    if i < 30:
        print(i, ".", title_from_index)
        i += 1


Enter your favourite movie name: 3 Idiots
Close Match Found: 3 Idiots
Movies suggested for you:

1 . 3 Idiots
2 . Color It Yellow
3 . PK
4 . Dil Chahta Hai
5 . Carry On, Munna Bhai
6 . Kapoor and Sons
7 . Mimi
8 . Aadukalam
9 . Ugly
10 . Jo Jeeta Wohi Sikandar
11 . The Legend of Bhagat Singh
12 . RRR
13 . Baby
14 . Andaz Apna Apna
15 . Munna Bhai M.B.B.S.
16 . Sarfarosh
17 . Nayattu
18 . Lagaan: Once Upon A Time in India
19 . Iruvar
20 . #Home
21 . Chhichhore
22 . M.S. Dhoni: The Untold Story
23 . Android Kunjappan Version 5.25
24 . Kai Po Che
25 . Pad Man
26 . Haider
27 . Dor
28 . Deewaar
29 . Hera Pheri


Movie Recommendation Sytem

In [63]:
import pickle
with open(r'C:\mini_proj\Movies_Rec\movies_list.pkl', 'rb') as file:
    movies_list = pickle.load(file)

with open(r'C:\mini_proj\Movies_Rec\similarity.pkl', 'rb') as file:
    similarity = pickle.load(file)


In [65]:
# Example: Recommend top 5 movies similar to a given movie
movie_index = movies_list[movies_list['Title'] == 'Inception'].index[0]
similar_movies = list(enumerate(similarity[movie_index]))
sorted_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:6]

recommended_titles = [movies_list.iloc[i[0]]['Title'] for i in sorted_movies]
print(recommended_titles)


IndexError: index 1 is out of bounds for axis 0 with size 0

In [66]:
print(movies_list['Title'].head())  # Check the first few titles
print("Inception" in movies_list['Title'].values)  # Check if "Inception" exists


0           Anbe Sivam
1              Golmaal
2             Jai Bhim
3              Nayakan
4    Pariyerum Perumal
Name: Title, dtype: object
False


In [69]:
result = movies_list[movies_list['Title'] == 'Vikram']
if result.empty:
    print("The movie 'Inception' is not found in the dataset!")
else:
    movie_index = result.index[0]
    print(f"Movie index for 'Vikram': {movie_index}")


Movie index for 'Vikram': 34


In [70]:
print(movies_list.shape)
print(similarity.shape)


(250, 4)
(250, 250)


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_list['Description'])


In [72]:
movie_index = movies_list[movies_list['Title'] == 'Vikram'].index[0]
similar_movies = list(enumerate(similarity[movie_index]))
sorted_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:6]
recommended_titles = [movies_list.iloc[i[0]]['Title'] for i in sorted_movies]
print(recommended_titles)


['Unnaipol Oruvan', 'A Wednesday', 'Anjaam Pathiraa', 'Black Friday', 'Baishe Srabon']
