<a href="https://colab.research.google.com/github/SHAKTI-swain/Final-project/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#used libaries in the project
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
#connect the dataset to a pandas dataframe
movies_data = pd.read_csv('movies.csv')
movies_data.head()



Unnamed: 0,index,title,certificate,runtime,genre,desc,rating,votes
0,1,Freddy,UA 16+,124 min,"Drama, Mystery, Thriller",The lines between love and obsession blur in t...,7.9,16441
1,2,An Action Hero,U,130 min,Action,Youth Icon. Superstar. Action Hero. At the age...,8.1,15690
2,3,Kantara,UA,148 min,"Action, Adventure, Drama",It involves culture of Kambala and Bhootha Kol...,8.7,78358
3,4,Khakee: The Bihar Chapter,UA 13+,45 min,"Action, Crime, Drama",As a righteous cop pursues a merciless crimina...,8.3,4464
4,5,Drishyam 2,UA,140 min,"Crime, Drama, Mystery",A gripping tale of an investigation and a fami...,8.6,18743


In [3]:
#no. of rows and columns in the data frame
movies_data.shape

(10000, 8)

In [4]:
#selecting the relevant features for recommendation
selected_features = ['genre','desc','rating','votes']
print(selected_features)

['genre', 'desc', 'rating', 'votes']


In [5]:
#replacing the null values with null string

for feature in selected_features:
     movies_data[feature] = movies_data[feature].fillna('')


In [6]:
# Combine existing features, handling potential missing columns
combined_features = movies_data['genre'].astype(str) + ' ' + movies_data['desc'].astype(str) + ' ' + movies_data['rating'].astype(str) + ' ' + movies_data['votes'].astype(str)
print(combined_features)

0       Drama, Mystery, Thriller             The lines...
1       Action             Youth Icon. Superstar. Acti...
2       Action, Adventure, Drama             It involv...
3       Action, Crime, Drama             As a righteou...
4       Crime, Drama, Mystery             A gripping t...
                              ...                        
9995                 Action, Comedy, Drama               
9996                         Drama, Family               
9997             Action, Adventure, Comedy               
9998                                 Drama               
9999                                 Crime               
Length: 10000, dtype: object


In [7]:
#converting the text data to feature vectors
vectorizer= TfidfVectorizer()

In [8]:

feature_vectors= vectorizer.fit_transform(combined_features)
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 237733 stored elements and shape (10000, 19837)>
  Coords	Values
  (0, 6016)	0.058099007077201927
  (0, 11931)	0.14502822017681893
  (0, 18051)	0.21316170457084885
  (0, 17953)	0.05987953209629127
  (0, 10534)	0.29737142109803744
  (0, 3035)	0.16428272722180703
  (0, 10682)	0.11417557536279699
  (0, 1976)	0.12496979955940776
  (0, 12491)	0.2661221738271596
  (0, 3333)	0.3204429634844519
  (0, 8783)	0.0715429355719832
  (0, 18016)	0.14690747218971667
  (0, 15179)	0.21694498267934328
  (0, 12802)	0.30500961443490276
  (0, 19576)	0.0929310044038085
  (0, 18725)	0.31457547649130546
  (0, 18493)	0.2567258591432835
  (0, 16137)	0.29737142109803744
  (0, 18465)	0.1909820916907644
  (0, 181)	0.21909602514665963
  (0, 605)	0.3094928280704285
  (1, 17953)	0.07677830879682826
  (1, 8783)	0.1375999128736728
  (1, 1448)	0.1140316883409368
  (1, 19766)	0.15369802066008814
  :	:
  (9986, 4452)	0.7095900229895048
  (9988, 6016)	1.0
  (9989,

In [9]:
#getting the similarity scores using cosine similarity
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.01444175 0.04714132 ... 0.         0.05809901 0.        ]
 [0.01444175 1.         0.03495193 ... 0.04966871 0.         0.        ]
 [0.04714132 0.03495193 1.         ... 0.11056719 0.03895791 0.        ]
 ...
 [0.         0.04966871 0.11056719 ... 1.         0.         0.        ]
 [0.05809901 0.         0.03895791 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [10]:
print(similarity.shape)

(10000, 10000)


In [11]:
#getting the movie name from the user
movie_name = input('Enter your favourite movie name: ')

In [12]:
#creating a list with all the movie names given in the dataset
list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)



In [13]:
#finding the close match for the movie name given by user
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Drishyam 2', 'Drishyam 2', 'Drishya 2']


In [14]:
close_match = find_close_match[0]
print(close_match)

Drishyam 2


In [15]:
#finding the index of the movie with title
index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
print(index_of_the_movie)

5


In [16]:
#getting a list of similar moviesw
similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, np.float64(0.030650221134189963)), (1, np.float64(0.048102331252611874)), (2, np.float64(0.02030640415243494)), (3, np.float64(0.00567956663711696)), (4, np.float64(0.02490378370418208)), (5, np.float64(0.9999999999999997)), (6, np.float64(0.0372415898301996)), (7, np.float64(0.03831552302779992)), (8, np.float64(0.01185309929819672)), (9, np.float64(0.04897918908417745)), (10, np.float64(0.053281132114817276)), (11, np.float64(0.02026746765284812)), (12, np.float64(0.026598015882182267)), (13, np.float64(0.02418478133446586)), (14, np.float64(0.05211396151227088)), (15, np.float64(0.029438613642513448)), (16, np.float64(0.02271512663025966)), (17, np.float64(0.003942619677969218)), (18, np.float64(0.029792109509775552)), (19, np.float64(0.0075509941588476626)), (20, np.float64(0.0067554356003536944)), (21, np.float64(0.002543628831587063)), (22, np.float64(0.023974735967929674)), (23, np.float64(0.03431827965784333)), (24, np.float64(0.05220020405549485)), (25, np.float64(0.08100

In [17]:
len(similarity_score)

10000

In [18]:
#sorting the movies based on their similarity score
sorted_similarity_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similarity_movies)

[(5, np.float64(0.9999999999999997)), (6373, np.float64(0.18262152554411565)), (7389, np.float64(0.17763827230372853)), (9060, np.float64(0.17208488590080614)), (9309, np.float64(0.17208488590080614)), (1645, np.float64(0.16972351955245235)), (8961, np.float64(0.16466631226760062)), (9096, np.float64(0.16466631226760062)), (9185, np.float64(0.16466631226760062)), (9243, np.float64(0.16466631226760062)), (9307, np.float64(0.16466631226760062)), (9348, np.float64(0.16466631226760062)), (9363, np.float64(0.16466631226760062)), (9548, np.float64(0.16466631226760062)), (9718, np.float64(0.16466631226760062)), (9865, np.float64(0.16466631226760062)), (9895, np.float64(0.16466631226760062)), (9934, np.float64(0.16466631226760062)), (9949, np.float64(0.16466631226760062)), (9947, np.float64(0.16197609154518047)), (9977, np.float64(0.16197609154518047)), (4801, np.float64(0.15961769988593275)), (5296, np.float64(0.1585824000652253)), (9440, np.float64(0.15706054679454312)), (488, np.float64(0.1

In [22]:
#print the name of similar movies based on the index
print('Movies suggested for you : \n')
i=1

for movie in sorted_similarity_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<11):
    print(i, '.',title_from_index)
    i+=1



Movies suggested for you : 

1 . Qala
2 . Na Umra Ki Seema Ho
3 . Kaaviya Thalaivan
4 . Geet Sangeet
5 . Rudra Veena
6 . Bhabhi Maa
7 . Dilpreet Dhillon Feat. Mehar Vaani: Nain
8 . Guru Randhawa: Lahore
9 . Mankirt Aulakh feat. Dj Flow: Badnam
10 . Jaise Savan
