In [1]:
#import the libraries required

import pandas as pd


import difflib
#the user types the movie name and may misspell it,
#so difflib is used to find the closest matching title in the dataset


from sklearn.feature_extraction.text import TfidfVectorizer
#this is used to convert textual data into numerical data (known as feature vectors)


from sklearn.metrics.pairwise import cosine_similarity
#to find the similarities between all the movies

In [2]:
# Load the Telugu movies dataset from the CSV file next to this notebook.
dataset_path = 'TeluguMovies_dataset.csv'
data = pd.read_csv(dataset_path)

In [3]:
# Preview the first five rows to see which columns are available.
data.head(n=5)

Unnamed: 0,index,Movie,Year,Certificate,Genre,Overview,Runtime,Rating,No.of.Ratings
0,0,Bahubali: The Beginning,2015.0,UA,"Action, Drama","In ancient India, an adventurous and darin...",159,8.1,99114
1,1,Baahubali 2: The Conclusion,2017.0,UA,"Action, Drama","When Shiva, the son of Bahubali, learns ab...",167,8.2,71458
2,2,1 - Nenokkadine,2014.0,UA,"Action, Thriller",A rock star must overcome his psychologica...,170,8.1,42372
3,3,Dhoom:3,2013.0,UA,"Action, Thriller","When Sahir, a circus entertainer trained i...",172,5.4,42112
4,4,Ra.One,2011.0,U,"Action, Adventure, Sci-Fi",When the titular antagonist of an action g...,156,4.6,37211


In [4]:
# Size of the loaded dataset as a (rows, columns) tuple.
data.shape  #to know about rows and columns

(1400, 9)

In [5]:
# This recommender is both content based and popularity based.

# Columns that describe each movie and will be used to compare movies.
selected_columns = ['Certificate', 'Genre', 'Overview', 'Rating']


# Go through the chosen columns one by one and replace every missing value
# with an empty string.
for column_name in selected_columns:
    data[column_name] = data[column_name].fillna("")

In [6]:
# Build one text string per movie by joining the selected columns with spaces,
# so that similarity can be computed on a single field.

certificate_and_genre = data['Certificate'] + ' ' + data['Genre']
with_overview = certificate_and_genre + ' ' + data['Overview']
# Rating holds numbers, so it is converted to text explicitly before joining.
combined_columns = with_overview + ' ' + data['Rating'].astype(str)

print(combined_columns)

0       UA Action, Drama                 In ancient In...
1       UA Action, Drama                 When Shiva, t...
2       UA Action, Thriller                 A rock sta...
3       UA Action, Thriller                 When Sahir...
4       U Action, Adventure, Sci-Fi                 Wh...
                              ...                        
1395                                                  8.6
1396     Comedy, Drama                 The movie is ab...
1397     Drama, Romance                 Muvva Gopaludu...
1398    U      Hero Charan (Tarun) a middle class fami...
1399    U Drama                 Surendra marries Savit...
Length: 1400, dtype: object


In [7]:
# Turn each movie's combined text into a numeric TF-IDF feature vector.
# Cosine similarity is computed on numbers, so the raw text cannot be used directly.


vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary from the combined text and returns the
# document-term matrix in one step.
vectorized_columns=vectorizer.fit_transform(combined_columns)
# Printing the result lists its non-zero entries as "(row, column)  weight".
print(vectorized_columns)


  (0, 3437)	0.21282108647960016
  (0, 5300)	0.31313718332904555
  (0, 5024)	0.1746837880230217
  (0, 571)	0.19430924318711293
  (0, 1723)	0.2875874638510906
  (0, 3282)	0.2353055058169436
  (0, 1205)	0.3029493808004103
  (0, 2374)	0.2559412587188551
  (0, 531)	0.21819856793233897
  (0, 2847)	0.14276337291811478
  (0, 1174)	0.3447833884612811
  (0, 277)	0.07960786177632717
  (0, 163)	0.31313718332904555
  (0, 266)	0.11756651256786595
  (0, 2286)	0.20162981887152423
  (0, 276)	0.3029493808004103
  (0, 2273)	0.17471932899702775
  (0, 1422)	0.07643052274953387
  (0, 117)	0.08185598720533066
  (0, 5029)	0.11152910281987476
  (1, 2572)	0.23284782938663662
  (1, 2826)	0.2852029721380556
  (1, 5070)	0.26989007452331404
  (1, 4836)	0.11994702798959948
  (1, 1598)	0.2091076360901349
  :	:
  (1399, 4704)	0.4349514410898509
  (1399, 2886)	0.17285243226648891
  (1399, 3167)	0.20193037319305188
  (1399, 2214)	0.1582581563209773
  (1399, 3319)	0.16653359096482614
  (1399, 3020)	0.1474038928751451
  (

In [8]:
# Compare every movie's feature vector with every other movie's.
# Entry [i][j] of the result is the cosine similarity between row i and row j.

similarity=cosine_similarity(vectorized_columns)
print(similarity)

[[1.         0.03328978 0.02071398 ... 0.02271785 0.01174459 0.04593058]
 [0.03328978 1.         0.06797627 ... 0.03420216 0.08691213 0.06325907]
 [0.02071398 0.06797627 1.         ... 0.         0.04529189 0.01253716]
 ...
 [0.02271785 0.03420216 0.         ... 1.         0.05559826 0.0149011 ]
 [0.01174459 0.08691213 0.04529189 ... 0.05559826 1.         0.0312492 ]
 [0.04593058 0.06325907 0.01253716 ... 0.0149011  0.0312492  1.        ]]


In [9]:
# The similarity matrix has one row and one column per movie.
similarity.shape

(1400, 1400)

In [10]:
# Ask the user for a movie title. The following cells look for that title in
# the dataset and then list the movies most similar to it.


prompt_text = "enter your favourite name...."
movie_name = input(prompt_text)

enter your favourite name....prema kavali


In [11]:
# Collect every title from the Movie column into a plain Python list.
list_of_all_movies = [title for title in data['Movie']]
print(list_of_all_movies)

['Bahubali: The Beginning', 'Baahubali 2: The Conclusion', '1 - Nenokkadine', 'Dhoom:3', 'Ra.One', 'Dhoom:2', 'Eega', 'Krrish 3', 'Arjun Reddy', 'Rangasthalam', 'Magadheera', 'War', 'Bharat Ane Nenu', 'Saaho', 'Theri', 'Dookudu', 'Pokiri', 'Sarkar', 'Athadu', 'The Ghazi Attack', 'Kabali', 'MSG: The Messenger of God', 'Nanban', 'Srimanthudu', 'Veer - Vivegam', 'Billa 2', 'Manam', '7 Aum Arivu', 'Bigil', 'Business Man', 'Geetha Govindam', 'Mahanati', 'Spyder', 'Nannaku Prematho', 'Dabangg 3', 'MSG 2 the Messenger', 'Manikarnika: The Queen of Jhansi', 'Race Gurram', 'Okkadu', 'Bommarillu', 'Atharintiki Daaredi', 'Khaleja', 'Yennai Arindhaal', 'Thalaivaa', 'Kaala', 'Bairavaa', 'Goodachari', 'Puli', 'Pulimurugan', 'Veeram', 'Vedam', 'Yevadu', 'Aravindha Sametha Veera Raghava', 'Billa', 'Jersey', 'Sye Raa Narasimha Reddy', 'Ala Vaikunthapurramuloo', 'Janatha Garage', 'Gabbar Singh', 'Temper', 'Game Over', 'Singam 2', 'Dhruva', 'Jalsa', 'Maharshi', 'Pelli Choopulu', 'Arya 2', 'Chekka Chivanth

In [12]:
# Fuzzy-match the user's text against the titles in the dataset.
# difflib compares characters case-sensitively, so a title typed in a different
# case (e.g. "EEGA" for "Eega") can fall below the cutoff and match nothing.
# Compare lower-cased strings instead, then map the hits back to the real titles.
title_by_lowercase = {}
for title in list_of_all_movies:
    # Skip non-text entries (such as NaN for a missing title); difflib needs strings.
    if isinstance(title, str):
        # setdefault keeps the first spelling seen when two titles differ only by case.
        title_by_lowercase.setdefault(title.lower(), title)

lowercase_matches = difflib.get_close_matches(
    movie_name.strip().lower(), list(title_by_lowercase)
)
# Results stay ordered best match first, expressed as the dataset's own titles.
all_close_matches = [title_by_lowercase[key] for key in lowercase_matches]
print(all_close_matches)

['Prema Kavali', 'Prema Katha']


In [13]:
# get_close_matches returns its results best match first, so the first entry is
# the closest title. It returns an empty list when nothing is similar enough;
# stop with a clear message in that case instead of a bare IndexError.
if not all_close_matches:
    raise ValueError(
        "No movie in the dataset is close to {!r}; please try another title.".format(movie_name)
    )
close_match = all_close_matches[0]
print(close_match)

Prema Kavali


In [14]:
# Look up the dataset's "index" value for the matched title.
# The result here is still a pandas Series (row label next to the value).
title_mask = data['Movie'] == close_match
index_of_the_movie = data.loc[title_mask, 'index']
print(index_of_the_movie)

940    940
Name: index, dtype: int64


In [15]:
# The previous lookup returns a Series, which prints as "<row label>  <value>".
# Take its first element to get the index as a single number.
matching_rows = data[data['Movie'] == close_match]
index_of_the_movie = matching_rows['index'].iloc[0]
print(index_of_the_movie)

940


In [16]:
# Pair every movie's position with its similarity to the chosen movie.
# The chosen movie's row of the similarity matrix holds one score per movie:
# a higher score means more similar, a lower score means less similar.
similarity_row = similarity[index_of_the_movie]
similarity_score = [(position, score) for position, score in enumerate(similarity_row)]
print(similarity_score)

[(0, 0.03430795841893329), (1, 0.09511195857674826), (2, 0.058552408571184565), (3, 0.04469277271095233), (4, 0.03973470673477822), (5, 0.02177817867852071), (6, 0.04558972706389338), (7, 0.06936215215319724), (8, 0.04846713004968673), (9, 0.035349853661421585), (10, 0.061037480786342774), (11, 0.06726641254422122), (12, 0.035392838587153586), (13, 0.03896725278925161), (14, 0.04361181419792991), (15, 0.023264825688027256), (16, 0.029178729641654678), (17, 0.04052864091651917), (18, 0.006293917895576081), (19, 0.00852014269769809), (20, 0.07246132104639665), (21, 0.016799437716901403), (22, 0.03789366935501707), (23, 0.06435711835442062), (24, 0.055321667737826284), (25, 0.011682351134001255), (26, 0.01957769602249658), (27, 0.03111599927305862), (28, 0.03458716891386232), (29, 0.06085541109137174), (30, 0.0772657630264815), (31, 0.02556941208441166), (32, 0.03194208597705869), (33, 0.033652177178800965), (34, 0.09099133998282036), (35, 0.01848855927205291), (36, 0.029195449578769992),

In [17]:
# Sanity check: there should be one (position, score) pair per movie.


number_of_scores = len(similarity_score)
print(number_of_scores)

1400


In [18]:
# Only the most similar movies are of interest, so rank the pairs by score.


# Work on a copy so that similarity_score keeps its original order.
sorted_similarity_score = list(similarity_score)
# key: compare on the score (second element of each pair), not on the position.
# reverse=True: highest score first.
sorted_similarity_score.sort(key=lambda pair: pair[1], reverse=True)


print(sorted_similarity_score)

[(940, 1.0), (855, 0.15562258547514246), (1049, 0.15259288343024854), (630, 0.141358734296048), (402, 0.14085602960208246), (998, 0.13495110941022573), (235, 0.13135633300258107), (1027, 0.1276533492201557), (526, 0.12599822778232986), (525, 0.125336129520792), (1198, 0.1250389456589112), (737, 0.12461681957862204), (1134, 0.12343279748984443), (410, 0.12252421797269249), (178, 0.12186949820665391), (1036, 0.121071996389108), (902, 0.11987019554109385), (1258, 0.11881584210266181), (920, 0.1178365393624421), (791, 0.11766199210704957), (1221, 0.11758754943487246), (488, 0.11587374923576362), (1199, 0.11553957307841861), (1259, 0.11545228313063917), (694, 0.11462930887210428), (499, 0.11446657698366206), (783, 0.11284346485310194), (246, 0.1126732943057932), (585, 0.1125568580650928), (993, 0.11092946078349714), (943, 0.11082055533503406), (1235, 0.11021319852053969), (972, 0.10917573887343085), (304, 0.10884794251992373), (459, 0.10822348233800279), (573, 0.10817089870892876), (429, 0.

In [19]:
# Translate the best-scoring positions back into movie titles.
# Only the first TOP_N entries are shown, so slice the ranked list up front
# instead of looking up a title for every movie and discarding most of them.
TOP_N = 10

print("Movies recommended for you....\n")
# Note: the top entry is the chosen movie itself, since its similarity to itself is 1.0.
for rank, (row_position, _score) in enumerate(sorted_similarity_score[:TOP_N], start=1):
    # Positions in the similarity matrix follow the row order of `data`,
    # so a direct positional lookup replaces a full boolean-mask scan per title.
    suggested_movie = data["Movie"].iloc[row_position]
    print(rank, '.', suggested_movie)

Movies recommended for you....

1 . Prema Kavali
2 . Chiru Navvuto
3 . Idi Ma Prema Katha
4 . Shivam
5 . Krishnam Vande Jagadgurum
6 . Jayammu Nischayammu Raa!
7 . Teen Maar
8 . Soggadu
9 . Gokulamlo Seetha
10 . Rayalaseema Ramanna Chowdary
