In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import difflib

In [5]:
df = pd.read_csv('IMDb_All_Genres_etf_clean1.csv')
df.head(30)

Unnamed: 0,Movie_Title,Year,Director,Actors,Rating,Runtime(Mins),Censor,Total_Gross,main_genre,side_genre
0,Kantara,2022,Rishab Shetty,"Rishab Shetty, Sapthami Gowda, Kishore Kumar G...",9.3,148,UA,Gross Unkown,Action,"Adventure, Drama"
1,The Dark Knight,2008,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",9.0,152,UA,$534.86M,Action,"Crime, Drama"
2,The Lord of the Rings: The Return of the King,2003,Peter Jackson,"Elijah Wood, Viggo Mortensen, Ian McKellen, Or...",9.0,201,U,$377.85M,Action,"Adventure, Drama"
3,Inception,2010,Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...",8.8,148,UA,$292.58M,Action,"Adventure, Sci-Fi"
4,The Lord of the Rings: The Two Towers,2002,Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Or...",8.8,179,UA,$342.55M,Action,"Adventure, Drama"
5,The Lord of the Rings: The Fellowship of the Ring,2001,Peter Jackson,"Elijah Wood, Ian McKellen, Orlando Bloom, Sean...",8.8,178,U,$315.54M,Action,"Adventure, Drama"
6,The Matrix,1999,"Directors:Lana Wachowski, Lilly Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",8.7,136,A,$171.48M,Action,Sci-Fi
7,The Empire Strikes Back,1980,Irvin Kershner,"Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",8.7,124,UA,$290.48M,Action,"Adventure, Fantasy"
8,Terminator 2: Judgment Day,1991,James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Edward ...",8.6,137,A,$204.84M,Action,Sci-Fi
9,Star Wars,1977,George Lucas,"Mark Hamill, Harrison Ford, Carrie Fisher, Ale...",8.6,121,U,$322.74M,Action,"Adventure, Fantasy"


In [6]:
# Create a new index column
df['index_column_name'] = range(1, len(df) + 1)


In [7]:
df.columns


Index(['Movie_Title', 'Year', 'Director', 'Actors', 'Rating', 'Runtime(Mins)',
       'Censor', 'Total_Gross', 'main_genre', 'side_genre',
       'index_column_name'],
      dtype='object')

In [8]:
df.isnull().sum()

Movie_Title          0
Year                 0
Director             0
Actors               0
Rating               0
Runtime(Mins)        0
Censor               0
Total_Gross          0
main_genre           0
side_genre           0
index_column_name    0
dtype: int64

In [9]:
df.shape


(5562, 11)

In [10]:
df.columns



Index(['Movie_Title', 'Year', 'Director', 'Actors', 'Rating', 'Runtime(Mins)',
       'Censor', 'Total_Gross', 'main_genre', 'side_genre',
       'index_column_name'],
      dtype='object')

In [11]:
selected_features = ['Movie_Title','Director','main_genre','side_genre','side_genre','Rating']
print(selected_features)

['Movie_Title', 'Director', 'main_genre', 'side_genre', 'side_genre', 'Rating']


In [12]:
combines_features = df['Movie_Title']+' '+df['Director']+' '+df['main_genre']+' '+df['side_genre']

In [13]:
print(combines_features)

0         Kantara Rishab Shetty Action  Adventure,  Drama
1       The Dark Knight Christopher Nolan Action  Crim...
2       The Lord of the Rings: The Return of the King ...
3       Inception Christopher Nolan Action  Adventure,...
4       The Lord of the Rings: The Two Towers Peter Ja...
                              ...                        
5557    Disaster Movie Directors:Jason Friedberg, Aaro...
5558    The Hottie & the Nottie Tom Putnam Comedy  Rom...
5559    From Justin to Kelly Robert Iscove Comedy  Mus...
5560    Superbabies: Baby Geniuses 2 Bob Clark Comedy ...
5561    Cumali Ceber: Allah Seni Alsin Gökhan Gök Come...
Length: 5562, dtype: object


In [14]:
verctorizer = TfidfVectorizer()

In [15]:
feature_verctors = verctorizer.fit_transform(combines_features)
print(feature_verctors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 42267 stored elements and shape (5562, 8388)>
  Coords	Values
  (0, 4001)	0.5791235150894453
  (0, 6254)	0.5791235150894453
  (0, 6759)	0.5197056868923121
  (0, 139)	0.14397264499955992
  (0, 168)	0.16667424361278993
  (0, 2132)	0.10309937711070488
  (1, 139)	0.15880099013367188
  (1, 2132)	0.1137180133586822
  (1, 7430)	0.1682398746938621
  (1, 1829)	0.47058820462554957
  (1, 4151)	0.4900382114961702
  (1, 1438)	0.4227345581686533
  (1, 5339)	0.5106146165799971
  (1, 1701)	0.18034218347094078
  (2, 139)	0.10744434085872985
  (2, 168)	0.12438615851753172
  (2, 2132)	0.07694131490491939
  (2, 7430)	0.45532266335169214
  (2, 4510)	0.34160698739495626
  (2, 5418)	0.35303464561967923
  (2, 6245)	0.37156446197774345
  (2, 6194)	0.36510241038839036
  (2, 4110)	0.2995512034314213
  (2, 5711)	0.23653264223031176
  (2, 3757)	0.3183987670311394
  :	:
  (5559, 6315)	0.1650661688452045
  (5559, 7523)	0.2947374683163896
  (5559, 2781)	0.

In [16]:
similarlity = cosine_similarity(feature_verctors)

In [17]:
print(similarlity)

[[1.         0.03458725 0.04413362 ... 0.         0.         0.        ]
 [0.03458725 1.         0.10241531 ... 0.         0.         0.        ]
 [0.04413362 0.10241531 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.012337   0.0188797 ]
 [0.         0.         0.         ... 0.012337   1.         0.01771772]
 [0.         0.         0.         ... 0.0188797  0.01771772 1.        ]]


In [18]:
print(similarlity.shape)

(5562, 5562)


In [19]:
movie_mame = input("Enter Movie name :")

Enter Movie name : iron man


In [20]:
list_of_movies = df['Movie_Title'].tolist()
print(list_of_movies)

['Kantara', 'The Dark Knight', 'The Lord of the Rings: The Return of the King', 'Inception', 'The Lord of the Rings: The Two Towers', 'The Lord of the Rings: The Fellowship of the Ring', 'The Matrix', 'The Empire Strikes Back', 'Terminator 2: Judgment Day', 'Star Wars', 'Seppuku', 'Shichinin no samurai', 'Kaithi', 'Asuran', 'Sita Ramam', 'Gladiator', 'Léon', 'Vikram', 'Spider-Man: Into the Spider-Verse', 'Avengers: Endgame', 'Avengers: Infinity War', 'Top Gun: Maverick', 'The Dark Knight Rises', 'K.G.F: Chapter 2', 'Shershaah', 'Oldeuboi', 'Mononoke-hime', 'Aliens', 'Raiders of the Lost Ark', 'Vikram Vedha', 'Dangal', 'Spider-Man: No Way Home', 'Heat', 'Star Wars: Episode VI - Return of the Jedi', 'North by Northwest', 'Major', '1917', 'Uri: The Surgical Strike', 'K.G.F: Chapter 1', 'Dag II', 'Baahubali 2: The Conclusion', 'Gangs of Wasseypur', 'Paan Singh Tomar', 'Warrior', 'Kimetsu no Yaiba: Mugen Ressha-Hen', 'V for Vendetta', 'Batman Begins', 'Kill Bill: Vol. 1', 'Lock, Stock and T

In [24]:
find_close_match = difflib.get_close_matches(movie_mame,list_of_movies)
print(find_close_match)

['Iron Man', 'Iron Man 2', 'Gridiron Gang']


In [25]:
close_match = find_close_match[0]
print(close_match)


Iron Man


In [27]:
index_of_the_movie = df[df.Movie_Title == close_match]['index_column_name'].values[0]
print(index_of_the_movie)

115


In [32]:
similarity_score = list(enumerate(similarlity[index_of_the_movie]))
print(similarity_score)

[(0, 0.05407714467882798), (1, 0.03162742927241597), (2, 0.040356855050878625), (3, 0.04814493409459656), (4, 0.04296036633568548), (5, 0.03823305503878516), (6, 0.013851048756420374), (7, 0.03755837686375934), (8, 0.01788709963837208), (9, 0.04880547405546741), (10, 0.02760203347969809), (11, 0.026202915365325544), (12, 0.044777645475405005), (13, 0.03422025465563879), (14, 0.024108897058978253), (15, 0.06501658144418707), (16, 0.032209614155413895), (17, 0.019643915534071413), (18, 0.024619383796606078), (19, 0.038499089296197594), (20, 0.0303846620464598), (21, 0.025756298745528452), (22, 0.026967023946534958), (23, 0.030314112679462317), (24, 0.03299103902592876), (25, 0.028123461341896396), (26, 0.040985840052431846), (27, 0.0518521502974024), (28, 0.03936311139563162), (29, 0.019803921087334257), (30, 0.02823593347256632), (31, 0.038235372056517475), (32, 0.03735924996450582), (33, 0.03076670312048202), (34, 0.039129220363721254), (35, 0.02457975967241353), (36, 0.031278941624605

In [33]:
len(similarity_score)

5562

In [35]:
sorted_score = sorted(similarity_score,key = lambda x:x[1],reverse = True)
print(sorted_score)

[(115, 1.0), (1263, 0.47784061068139966), (252, 0.38646863640160506), (2947, 0.3202272691635682), (503, 0.2748522884471176), (2925, 0.2519240623474859), (587, 0.18311715834802406), (467, 0.0743703831654378), (1428, 0.06966685193071491), (944, 0.0672050629352596), (1078, 0.06689106545615418), (1454, 0.0667112628516221), (399, 0.06571284751978343), (1238, 0.06559555679202925), (15, 0.06501658144418707), (571, 0.06492193939693809), (610, 0.06390175016364051), (1201, 0.0630876326950135), (1313, 0.0628659182008644), (366, 0.06285129602702415), (603, 0.06240454245309719), (1063, 0.062375499228025766), (1294, 0.06230825991965186), (563, 0.06229717082358433), (1315, 0.06187797860618532), (749, 0.061702504011831555), (797, 0.06136881438208155), (386, 0.060915652047808216), (720, 0.06084025254155035), (436, 0.06029766229790803), (1004, 0.05915412246949574), (1181, 0.05873572942535163), (375, 0.05847910268236658), (143, 0.058393719843608555), (716, 0.05832475668194649), (1246, 0.05824983447423361

Movie Recomnadation System 

In [43]:
import difflib

Movie_name = input("Enter the Movie name: ")
list_of_movies = df['Movie_Title'].tolist()

# Find the closest match to the movie name entered by the user
find_close_match = difflib.get_close_matches(Movie_name, list_of_movies)
close_match = find_close_match[0]

# Get the index of the closest match in the DataFrame
index_of_the_movie = df[df.Movie_Title == close_match].index[0]

# Calculate similarity score for the closest match
similarity_score = list(enumerate(similarlity[index_of_the_movie]))

# Sort the movies by similarity score
sorted_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)

# Display the top 20 similar movies
i = 1
for item in sorted_score:
    index = item[0]  # Use the correct index from sorted_score
    title_from_index = df.iloc[index]['Movie_Title']  # Fetch title using the correct index
    if i < 20:
        print(f"{i}. {title_from_index}")
        i += 1

    
    



Enter the Movie name:  kantara


1. Kantara
2. Dilwale
3. Sooryavanshi
4. Chennai Express
5. The Poseidon Adventure
6. R.I.P.D.
7. Last Action Hero
8. Zathura: A Space Adventure
9. Pompeii
10. Willow
11. S.W.A.T.
12. Gladiator
13. The Edge
14. Hanna
15. The Promise
16. Noah
17. War Horse
18. The Grey
19. The Eagle
