In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Import Libraries**

In [17]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# **Import Dataset**

In [18]:
df = pd.read_csv('/content/drive/MyDrive/cleaned.csv')

In [19]:
df.head()

Unnamed: 0,Movie_ID,Movie_Title,Movie_Genre,Movie_Language,Movie_Budget,Movie_Popularity,Movie_Release_Date,Movie_Revenue,Movie_Runtime,Movie_Vote,Movie_Vote_Count,Movie_Keywords,Movie_Overview,Movie_Tagline,Movie_Cast,Movie_Director
0,1,Four Rooms,Crime Comedy,en,4000000,22.87623,1995-12-09,4300000,98.0,6.5,530,hotel new year's eve witch bet hotel room,It's Ted the Bellhop's first night on the job....,Twelve outrageous guests. Four scandalous requ...,Tim Roth Antonio Banderas Jennifer Beals Madon...,Allison Anders
1,2,Star Wars,Adventure Action Science Fiction,en,11000000,126.393695,1977-05-25,775398007,121.0,8.1,6624,android galaxy hermit death star lightsaber,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...",Mark Hamill Harrison Ford Carrie Fisher Peter ...,George Lucas
2,3,Finding Nemo,Animation Family,en,94000000,85.688789,2003-05-30,940335536,100.0,7.6,6122,father son relationship harbor underwater fish...,"Nemo, an adventurous young clownfish, is unexp...","There are 3.7 trillion fish in the ocean, they...",Albert Brooks Ellen DeGeneres Alexander Gould ...,Andrew Stanton
3,4,Forrest Gump,Comedy Drama Romance,en,55000000,138.133331,1994-07-06,677945399,142.0,8.2,7927,vietnam veteran hippie mentally disabled runni...,A man with a low IQ has accomplished great thi...,"The world will never be the same, once you've ...",Tom Hanks Robin Wright Gary Sinise Mykelti Wil...,Robert Zemeckis
4,5,American Beauty,Drama,en,15000000,80.878605,1999-09-15,356296601,122.0,7.9,3313,male nudity female nudity adultery midlife cri...,"Lester Burnham, a depressed suburban father in...",Look closer.,Kevin Spacey Annette Bening Thora Birch Wes Be...,Sam Mendes


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3736 entries, 0 to 3735
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Movie_ID            3736 non-null   int64  
 1   Movie_Title         3736 non-null   object 
 2   Movie_Genre         3736 non-null   object 
 3   Movie_Language      3736 non-null   object 
 4   Movie_Budget        3736 non-null   int64  
 5   Movie_Popularity    3736 non-null   float64
 6   Movie_Release_Date  3736 non-null   object 
 7   Movie_Revenue       3736 non-null   int64  
 8   Movie_Runtime       3736 non-null   float64
 9   Movie_Vote          3736 non-null   float64
 10  Movie_Vote_Count    3736 non-null   int64  
 11  Movie_Keywords      3736 non-null   object 
 12  Movie_Overview      3736 non-null   object 
 13  Movie_Tagline       3736 non-null   object 
 14  Movie_Cast          3736 non-null   object 
 15  Movie_Director      3736 non-null   object 
dtypes: flo

In [21]:
df.shape

(3736, 16)

In [22]:
df.columns

Index(['Movie_ID', 'Movie_Title', 'Movie_Genre', 'Movie_Language',
       'Movie_Budget', 'Movie_Popularity', 'Movie_Release_Date',
       'Movie_Revenue', 'Movie_Runtime', 'Movie_Vote', 'Movie_Vote_Count',
       'Movie_Keywords', 'Movie_Overview', 'Movie_Tagline', 'Movie_Cast',
       'Movie_Director'],
      dtype='object')

In [23]:
# Select the relevant columns
df[['Movie_Overview', 'Movie_Vote']]

Unnamed: 0,Movie_Overview,Movie_Vote
0,It's Ted the Bellhop's first night on the job....,6.5
1,Princess Leia is captured and held hostage by ...,8.1
2,"Nemo, an adventurous young clownfish, is unexp...",7.6
3,A man with a low IQ has accomplished great thi...,8.2
4,"Lester Burnham, a depressed suburban father in...",7.9
...,...,...
3731,When his hard-earned kicks get snatched by a l...,7.5
3732,Sara Gold is a young girl on a quest to save m...,5.3
3733,A squad of soldiers fight in the Korean War's ...,5.8
3734,"Long before he even met Shrek, the notorious f...",6.4


In [24]:
# Define a function to clean the text data
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert text to lowercase
    return text

In [25]:
# Apply the clean_text function to the 'Movie_Overview' column
df['Movie_Overview'] = df['Movie_Overview'].apply(clean_text)

In [26]:
# Convert 'Movie_Vote' into binary sentiment labels (1 for positive, 0 for negative)
df['Sentiment'] = df['Movie_Vote'].apply(lambda x: 1 if x >= 5 else 0)

In [27]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Movie_Overview'], df['Sentiment'], test_size=0.2, random_state=42)

In [28]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [29]:
# Train and evaluate different models

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)
nb_pred = nb_model.predict(X_test_vec)
nb_accuracy = accuracy_score(y_test, nb_pred)
nb_report = classification_report(y_test, nb_pred)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_vec, y_train)
rf_pred = rf_model.predict(X_test_vec)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)

# Support Vector Machine
svm_model = SVC(random_state=42)
svm_model.fit(X_train_vec, y_train)
svm_pred = svm_model.predict(X_test_vec)
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_report = classification_report(y_test, svm_pred)

# Print results
print(f'Naive Bayes Accuracy: {nb_accuracy}')
print(f'Naive Bayes Classification Report:\n{nb_report}')

print(f'Random Forest Accuracy: {rf_accuracy}')
print(f'Random Forest Classification Report:\n{rf_report}')

print(f'SVM Accuracy: {svm_accuracy}')
print(f'SVM Classification Report:\n{svm_report}')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes Accuracy: 0.9264705882352942
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        55
           1       0.93      1.00      0.96       693

    accuracy                           0.93       748
   macro avg       0.46      0.50      0.48       748
weighted avg       0.86      0.93      0.89       748

Random Forest Accuracy: 0.9264705882352942
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        55
           1       0.93      1.00      0.96       693

    accuracy                           0.93       748
   macro avg       0.46      0.50      0.48       748
weighted avg       0.86      0.93      0.89       748

SVM Accuracy: 0.9264705882352942
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        55
           1       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Get Feature Selection**



In [30]:
df_features = df[[ 'Movie_Genre', 'Movie_Keywords', 'Movie_Tagline', 'Movie_Cast', 'Movie_Director']].fillna('')

In [31]:
df_features.shape

(3736, 5)

In [32]:
df_features

Unnamed: 0,Movie_Genre,Movie_Keywords,Movie_Tagline,Movie_Cast,Movie_Director
0,Crime Comedy,hotel new year's eve witch bet hotel room,Twelve outrageous guests. Four scandalous requ...,Tim Roth Antonio Banderas Jennifer Beals Madon...,Allison Anders
1,Adventure Action Science Fiction,android galaxy hermit death star lightsaber,"A long time ago in a galaxy far, far away...",Mark Hamill Harrison Ford Carrie Fisher Peter ...,George Lucas
2,Animation Family,father son relationship harbor underwater fish...,"There are 3.7 trillion fish in the ocean, they...",Albert Brooks Ellen DeGeneres Alexander Gould ...,Andrew Stanton
3,Comedy Drama Romance,vietnam veteran hippie mentally disabled runni...,"The world will never be the same, once you've ...",Tom Hanks Robin Wright Gary Sinise Mykelti Wil...,Robert Zemeckis
4,Drama,male nudity female nudity adultery midlife cri...,Look closer.,Kevin Spacey Annette Bening Thora Birch Wes Be...,Sam Mendes
...,...,...,...,...,...
3731,Adventure,blow job cigarette smoking illegal drugs smoki...,They aren't just shoes,Mahershala Ali Kofi Siriboe Christopher Meyer ...,Justin Tipping
3732,Drama,undercover dog animal welfare animal rights or...,She ended up on the wrong side of right.,Allison Paige James Remar Lea Thompson Jayson ...,Alex Ranarivelo
3733,History Drama War Action,"korea fictionalized history operation ""trudy j...",The Odds Were 5000 to 1 … One was all They Nee...,Lee Jung-jae Liam Neeson Lee Beom-soo Jin Se-y...,John H. Lee
3734,Action Adventure Animation Family Fantasy,adventure fairy-tale figure,Live for danger. Fight for justice. Pray for m...,Antonio Banderas Salma Hayek Zach Galifianakis...,Chris Miller


In [33]:
x = df_features['Movie_Genre'] + '' + df_features['Movie_Keywords'] + '' + df_features['Movie_Tagline'] +''+ df_features['Movie_Cast']+ ''+ df_features['Movie_Director']

In [34]:
x

0       Crime Comedyhotel new year's eve witch bet hot...
1       Adventure Action Science Fictionandroid galaxy...
2       Animation Familyfather son relationship harbor...
3       Comedy Drama Romancevietnam veteran hippie men...
4       Dramamale nudity female nudity adultery midlif...
                              ...                        
3731    Adventureblow job cigarette smoking illegal dr...
3732    Dramaundercover dog animal welfare animal righ...
3733    History Drama War Actionkorea fictionalized hi...
3734    Action Adventure Animation Family Fantasyadven...
3735    Thriller Dramachristian film sex traffickingSh...
Length: 3736, dtype: object

In [35]:
x.shape

(3736,)

# Get Feature Text Conversion to Tokens

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [37]:
tfidf = TfidfVectorizer()


In [38]:
x = tfidf.fit_transform(x)


In [39]:
x.shape

(3736, 23144)

In [40]:
print(x)

  (0, 882)	0.16399376793991047
  (0, 20873)	0.19580522160421626
  (0, 13250)	0.1418479473495443
  (0, 13012)	0.17477899443518957
  (0, 1811)	0.17989949477206338
  (0, 11153)	0.09897970055040688
  (0, 1608)	0.14296754077088375
  (0, 1044)	0.14078044728365882
  (0, 17755)	0.14296754077088375
  (0, 20800)	0.11455042707933032
  (0, 12495)	0.09036048613230543
  (0, 15157)	0.060577321769720346
  (0, 22538)	0.17477899443518957
  (0, 8162)	0.08519367475403691
  (0, 22484)	0.11031131269716153
  (0, 11222)	0.13608100670933915
  (0, 20286)	0.10235662073927458
  (0, 15235)	0.07121741273261348
  (0, 5236)	0.11893459322907225
  (0, 8023)	0.11700512841834275
  (0, 10000)	0.19795940110081375
  (0, 10720)	0.1502023992069678
  (0, 1959)	0.19580522160421626
  (0, 12692)	0.15285276512076806
  (0, 15238)	0.08384609896687899
  :	:
  (3734, 379)	0.0847408618864912
  (3734, 1608)	0.18880589898275651
  (3734, 1044)	0.1859175779709501
  (3734, 8162)	0.3375256004862958
  (3735, 2432)	0.28444824987693956
  (3735,

# **Get Similarity Score using Cosine Similarity**


In [41]:
 from sklearn.metrics.pairwise import cosine_similarity


In [42]:
Similarity_Score = cosine_similarity(x)


In [43]:
Similarity_Score

array([[1.        , 0.01382658, 0.03604693, ..., 0.00676268, 0.08192172,
        0.        ],
       [0.01382658, 1.        , 0.00808918, ..., 0.        , 0.01172742,
        0.        ],
       [0.03604693, 0.00808918, 1.        , ..., 0.01954575, 0.04699168,
        0.        ],
       ...,
       [0.00676268, 0.        , 0.01954575, ..., 1.        , 0.        ,
        0.005821  ],
       [0.08192172, 0.01172742, 0.04699168, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.005821  , 0.        ,
        1.        ]])

In [44]:
Similarity_Score.shape

(3736, 3736)

**Get Movie Name as Input from User and Validate for Closest Spelling**

In [45]:
Favourite_Movie_Name = input(' Enter your favourite movie name :')

 Enter your favourite movie name :harry potter


In [46]:
All_Movies_Title_List = df['Movie_Title'].tolist()

In [47]:
import difflib

In [48]:
Movie_Recommendation = difflib.get_close_matches (Favourite_Movie_Name, All_Movies_Title_List)
print(Movie_Recommendation)

['Party Monster']


In [49]:
Close_Match = Movie_Recommendation[0]
print (Close_Match)

Party Monster


In [50]:

Index_of_Close_Match_Movie = df [df.Movie_Title == Close_Match]['Movie_ID'].values[0]
print(Index_of_Close_Match_Movie)


480


In [51]:
# getting a list of similar movies

Recommendation_Score = list(enumerate(Similarity_Score[Index_of_Close_Match_Movie]))
print (Recommendation_Score)




[(0, 0.03688559038014728), (1, 0.010732773884911277), (2, 0.009941982019048418), (3, 0.008680662688723244), (4, 0.0), (5, 0.004019446921864555), (6, 0.013059235501939439), (7, 0.014099145425574767), (8, 0.012021032077714293), (9, 0.005498083192474922), (10, 0.004222534344622976), (11, 0.004496990375007899), (12, 0.013006434588129062), (13, 0.010044823515759454), (14, 0.022647117239262257), (15, 0.003529725007697225), (16, 0.007105976900328582), (17, 0.003810607578816616), (18, 0.016820398238821675), (19, 0.0), (20, 0.004591214317861938), (21, 0.005913979326318338), (22, 0.004223348495390256), (23, 0.014043343078026621), (24, 0.018186842739306833), (25, 0.0), (26, 0.006863116608641675), (27, 0.0), (28, 0.02090334877371906), (29, 0.0), (30, 0.0038376177303312085), (31, 0.020182667669229416), (32, 0.013906061417945929), (33, 0.00430227687077725), (34, 0.011925766965777965), (35, 0.012186933838817058), (36, 0.0061107371413322955), (37, 0.0056424079940923435), (38, 0.0050164561916965665), (

In [52]:
 len(Recommendation_Score)



3736

# **Get All Movies Sorted Based on Recommendation Score for your Favourite Movie**


In [53]:
#sorting the movies based on their similarity score

Sorted_Similar_Movies = sorted(Recommendation_Score, key = lambda x:x[1], reverse=True)
print (Sorted_Similar_Movies)


[(480, 1.0), (749, 0.13403480391857736), (1711, 0.09705339581663155), (1651, 0.09359874657063783), (1150, 0.07490643638689187), (1039, 0.07046188561254592), (1669, 0.06886706390168676), (320, 0.06755705451804525), (1774, 0.06612334595457367), (1871, 0.06551061348112444), (3133, 0.0649672329677061), (1303, 0.06407074457397305), (2950, 0.06262953176563953), (1682, 0.06253570172564697), (89, 0.062303366828237894), (297, 0.06127926390679003), (1954, 0.05960640200577683), (1061, 0.05918538466783155), (2082, 0.05888540598549537), (162, 0.05860143896033232), (87, 0.05748858097925567), (1638, 0.05733865125164254), (2161, 0.05631145217115622), (3360, 0.05610534186409006), (1944, 0.056042649274691436), (341, 0.053839383146945134), (3734, 0.05382628467016462), (109, 0.053768136422593205), (3083, 0.05346298045488385), (2079, 0.05304048156063892), (1431, 0.05271480064166616), (1163, 0.052565063070893175), (1144, 0.052362241701449264), (2210, 0.05199436366038447), (1340, 0.050808287056539475), (1875

In [54]:
# print the name of similar movies based on the index

print('Top 30 Movies Suggested for You :\ n ')

i=1

for movie in Sorted_Similar_Movies:
   index = movie[0]
   title_from_index = df [df.index==index]['Movie_Title'].values[0]
   if (i<31):
    print(i, '.',title_from_index)
    i+=1

Top 30 Movies Suggested for You :\ n 
1 . Apocalypto
2 . Flash Gordon
3 . Darkness
4 . Little Shop of Horrors
5 . The Man in the Iron Mask
6 . Mad Max 2: The Road Warrior
7 . Switchback
8 . From Hell
9 . The Count of Monte Cristo
10 . Death Sentence
11 . Jeff, Who Lives at Home
12 . Assassins
13 . Kick-Ass 2
14 . Under the Tuscan Sun
15 . Braveheart
16 . A Streetcar Named Desire
17 . Underworld: Rise of the Lycans
18 . Wanted
19 . Nancy Drew
20 . AVP: Alien vs. Predator
21 . Star Trek: Generations
22 . Birth
23 . This Christmas
24 . Tomorrowland
25 . Spy Kids 3-D: Game Over
26 . Shrek 2
27 . Puss in Boots
28 . The Godfather: Part III
29 . 12 Years a Slave
30 . The Hammer


# **Top 10 Movies Recommended Based on Your Favorite Movie**


In [55]:
import difflib

Movie_Name = input('Enter your favorite movie name: ')

list_of_all_titles = df['Movie_Title'].tolist()

# Find close matches to the input movie name
close_matches = difflib.get_close_matches(Movie_Name, list_of_all_titles)

if close_matches:
    closest_match = close_matches[0]  # Get the closest match
    Index_of_Movie = df[df.Movie_Title == closest_match]['Movie_ID'].values[0]

    Recommendation_Score = list(enumerate(Similarity_Score[Index_of_Movie]))

    sorted_similar_movies = sorted(Recommendation_Score, key=lambda x: x[1], reverse=True)

    print('Top 10 Movies suggested for you: \n')

    i = 1

    for movie in sorted_similar_movies:
        index = movie[0]
        if index < len(df):
            title_from_index = df[df.Movie_ID == index]['Movie_Title'].values[0]
            print(i, '.', title_from_index)
            i += 1
        else:
            print("Invalid index:", index)

        if i > 10:
            break
else:
    print('No close matches found for the entered movie name.')


Enter your favorite movie name: harry potter
Top 10 Movies suggested for you: 

1 . Party Monster
2 . The Blair Witch Project
3 . The Doors
4 . Moby Dick
5 . Alfie
6 . Run, Fatboy, Run
7 . The Replacements
8 . A Hard Day's Night
9 . Darkness Falls
10 . The Animal
