In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Import Libraries**

In [32]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

# **Import Dataset**

In [33]:
df = pd.read_csv('/content/drive/MyDrive/cleaned.csv')

In [34]:
df.head()

Unnamed: 0,Movie_ID,Movie_Title,Movie_Genre,Movie_Language,Movie_Budget,Movie_Popularity,Movie_Release_Date,Movie_Revenue,Movie_Runtime,Movie_Vote,Movie_Vote_Count,Movie_Keywords,Movie_Overview,Movie_Tagline,Movie_Cast,Movie_Director
0,1,Four Rooms,Crime Comedy,en,4000000,22.87623,1995-12-09,4300000,98.0,6.5,530,hotel new year's eve witch bet hotel room,It's Ted the Bellhop's first night on the job....,Twelve outrageous guests. Four scandalous requ...,Tim Roth Antonio Banderas Jennifer Beals Madon...,Allison Anders
1,2,Star Wars,Adventure Action Science Fiction,en,11000000,126.393695,1977-05-25,775398007,121.0,8.1,6624,android galaxy hermit death star lightsaber,Princess Leia is captured and held hostage by ...,"A long time ago in a galaxy far, far away...",Mark Hamill Harrison Ford Carrie Fisher Peter ...,George Lucas
2,3,Finding Nemo,Animation Family,en,94000000,85.688789,2003-05-30,940335536,100.0,7.6,6122,father son relationship harbor underwater fish...,"Nemo, an adventurous young clownfish, is unexp...","There are 3.7 trillion fish in the ocean, they...",Albert Brooks Ellen DeGeneres Alexander Gould ...,Andrew Stanton
3,4,Forrest Gump,Comedy Drama Romance,en,55000000,138.133331,1994-07-06,677945399,142.0,8.2,7927,vietnam veteran hippie mentally disabled runni...,A man with a low IQ has accomplished great thi...,"The world will never be the same, once you've ...",Tom Hanks Robin Wright Gary Sinise Mykelti Wil...,Robert Zemeckis
4,5,American Beauty,Drama,en,15000000,80.878605,1999-09-15,356296601,122.0,7.9,3313,male nudity female nudity adultery midlife cri...,"Lester Burnham, a depressed suburban father in...",Look closer.,Kevin Spacey Annette Bening Thora Birch Wes Be...,Sam Mendes


# **Cleaning The Dataset and Performing Sentiment Analysis**

In [35]:
# Check for missing values
print(df.isna().sum())

# Select the relevant columns
X = df[['Movie_Overview', 'Movie_Popularity', 'Movie_Revenue']]
y = df['Movie_Vote']

# Define a function to clean the text data
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert text to lowercase
    return text

# Apply the clean_text function to the 'Movie_Overview' column
X['Movie_Overview'] = X['Movie_Overview'].apply(clean_text)

# Convert 'Movie_Vote' into binary sentiment labels (1 for positive, 0 for negative)
df['Sentiment'] = y.apply(lambda x: 1 if x >= 5 else 0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Sentiment'], random_state=42, stratify=df['Sentiment'])

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train['Movie_Overview'])
X_test_vec = vectorizer.transform(X_test['Movie_Overview'])

# Combine TF-IDF features with numerical features
X_train_combined = np.hstack((X_train_vec.toarray(), X_train[['Movie_Popularity', 'Movie_Revenue']].values))
X_test_combined = np.hstack((X_test_vec.toarray(), X_test[['Movie_Popularity', 'Movie_Revenue']].values))

# Train and evaluate different models
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train_combined, y_train)
    predictions = model.predict(X_test_combined)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, zero_division=1)
    print(f'{name} Accuracy: {accuracy}')
    print(f'{name} Classification Report:\n{report}')


Movie_ID              0
Movie_Title           0
Movie_Genre           0
Movie_Language        0
Movie_Budget          0
Movie_Popularity      0
Movie_Release_Date    0
Movie_Revenue         0
Movie_Runtime         0
Movie_Vote            0
Movie_Vote_Count      0
Movie_Keywords        0
Movie_Overview        0
Movie_Tagline         0
Movie_Cast            0
Movie_Director        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Movie_Overview'] = X['Movie_Overview'].apply(clean_text)


Naive Bayes Accuracy: 0.7269807280513919
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.13      0.48      0.21        71
           1       0.95      0.75      0.83       863

    accuracy                           0.73       934
   macro avg       0.54      0.61      0.52       934
weighted avg       0.88      0.73      0.79       934

Random Forest Accuracy: 0.923982869379015
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        71
           1       0.92      1.00      0.96       863

    accuracy                           0.92       934
   macro avg       0.96      0.50      0.48       934
weighted avg       0.93      0.92      0.89       934

SVM Accuracy: 0.923982869379015
SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        71
           1       0.92

# **Top 10 and Top 50 Movies Recommended Based on Your Favorite Movie**


In [76]:
# Combine features to create a 'soup' for each movie
df['soup'] = (df['Movie_Genre'] + ' ' + df['Movie_Keywords'] + ' ' + df['Movie_Tagline'] + ' ' + df['Movie_Cast'] + ' ' + df['Movie_Director'])

# Create a TF-IDF Vectorizer to convert text data into feature vectors
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['soup'])

# Compute cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get movie recommendations based on user's favorite movie
def get_recommendations(favorite_movie, num_recommendations=5, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = df.index[df['Movie_Title'].str.contains(favorite_movie, case=False)].tolist()

    if len(idx) == 0:
        return "Sorry, your favorite movie is not in the database."

    # Use the first found index if multiple movies match
    idx = idx[0]

    # Get the pairwise similarity scores of all movies with the favorite movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top most similar movies as specified by num_recommendations
    sim_scores = sim_scores[1:num_recommendations + 1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top similar movies
    return df['Movie_Title'].iloc[movie_indices]

# Ask the user for their favorite movie
favorite_movie = input("Enter your favorite movie: ")

# Get top 10 recommendations based on the user's input
top_10_recommendations = get_recommendations(favorite_movie, num_recommendations=10)
print(f"\nTop 10 movies similar to '{favorite_movie}':\n")
print(top_10_recommendations)

# Get top 50 recommendations based on the user's input
top_50_recommendations = get_recommendations(favorite_movie, num_recommendations=50)
print(f"\n\nTop 50 movies similar to '{favorite_movie}':\n")
print(top_50_recommendations)


Enter your favorite movie: avengers

Top 10 movies similar to 'avengers':

3217                Avengers: Age of Ultron
3221    Captain America: The Winter Soldier
3589             Captain America: Civil War
1456                             Iron Man 2
3085                   Thor: The Dark World
523                     The Incredible Hulk
2602                                  X-Men
1478                                   Thor
3229                                Ant-Man
3533                      X-Men: Apocalypse
Name: Movie_Title, dtype: object


Top 50 movies similar to 'avengers':

3217                             Avengers: Age of Ultron
3221                 Captain America: The Winter Soldier
3589                          Captain America: Civil War
1456                                          Iron Man 2
3085                                Thor: The Dark World
523                                  The Incredible Hulk
2602                                               X-Men
1478         