<a href="https://colab.research.google.com/github/TRISHA16-design/hello-world/blob/main/MovieRecomender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Open Colab's file-upload dialog and keep the result in `uploaded`.
# In the recorded run this cell received rotten_tomatoes_movie_reviews.csv.
from google.colab import files
uploaded = files.upload()

Saving rotten_tomatoes_movie_reviews.csv to rotten_tomatoes_movie_reviews.csv


In [8]:
# Open Colab's file-upload dialog a second time, for the movies dataset.
# NOTE(review): this rebinds `uploaded`, replacing the value from the earlier upload cell.
# NOTE(review): the recorded output shows the file saved as "rotten_tomatoes_movies (1).csv",
# while the load step reads "rotten_tomatoes_movies.csv" - presumably an earlier copy
# already in the session; confirm which file is intended.
from google.colab import files
uploaded = files.upload()

Saving rotten_tomatoes_movies.csv to rotten_tomatoes_movies (1).csv


In [5]:
# STEP 2: Import Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings('ignore')  # suppress every Python warning from here on

# Download the VADER lexicon, then build the single analyzer instance (`sia`)
# that the sentiment step reuses for every review.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [9]:
# STEP 3: Load Datasets
# Movies are looked up by title later on, so rows without a title are discarded on load.
movies_df = pd.read_csv("rotten_tomatoes_movies.csv").dropna(subset=['title'])
reviews_df = pd.read_csv("rotten_tomatoes_movie_reviews.csv")

# Preview the first rows of the movie table
movies_df.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
0,space-zombie-bingo,Space Zombie Bingo!,50.0,,,,,2018-08-25,75.0,"Comedy, Horror, Sci-fi",English,George Ormrod,"George Ormrod,John Sabotta",,,
1,the_green_grass,The Green Grass,,,,,,2020-02-11,114.0,Drama,English,Tiffany Edwards,Tiffany Edwards,,,
2,love_lies,"Love, Lies",43.0,,,,,,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",,,
3,the_sore_losers_1997,Sore Losers,60.0,,,,,2020-10-23,90.0,"Action, Mystery & thriller",English,John Michael McCarthy,John Michael McCarthy,,,
4,dinosaur_island_2002,Dinosaur Island,70.0,,,,,2017-03-27,80.0,"Fantasy, Adventure, Animation",English,Will Meugniot,John Loy,,,


In [None]:
# STEP 4: Sentiment Score Aggregation
# Use VADER to compute the compound polarity of each review.
# Missing review text is kept as NaN (na_action='ignore') instead of being converted
# to the literal string "nan" and scored, so it is excluded from the per-movie mean.
reviews_df['vader_sentiment'] = (
    reviews_df['reviewText']
    .map(lambda text: sia.polarity_scores(str(text))['compound'], na_action='ignore')
    .astype(float)
)

# Group by movie ID to get the average sentiment score (NaN scores are skipped by mean)
avg_sentiment = (
    reviews_df.groupby('id')['vader_sentiment']
    .mean()
    .rename('avg_sentiment')
    .reset_index()
)

# Merge with the movie dataframe.
# Any avg_sentiment column left by a previous run of this cell is dropped first;
# otherwise the merge would create avg_sentiment_x / avg_sentiment_y and the
# fillna below would raise KeyError on re-execution.
movies_df = (
    movies_df.drop(columns=['avg_sentiment'], errors='ignore')
    .merge(avg_sentiment, on='id', how='left')
)
# Movies with no scored reviews get a sentiment of 0
movies_df['avg_sentiment'] = movies_df['avg_sentiment'].fillna(0)

In [11]:
# STEP 5: Create Combined Features
# Combine important text features into a single string for each movie
def combine_features(row):
    """Join a movie's genre, director and rating into one space-separated string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'genre', 'director' and 'rating'.

    Returns:
        str: The three values formatted as text, in that order, separated by single spaces.
    """
    parts = (row['genre'], row['director'], row['rating'])
    return ' '.join(f"{part}" for part in parts)

# Replace missing values with '' first so absent fields contribute nothing to the text
# instead of appearing as "nan", then build one feature string per movie.
movies_filled = movies_df.fillna('')
movies_df['combined_features'] = movies_filled.apply(combine_features, axis=1)


In [12]:
# STEP 6: Vectorize the Features using TF-IDF
from scipy.sparse import hstack

# Text part: TF-IDF over the combined genre/director/rating string of each movie
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['combined_features'])

# Numeric part: audience score and average review sentiment (missing values -> 0),
# min-max scaled so both columns lie in [0, 1]
numeric_features = movies_df[['audienceScore', 'avg_sentiment']].fillna(0)
feature_min = numeric_features.min()
# A constant column has max == min; dividing by that zero range would fill the column
# with NaN, which cosine_similarity rejects. Dividing by 1 instead maps it to all zeros.
feature_range = (numeric_features.max() - feature_min).replace(0, 1)
numeric_features_scaled = (numeric_features - feature_min) / feature_range

# Combine everything: one row per movie, TF-IDF columns followed by the two numeric columns.
# CSR is requested explicitly so individual movie rows can be sliced from the result.
final_features = hstack(
    [tfidf_matrix, numeric_features_scaled.to_numpy()],
    format='csr',
)


In [None]:
# STEP 7: Compute Similarity Matrix
# All-pairs cosine similarity between movie feature rows; with no second argument
# the rows are compared against themselves. The result is a dense
# (n_movies, n_movies) array, so its memory footprint grows quadratically
# with the number of movies.
cosine_sim = cosine_similarity(final_features)

In [None]:
# STEP 8: Recommendation Function
# Lower-cased title -> row position in movies_df (and therefore in final_features).
# Positions are stored rather than index labels so the lookup stays correct even if
# movies_df.index is not 0..n-1. When several movies share a title the first one wins,
# which guarantees that indices[title] is always a single integer.
indices = pd.Series(np.arange(len(movies_df)), index=movies_df['title'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]

def recommend_movies(title, num_recommendations=5):
    """Recommend the movies most similar to ``title``.

    Similarity is the cosine similarity between the query movie's row of
    ``final_features`` and every other row. Only that single row of similarities
    is computed, so the full movie-by-movie matrix is not required.

    Args:
        title: Movie title to look up (case-insensitive exact match).
        num_recommendations: Maximum number of movies to return.

    Returns:
        A DataFrame with the columns 'title', 'genre' and 'audienceScore' for the
        most similar movies, best match first, or a message string when the title
        is not in the dataset.
    """
    title = title.lower()
    if title not in indices.index:
        return f"Movie '{title}' not found in dataset."

    idx = int(indices[title])

    # tocsr() makes row slicing possible whatever sparse format final_features has
    feature_rows = final_features.tocsr()
    sim_scores = cosine_similarity(feature_rows[idx], feature_rows).ravel()

    # Exclude the query movie explicitly: with tied scores (e.g. duplicate entries)
    # it is not guaranteed to be the first element after sorting.
    sim_scores[idx] = -np.inf

    top_n = max(0, min(int(num_recommendations), len(sim_scores) - 1))
    # Stable sort keeps dataset order among movies with equal similarity
    movie_indices = np.argsort(-sim_scores, kind='stable')[:top_n]
    return movies_df[['title', 'genre', 'audienceScore']].iloc[movie_indices]

In [None]:
# STEP 9: Try it!
# Ask for the five closest matches to a well-known title
recommend_movies("The Dark Knight", num_recommendations=5)
