<a href="https://colab.research.google.com/github/TRISHA16-design/hello-world/blob/main/movie_recomender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files
import pandas as pd
import io

print("Please upload your two CSV files:")
# This will open a dialog. Select both your CSV files.
uploaded = files.upload()

# Check if files were uploaded
if not uploaded:
    print("No files were uploaded.")
else:
    print(f"\nSuccessfully uploaded {len(uploaded)} file(s):")

    # Create a dictionary to store your dataframes
    dataframes = {}

    for file_name, file_content in uploaded.items():
        print(f"- {file_name}")
        if file_name.lower().endswith('.csv'):
            try:
                # Read the CSV into a pandas DataFrame
                df = pd.read_csv(io.BytesIO(file_content))
                dataframes[file_name] = df
                print(f"  Successfully loaded '{file_name}' into a DataFrame.")
                # Optionally display the head of each dataframe
                # print("  First 5 rows:")
                # display(df.head())
            except Exception as e:
                print(f"  Error reading CSV '{file_name}': {e}")
        else:
            print(f"  '{file_name}' is not a CSV file and will be ignored for DataFrame creation.")

Please upload your two CSV files:


Saving rotten_tomatoes_movie_reviews-1.csv to rotten_tomatoes_movie_reviews-1.csv
Saving rotten_tomatoes_movies.csv to rotten_tomatoes_movies.csv

Successfully uploaded 2 file(s):
- rotten_tomatoes_movie_reviews-1.csv
  Successfully loaded 'rotten_tomatoes_movie_reviews-1.csv' into a DataFrame.
- rotten_tomatoes_movies.csv
  Successfully loaded 'rotten_tomatoes_movies.csv' into a DataFrame.


In [5]:
# Cell 1: Imports and Initial Data Loading
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("Step 1: Loading and Preparing Data (Part 1 - Loading CSVs)...")
try:
    movies_df_full = pd.read_csv('rotten_tomatoes_movies.csv')
    reviews_df_full = pd.read_csv('rotten_tomatoes_movie_reviews-1.csv')
    print("Successfully loaded 'rotten_tomatoes_movies.csv' and 'rotten_tomatoes_movie_reviews.csv'.")
    print(f"Movies DataFrame Full Shape: {movies_df_full.shape}")
    print(f"Reviews DataFrame Full Shape: {reviews_df_full.shape}")
except FileNotFoundError:
    print("Error: Ensure 'rotten_tomatoes_movies.csv' and 'rotten_tomatoes_movie_reviews.csv' are in the same directory.")
    # In a notebook, you might want to stop execution here or handle it by exiting.
    # For this script-like structure, we'll allow it to proceed to show where errors would occur.
    movies_df_full = pd.DataFrame() # Create empty DFs to avoid immediate errors in subsequent cells if files not found
    reviews_df_full = pd.DataFrame()

Step 1: Loading and Preparing Data (Part 1 - Loading CSVs)...
Successfully loaded 'rotten_tomatoes_movies.csv' and 'rotten_tomatoes_movie_reviews.csv'.
Movies DataFrame Full Shape: (143258, 16)
Reviews DataFrame Full Shape: (241071, 11)


In [6]:
# Cell 2: Data Preprocessing - Aggregating Reviews and Merging
print("\nStep 1: Loading and Preparing Data (Part 2 - Aggregating Reviews and Merging)...")
if not movies_df_full.empty and not reviews_df_full.empty:
    # Preprocess Reviews: Fill NaN in reviewText and aggregate reviews by movie ID
    reviews_df_full['reviewText'] = reviews_df_full['reviewText'].fillna('')
    critics_reviews_agg = reviews_df_full.groupby('id')['reviewText'].apply(lambda x: ' '.join(x)).reset_index()
    critics_reviews_agg.rename(columns={'reviewText': 'aggregated_reviews'}, inplace=True)
    print(f"Aggregated reviews shape: {critics_reviews_agg.shape}")

    # Merge aggregated reviews with the movies DataFrame
    movies_df_merged = pd.merge(movies_df_full, critics_reviews_agg, on='id', how='left')
    movies_df_merged['aggregated_reviews'] = movies_df_merged['aggregated_reviews'].fillna('')
    print(f"Merged DataFrame shape: {movies_df_merged.shape}")
    # display(movies_df_merged.head()) # Use display in a Jupyter notebook
else:
    print("Skipping review aggregation and merge as one or both initial DataFrames are empty.")
    movies_df_merged = pd.DataFrame() # Ensure movies_df_merged is defined


Step 1: Loading and Preparing Data (Part 2 - Aggregating Reviews and Merging)...
Aggregated reviews shape: (11936, 2)
Merged DataFrame shape: (143258, 17)


In [7]:
# Cell 3: Limit DataFrame Size and Reset Index
print("\nStep 2: Limiting DataFrame to a maximum of 5000 rows and Resetting Index...")
if not movies_df_merged.empty:
    if len(movies_df_merged) >= 5000:
        movies_df = movies_df_merged.head(5000).copy()
        print(f"DataFrame limited to first 5000 rows.")
    else:
        print(f"DataFrame has {len(movies_df_merged)} rows (less than 5000). Using all available data.")
        movies_df = movies_df_merged.copy()

    movies_df.reset_index(drop=True, inplace=True)
    print(f"Working DataFrame shape: {movies_df.shape}")
    # display(movies_df.head())
else:
    print("Skipping DataFrame limiting as merged DataFrame is empty.")
    movies_df = pd.DataFrame() # Ensure movies_df is defined


Step 2: Limiting DataFrame to a maximum of 5000 rows and Resetting Index...
DataFrame limited to first 5000 rows.
Working DataFrame shape: (5000, 17)


In [8]:
# Cell 4: Feature Engineering - Helper Functions and Initial Cleaning
print("\nStep 3: Feature Engineering (Part 1 - Helper Functions and Initial NaN/Type Handling)...")

if not movies_df.empty:
    # Fill NaN values for key features that will be part of the "soup"
    movies_df['genre'] = movies_df['genre'].fillna('UnknownGenre')
    movies_df['director'] = movies_df['director'].fillna('UnknownDirector')
    movies_df['ratingContents'] = movies_df['ratingContents'].fillna('')
    movies_df['rating'] = movies_df['rating'].fillna('UnknownRating')

    # For numerical scores, fill NaN then convert to string with a prefix
    movies_df['audienceScore'] = movies_df['audienceScore'].fillna(-1).astype(int)
    movies_df['tomatoMeter'] = movies_df['tomatoMeter'].fillna(-1).astype(int)

    # Helper function to clean text features (lowercase, remove spaces)
    def clean_text_feature(text_input):
        return str(text_input).lower().replace(' ', '')

    # Helper function to process comma-separated strings (like genre, director)
    def process_comma_separated_string(text_series, clean_func):
        # Ensure input is string before splitting
        return text_series.apply(lambda x: ' '.join([clean_func(i.strip()) for i in str(x).split(',')]))

    print("Helper functions defined and initial NaN/type handling complete.")
else:
    print("Skipping Feature Engineering Part 1 as DataFrame is empty.")


Step 3: Feature Engineering (Part 1 - Helper Functions and Initial NaN/Type Handling)...
Helper functions defined and initial NaN/type handling complete.


In [9]:
# Cell 5: Feature Engineering - Creating the "Content Soup"
print("\nStep 3: Feature Engineering (Part 2 - Cleaning Features and Creating 'Content Soup')...")

if not movies_df.empty and 'genre' in movies_df.columns: # Check if previous cell ran successfully
    # Clean and prepare features
    movies_df['genre_cleaned'] = process_comma_separated_string(movies_df['genre'], clean_text_feature)
    movies_df['director_cleaned'] = process_comma_separated_string(movies_df['director'], clean_text_feature)
    movies_df['rating_cleaned'] = movies_df['rating'].apply(clean_text_feature)

    # For ratingContents, clean each phrase if comma-separated
    movies_df['ratingContents_cleaned'] = movies_df['ratingContents'].apply(
        lambda x: ' '.join([clean_text_feature(i.strip()) for i in str(x).split(',') if i.strip()]) # Ensure x is str
    )

    # Stringify scores
    movies_df['audienceScore_str'] = movies_df['audienceScore'].apply(lambda x: f"audscore{x}" if x != -1 else "audscoreunknown")
    movies_df['tomatoMeter_str'] = movies_df['tomatoMeter'].apply(lambda x: f"tomscore{x}" if x != -1 else "tomscoreunknown")

    # Create the content soup - repeating genre and director for higher weight
    movies_df['content_soup'] = (
        movies_df['genre_cleaned'] + ' ' + movies_df['genre_cleaned'] + ' ' + movies_df['genre_cleaned'] +
        movies_df['director_cleaned'] + ' ' + movies_df['director_cleaned'] + ' ' + movies_df['director_cleaned'] +
        movies_df['rating_cleaned'] + ' ' +
        movies_df['ratingContents_cleaned'] + ' ' +
        movies_df['audienceScore_str'] + ' ' +
        movies_df['tomatoMeter_str'] + ' ' +
        movies_df['aggregated_reviews'].str.lower() # aggregated_reviews should exist from merge
    )
    print("'content_soup' created.")
    # display(movies_df[['title', 'content_soup']].head())
else:
    print("Skipping Content Soup creation as DataFrame is not ready.")
    if 'movies_df' in locals() and not movies_df.empty: # If df exists but soup creation failed
        movies_df['content_soup'] = "" # Add empty column to avoid error in next cell


Step 3: Feature Engineering (Part 2 - Cleaning Features and Creating 'Content Soup')...
'content_soup' created.


In [10]:
# Cell 6: Vectorization using TF-IDF
print("\nStep 4: Vectorization using TF-IDF...")
tfidf_matrix = None # Initialize to None
if 'content_soup' in movies_df.columns and not movies_df['content_soup'].empty:
    if movies_df['content_soup'].isnull().all() or (movies_df['content_soup'] == "").all():
        print("Warning: 'content_soup' column is empty or all NaNs. TF-IDF will result in an empty matrix or error.")
    try:
        tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
        tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['content_soup'])
        print(f"TF-IDF matrix created with shape: {tfidf_matrix.shape}")
    except Exception as e:
        print(f"Error during TF-IDF Vectorization: {e}")
else:
    print("Skipping TF-IDF. 'content_soup' column is missing, empty, or DataFrame is not ready.")


Step 4: Vectorization using TF-IDF...
TF-IDF matrix created with shape: (5000, 516696)


In [11]:
# Cell 7: Calculating Cosine Similarity
print("\nStep 5: Calculating Cosine Similarity...")
cosine_sim_matrix = None # Initialize to None
if tfidf_matrix is not None:
    try:
        cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
        print(f"Cosine similarity matrix created with shape: {cosine_sim_matrix.shape}")
        # print(cosine_sim_matrix[:2, :2]) # Display a small part
    except Exception as e:
        print(f"Error during Cosine Similarity calculation: {e}")
else:
    print("Skipping cosine similarity. TF-IDF matrix not available.")


Step 5: Calculating Cosine Similarity...
Cosine similarity matrix created with shape: (5000, 5000)


In [12]:
# Cell 8: Recommendation Function Setup - Title to Index Mapping
print("\nStep 6: Setting up Recommendation Function (Part 1 - Title to Index Mapping)...")
title_to_indices = pd.Series(dtype='int') # Initialize as empty Series
movies_df_indexed = pd.DataFrame() # Initialize

if not movies_df.empty and 'title' in movies_df.columns:
    # The DataFrame 'movies_df' is already the limited and correctly indexed version.
    movies_df_indexed = movies_df # This is our working DataFrame for recommendations

    # Ensure indices map directly to the current movies_df_indexed (0 to N-1)
    title_to_indices = pd.Series(movies_df_indexed.index, index=movies_df_indexed['title']).drop_duplicates()
    print(f"Title to Index mapping created. Size: {len(title_to_indices)}")
else:
    print("Cannot create title_to_indices mapping. DataFrame or 'title' column is missing.")


Step 6: Setting up Recommendation Function (Part 1 - Title to Index Mapping)...
Title to Index mapping created. Size: 5000


In [13]:
# Cell 9: Recommendation Function Definition
print("\nStep 6: Setting up Recommendation Function (Part 2 - Defining get_movie_recommendations)...")

def get_movie_recommendations(movie_title_input, num_recommendations=10):
    if title_to_indices.empty or cosine_sim_matrix is None or movies_df_indexed.empty:
        return "Recommender system is not properly initialized (missing data, index, or similarity matrix)."

    current_movie_title = movie_title_input # Use a temporary variable

    if current_movie_title not in title_to_indices:
        # Try a case-insensitive partial match
        possible_matches = movies_df_indexed[movies_df_indexed['title'].str.contains(current_movie_title, case=False, na=False)]
        if not possible_matches.empty:
            actual_title = possible_matches['title'].iloc[0]
            print(f"Input '{current_movie_title}' not found directly. Did you mean '{actual_title}'? Showing recommendations for this movie.")
            current_movie_title = actual_title # Update to the found title
        else:
            return f"Movie '{movie_title_input}' not found in the dataset."

    if current_movie_title not in title_to_indices:
        return f"Movie '{current_movie_title}' (derived from '{movie_title_input}') still not found in the dataset index."

    movie_idx = title_to_indices[current_movie_title]

    if movie_idx >= cosine_sim_matrix.shape[0]:
        return f"Error: Movie index {movie_idx} is out of bounds for similarity matrix (size: {cosine_sim_matrix.shape[0]}). This indicates a mismatch."

    sim_scores = list(enumerate(cosine_sim_matrix[movie_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations + 1] # Exclude the movie itself

    recommended_movie_indices = [i[0] for i in sim_scores]

    recommendations = []
    for i in range(len(recommended_movie_indices)):
        rec_idx = recommended_movie_indices[i]
        if rec_idx >= len(movies_df_indexed):
            print(f"Warning: Recommended index {rec_idx} is out of bounds for DataFrame (size: {len(movies_df_indexed)}). Skipping.")
            continue
        title = movies_df_indexed['title'].iloc[rec_idx]
        score = sim_scores[i][1]
        recommendations.append(f"{title} (Similarity: {score:.2f})")

    return pd.Series(recommendations) if recommendations else "No similar recommendations found."

print("get_movie_recommendations function defined.")


Step 6: Setting up Recommendation Function (Part 2 - Defining get_movie_recommendations)...
get_movie_recommendations function defined.


In [14]:
# Cell 10: Getting Recommendations - Testing
print("\nStep 7: Getting Recommendations (Testing)...")

if not movies_df.empty and cosine_sim_matrix is not None and not title_to_indices.empty:
    # Test with the first movie in the (potentially limited) dataset
    if not movies_df['title'].empty:
        sample_movie_title_1 = movies_df['title'].iloc[0]
        if pd.notna(sample_movie_title_1):
            print(f"\n--- Recommendations for '{sample_movie_title_1}' ---")
            recommendations1 = get_movie_recommendations(sample_movie_title_1, num_recommendations=5)
            if isinstance(recommendations1, pd.Series):
                for i, rec in enumerate(recommendations1):
                    print(f"{i+1}. {rec}")
            else:
                print(recommendations1)
        else:
            print("First movie title in the dataset is NaN, cannot use for testing.")
    else:
        print("Movie DataFrame has no titles to pick a sample for testing.")

    # Example 2: Test with a specific movie title (e.g., 'Toy Story')
    # This will also test the partial match functionality if the exact casing/wording isn't present
    test_movie_2 = "Toy Story"
    print(f"\n--- Recommendations for '{test_movie_2}' (or similar) ---")
    recommendations2 = get_movie_recommendations(test_movie_2, num_recommendations=5)
    if isinstance(recommendations2, pd.Series):
        for i, rec in enumerate(recommendations2):
            print(f"{i+1}. {rec}")
    else:
        print(recommendations2)

    # Example 3: Test with another specific movie title (e.g., 'Inception')
    test_movie_3 = "Inception"
    print(f"\n--- Recommendations for '{test_movie_3}' (or similar) ---")
    recommendations3 = get_movie_recommendations(test_movie_3, num_recommendations=5)
    if isinstance(recommendations3, pd.Series):
        for i, rec in enumerate(recommendations3):
            print(f"{i+1}. {rec}")
    else:
        print(recommendations3)

    # Example 4: Test with a movie title that likely doesn't exist
    test_movie_4 = "A Truly Unique Movie Title That Does Not Exist 12345"
    print(f"\n--- Recommendations for '{test_movie_4}' ---")
    recommendations4 = get_movie_recommendations(test_movie_4, num_recommendations=5)
    print(recommendations4)
else:
    print("\nCannot generate recommendations. System was not fully initialized (check previous cell outputs for errors).")


Step 7: Getting Recommendations (Testing)...

--- Recommendations for 'Space Zombie Bingo!' ---
1. Hobgoblins 2 (Similarity: 0.70)
2. Teenage Zombies (Similarity: 0.46)
3. They Found Hell (Similarity: 0.46)
4. Death Dive (Similarity: 0.40)
5. Empty (Similarity: 0.39)

--- Recommendations for 'Toy Story' (or similar) ---
Input 'Toy Story' not found directly. Did you mean 'Charlie: A Toy Story'? Showing recommendations for this movie.
1. Behind Office Doors (Similarity: 0.16)
2. Pup Tales: Chicken Diddle (Similarity: 0.12)
3. Beartooth (Similarity: 0.12)
4. Hard to Do Good (Similarity: 0.12)
5. Lady Kung Fu (Similarity: 0.12)

--- Recommendations for 'Inception' (or similar) ---
Movie 'Inception' not found in the dataset.

--- Recommendations for 'A Truly Unique Movie Title That Does Not Exist 12345' ---
Movie 'A Truly Unique Movie Title That Does Not Exist 12345' not found in the dataset.
