<a href="https://colab.research.google.com/github/TRISHA16-design/hello-world/blob/main/movie_recomender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import pandas as pd
import io

print("Please upload your two CSV files:")
# This will open a dialog. Select both your CSV files.
uploaded = files.upload()

# Check if files were uploaded
if not uploaded:
    print("No files were uploaded.")
else:
    print(f"\nSuccessfully uploaded {len(uploaded)} file(s):")

    # Create a dictionary to store your dataframes
    dataframes = {}

    for file_name, file_content in uploaded.items():
        print(f"- {file_name}")
        if file_name.lower().endswith('.csv'):
            try:
                # Read the CSV into a pandas DataFrame
                df = pd.read_csv(io.BytesIO(file_content))
                dataframes[file_name] = df
                print(f"  Successfully loaded '{file_name}' into a DataFrame.")
                # Optionally display the head of each dataframe
                # print("  First 5 rows:")
                # display(df.head())
            except Exception as e:
                print(f"  Error reading CSV '{file_name}': {e}")
        else:
            print(f"  '{file_name}' is not a CSV file and will be ignored for DataFrame creation.")

Please upload your two CSV files:


Saving rotten_tomatoes_movie_reviews-1.csv to rotten_tomatoes_movie_reviews-1.csv
Saving rotten_tomatoes_movies.csv to rotten_tomatoes_movies.csv

Successfully uploaded 2 file(s):
- rotten_tomatoes_movie_reviews-1.csv
  Successfully loaded 'rotten_tomatoes_movie_reviews-1.csv' into a DataFrame.
- rotten_tomatoes_movies.csv
  Successfully loaded 'rotten_tomatoes_movies.csv' into a DataFrame.


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- 1. Load and Prepare Data ---
print("Step 1: Loading and Preparing Data...")
try:
    movies_df_full = pd.read_csv('rotten_tomatoes_movies.csv')
    reviews_df_full = pd.read_csv('rotten_tomatoes_movie_reviews-1.csv')
except FileNotFoundError:
    print("Ensure 'rotten_tomatoes_movies.csv' and 'rotten_tomatoes_movie_reviews.csv' are in the same directory.")
    exit()

if not movies_df_full.empty and not reviews_df_full.empty:
    reviews_df_full['reviewText'] = reviews_df_full['reviewText'].fillna('')
    critics_reviews_agg = reviews_df_full.groupby('id')['reviewText'].apply(lambda x: ' '.join(x)).reset_index()
    critics_reviews_agg.rename(columns={'reviewText': 'aggregated_reviews'}, inplace=True)

    movies_df_merged = pd.merge(movies_df_full, critics_reviews_agg, on='id', how='left')
    movies_df_merged['aggregated_reviews'] = movies_df_merged['aggregated_reviews'].fillna('')
    print("Data loaded and merged.")
else:
    print("Could not load one or both CSV files.")
    exit()

# --- 2. Limit DataFrame to 5000 rows and Reset Index ---
print("\nStep 2: Limiting DataFrame to 5000 rows and Resetting Index...")
if len(movies_df_merged) >= 5000:
    movies_df = movies_df_merged.head(5000).copy() # Use .copy() to avoid SettingWithCopyWarning
else:
    print(f"DataFrame has less than 5000 rows ({len(movies_df_merged)} rows). Using all available data.")
    movies_df = movies_df_merged.copy()

movies_df.reset_index(drop=True, inplace=True) # Ensure clean 0 to N-1 index
print(f"DataFrame limited. New shape: {movies_df.shape}")


# --- 3. Feature Engineering: Creating the "Content Soup" ---
print("\nStep 3: Feature Engineering...")
if not movies_df.empty:
    movies_df['genre'] = movies_df['genre'].fillna('UnknownGenre')
    movies_df['director'] = movies_df['director'].fillna('UnknownDirector')
    movies_df['ratingContents'] = movies_df['ratingContents'].fillna('')
    movies_df['rating'] = movies_df['rating'].fillna('UnknownRating')
    movies_df['audienceScore'] = movies_df['audienceScore'].fillna(-1).astype(int)
    movies_df['tomatoMeter'] = movies_df['tomatoMeter'].fillna(-1).astype(int)

    def clean_text_feature(text_input):
        return str(text_input).lower().replace(' ', '')

    def process_comma_separated_string(text_series, clean_func):
        return text_series.apply(lambda x: ' '.join([clean_func(i.strip()) for i in str(x).split(',')]))

    movies_df['genre_cleaned'] = process_comma_separated_string(movies_df['genre'], clean_text_feature)
    movies_df['director_cleaned'] = process_comma_separated_string(movies_df['director'], clean_text_feature)
    movies_df['rating_cleaned'] = movies_df['rating'].apply(clean_text_feature)
    movies_df['ratingContents_cleaned'] = movies_df['ratingContents'].apply(
        lambda x: ' '.join([clean_text_feature(i.strip()) for i in str(x).split(',') if i.strip()])
    )
    movies_df['audienceScore_str'] = movies_df['audienceScore'].apply(lambda x: f"audscore{x}" if x != -1 else "audscoreunknown")
    movies_df['tomatoMeter_str'] = movies_df['tomatoMeter'].apply(lambda x: f"tomscore{x}" if x != -1 else "tomscoreunknown")

    movies_df['content_soup'] = (
        movies_df['genre_cleaned'] + ' ' + movies_df['genre_cleaned'] + ' ' + movies_df['genre_cleaned'] +
        movies_df['director_cleaned'] + ' ' + movies_df['director_cleaned'] + ' ' + movies_df['director_cleaned'] +
        movies_df['rating_cleaned'] + ' ' +
        movies_df['ratingContents_cleaned'] + ' ' +
        movies_df['audienceScore_str'] + ' ' +
        movies_df['tomatoMeter_str'] + ' ' +
        movies_df['aggregated_reviews'].str.lower()
    )
    print("Feature engineering complete. 'content_soup' created.")
else:
    print("Movies DataFrame is empty after limiting. Cannot proceed.")
    exit()

# --- 4. Vectorization using TF-IDF ---
print("\nStep 4: Vectorization using TF-IDF...")
if 'content_soup' in movies_df.columns and not movies_df['content_soup'].empty:
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['content_soup'])
    print(f"TF-IDF matrix created with shape: {tfidf_matrix.shape}") # Should be (5000, num_features) or less
else:
    print("Skipping TF-IDF. 'content_soup' is missing or empty.")
    tfidf_matrix = None
    exit()

# --- 5. Calculating Cosine Similarity ---
print("\nStep 5: Calculating Cosine Similarity...")
if tfidf_matrix is not None:
    cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    print(f"Cosine similarity matrix created with shape: {cosine_sim_matrix.shape}") # Should be (5000, 5000) or less
else:
    print("Skipping cosine similarity. TF-IDF matrix not available.")
    cosine_sim_matrix = None
    exit()

# --- 6. Building the Recommendation Function ---
print("\nStep 6: Setting up Recommendation Function...")
# The DataFrame 'movies_df' is already the limited (e.g., 5000 rows) and correctly indexed version.
# So, 'movies_df_indexed' will be this 'movies_df'.
movies_df_indexed = movies_df # This is our working DataFrame

if 'title' in movies_df_indexed.columns:
    # Ensure indices map directly to the current movies_df_indexed (0 to N-1)
    title_to_indices = pd.Series(movies_df_indexed.index, index=movies_df_indexed['title']).drop_duplicates()
else:
    print("Critical error: 'title' column missing from DataFrame.")
    title_to_indices = pd.Series() # Empty series to prevent further errors
    exit()

def get_movie_recommendations(movie_title_input, num_recommendations=10):
    if title_to_indices.empty or cosine_sim_matrix is None:
        return "Recommender system is not properly initialized."

    # Use a temporary variable for the title to handle case where input is not found
    current_movie_title = movie_title_input

    if current_movie_title not in title_to_indices:
        # Try a case-insensitive partial match
        possible_matches = movies_df_indexed[movies_df_indexed['title'].str.contains(current_movie_title, case=False, na=False)]
        if not possible_matches.empty:
            actual_title = possible_matches['title'].iloc[0]
            print(f"Input '{current_movie_title}' not found. Did you mean '{actual_title}'? Showing recommendations for this movie.")
            current_movie_title = actual_title # Update to the found title
        else:
            return f"Movie '{movie_title_input}' not found in the dataset."

    # Check again if the (potentially updated) title is in our index
    if current_movie_title not in title_to_indices:
        return f"Movie '{current_movie_title}' (derived from '{movie_title_input}') not found in the dataset index."

    movie_idx = title_to_indices[current_movie_title]

    # Boundary check for movie_idx against cosine_sim_matrix dimensions
    if movie_idx >= cosine_sim_matrix.shape[0]:
        return f"Error: Movie index {movie_idx} is out of bounds for similarity matrix (size: {cosine_sim_matrix.shape[0]})."

    sim_scores = list(enumerate(cosine_sim_matrix[movie_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations + 1]

    recommended_movie_indices = [i[0] for i in sim_scores]

    recommendations = []
    for i in range(len(recommended_movie_indices)):
        rec_idx = recommended_movie_indices[i]
        # Boundary check for rec_idx against DataFrame dimensions
        if rec_idx >= len(movies_df_indexed):
            print(f"Warning: Recommended index {rec_idx} is out of bounds for DataFrame (size: {len(movies_df_indexed)}). Skipping.")
            continue
        title = movies_df_indexed['title'].iloc[rec_idx]
        score = sim_scores[i][1]
        recommendations.append(f"{title} (Similarity: {score:.2f})")

    return pd.Series(recommendations) if recommendations else "No recommendations found."

# --- 7. Getting Recommendations (Testing) ---
print("\nStep 7: Getting Recommendations...")
if not movies_df.empty and cosine_sim_matrix is not None and not title_to_indices.empty:
    # Test with the first movie in the (potentially limited) dataset
    if not movies_df['title'].empty:
        sample_movie_title = movies_df['title'].iloc[0]
        if pd.notna(sample_movie_title):
            print(f"\nRecommendations for '{sample_movie_title}':")
            recommendations = get_movie_recommendations(sample_movie_title, num_recommendations=5)
            if isinstance(recommendations, pd.Series):
                for i, rec in enumerate(recommendations):
                    print(f"{i+1}. {rec}")
            else:
                print(recommendations)
        else:
            print("First movie title is NaN, cannot test.")
    else:
        print("No titles available in the dataframe to test.")

    # Check if 'Toy Story' exists in the current movies_df (limited dataset)
    toy_story_exists = movies_df[movies_df['title'].str.contains('Toy Story', case=False, na=False)]
    if not toy_story_exists.empty:
        # If multiple matches, use the first one found
        toy_story_title_to_use = toy_story_exists['title'].iloc[0]
        print(f"\nRecommendations for '{toy_story_title_to_use}' (found via 'Toy Story' search):")
        toy_story_recs = get_movie_recommendations(toy_story_title_to_use, num_recommendations=5)
        if isinstance(toy_story_recs, pd.Series):
            for i, rec in enumerate(toy_story_recs):
                print(f"{i+1}. {rec}")
        else:
            print(toy_story_recs)
    else:
        print("\n'Toy Story' not found in the current dataset portion.")

    # Check if 'Inception' exists
    inception_exists = movies_df[movies_df['title'].str.contains('Inception', case=False, na=False)]
    if not inception_exists.empty:
        inception_title_to_use = inception_exists['title'].iloc[0]
        print(f"\nRecommendations for '{inception_title_to_use}' (found via 'Inception' search):")
        inception_recs = get_movie_recommendations(inception_title_to_use, num_recommendations=5)
        if isinstance(inception_recs, pd.Series):
            for i, rec in enumerate(inception_recs):
                print(f"{i+1}. {rec}")
        else:
            print(inception_recs)
    else:
        print("\n'Inception' not found in the current dataset portion.")
else:
    print("\nCannot generate recommendations. System not fully initialized.")

Step 1: Loading and Preparing Data...
Data loaded and merged.

Step 2: Limiting DataFrame to 5000 rows and Resetting Index...
DataFrame limited. New shape: (5000, 17)

Step 3: Feature Engineering...
Feature engineering complete. 'content_soup' created.

Step 4: Vectorization using TF-IDF...
TF-IDF matrix created with shape: (5000, 516696)

Step 5: Calculating Cosine Similarity...
Cosine similarity matrix created with shape: (5000, 5000)

Step 6: Setting up Recommendation Function...

Step 7: Getting Recommendations...

Recommendations for 'Space Zombie Bingo!':
1. Hobgoblins 2 (Similarity: 0.70)
2. Teenage Zombies (Similarity: 0.46)
3. They Found Hell (Similarity: 0.46)
4. Death Dive (Similarity: 0.40)
5. Empty (Similarity: 0.39)

Recommendations for 'Charlie: A Toy Story' (found via 'Toy Story' search):
1. Behind Office Doors (Similarity: 0.16)
2. Pup Tales: Chicken Diddle (Similarity: 0.12)
3. Beartooth (Similarity: 0.12)
4. Hard to Do Good (Similarity: 0.12)
5. Lady Kung Fu (Similar