<a href="https://colab.research.google.com/github/TRISHA16-design/hello-world/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import files
import pandas as pd
import io

print("Please upload your two CSV files:")
# This will open a dialog. Select both your CSV files.
uploaded = files.upload()

# Check if files were uploaded
if not uploaded:
    print("No files were uploaded.")
else:
    print(f"\nSuccessfully uploaded {len(uploaded)} file(s):")

    # Create a dictionary to store your dataframes
    dataframes = {}

    for file_name, file_content in uploaded.items():
        print(f"- {file_name}")
        if file_name.lower().endswith('.csv'):
            try:
                # Read the CSV into a pandas DataFrame
                df = pd.read_csv(io.BytesIO(file_content))
                dataframes[file_name] = df
                print(f"  Successfully loaded '{file_name}' into a DataFrame.")
                # Optionally display the head of each dataframe
                # print("  First 5 rows:")
                # display(df.head())
            except Exception as e:
                print(f"  Error reading CSV '{file_name}': {e}")
        else:
            print(f"  '{file_name}' is not a CSV file and will be ignored for DataFrame creation.")

Please upload your two CSV files:


Saving rotten_tomatoes_movie_reviews.csv to rotten_tomatoes_movie_reviews.csv
Saving rotten_tomatoes_movies.csv to rotten_tomatoes_movies.csv

Successfully uploaded 2 file(s):
- rotten_tomatoes_movie_reviews.csv
  Successfully loaded 'rotten_tomatoes_movie_reviews.csv' into a DataFrame.
- rotten_tomatoes_movies.csv
  Successfully loaded 'rotten_tomatoes_movies.csv' into a DataFrame.


In [3]:
# Cell 1: Imports and Data Loading
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re # For basic text cleaning

# IMPORTANT: Upload your 'rotten_tomatoes_movies.csv' and 'rotten_tomatoes_movie_reviews.csv'
# files to your Colab environment before running this cell.
# You can do this by clicking on the "Files" icon on the left sidebar, then "Upload".

try:
    movies_df_raw = pd.read_csv('rotten_tomatoes_movies.csv')
    reviews_df_raw = pd.read_csv('rotten_tomatoes_movie_reviews.csv')
    print("Datasets loaded successfully!")
except FileNotFoundError:
    print("ERROR: Make sure 'rotten_tomatoes_movies.csv' and 'rotten_tomatoes_movie_reviews.csv' are uploaded to Colab.")
    print("Creating dummy DataFrames for demonstration purposes. Functionality will be limited.")
    # Create dummy dataframes to allow the rest of the script to run without errors for demonstration
    movies_df_raw = pd.DataFrame({
        'id': ['m1', 'm2', 'm3'],
        'title': ['Dummy Movie 1', 'Dummy Movie 2', 'Another Dummy Film'],
        'audienceScore': [60, 70, 80],
        'tomatoMeter': [65, 75, 85],
        'rating': ['PG', 'PG-13', 'R'],
        'ratingContents': ['Mild action', 'Some violence', 'Strong language'],
        'genre': ['Action|Adventure', 'Comedy', 'Drama|Thriller'],
        'director': ['Director A', 'Director B', 'Director C'],
        'writer': ['Writer X', 'Writer Y', 'Writer Z']
    })
    reviews_df_raw = pd.DataFrame({
        'id': ['m1', 'm1', 'm2', 'm3'],
        'reviewText': ['Good fun', 'Enjoyable for family', 'Very funny', 'Gripping story'],
        'scoreSentiment': ['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE']
    })

print("\n--- Raw Movies DataFrame Head ---")
print(movies_df_raw.head())
print(f"\nRaw Movies DataFrame Shape: {movies_df_raw.shape}")

print("\n--- Raw Reviews DataFrame Head ---")
print(reviews_df_raw.head())
print(f"\nRaw Reviews DataFrame Shape: {reviews_df_raw.shape}")

# Make copies to work with, preserving the raw loaded data
movies_df = movies_df_raw.copy()
reviews_df = reviews_df_raw.copy()

Datasets loaded successfully!

--- Raw Movies DataFrame Head ---
                     id                title  audienceScore  tomatoMeter  \
0    space-zombie-bingo  Space Zombie Bingo!           50.0          NaN   
1       the_green_grass      The Green Grass            NaN          NaN   
2             love_lies           Love, Lies           43.0          NaN   
3  the_sore_losers_1997          Sore Losers           60.0          NaN   
4  dinosaur_island_2002      Dinosaur Island           70.0          NaN   

  rating ratingContents releaseDateTheaters releaseDateStreaming  \
0    NaN            NaN                 NaN           2018-08-25   
1    NaN            NaN                 NaN           2020-02-11   
2    NaN            NaN                 NaN                  NaN   
3    NaN            NaN                 NaN           2020-10-23   
4    NaN            NaN                 NaN           2017-03-27   

   runtimeMinutes                          genre originalLanguage  \


In [4]:
# Cell 2: Preprocess Movies Data
print("\n--- Preprocessing Movies Data ---")

# Select relevant columns from movies_df
# Added 'id' to ensure it's available for merging if not already first col
movies_cols = ['id', 'title', 'audienceScore', 'tomatoMeter', 'rating', 'ratingContents', 'genre', 'director', 'writer']
# Ensure all selected columns exist, otherwise, only use existing ones
movies_cols_exist = [col for col in movies_cols if col in movies_df.columns]
movies_df = movies_df[movies_cols_exist]


# Handle missing values for key textual features
for col in ['genre', 'director', 'ratingContents', 'writer', 'title']:
    if col in movies_df.columns:
        movies_df[col] = movies_df[col].fillna('')
        if col == 'title': # Specifically ensure 'title' is not empty for lookup
             movies_df[col] = movies_df[col].apply(lambda x: 'Unknown Title' if x == '' else x)


# Handle missing numerical scores (optional: fill with mean/median, or 0)
for col in ['audienceScore', 'tomatoMeter']:
    if col in movies_df.columns:
        movies_df[col] = movies_df[col].fillna(0)

print("\n--- Processed Movies DataFrame Head ---")
print(movies_df.head())
print(f"\nProcessed Movies DataFrame Shape: {movies_df.shape}")


--- Preprocessing Movies Data ---

--- Processed Movies DataFrame Head ---
                     id                title  audienceScore  tomatoMeter  \
0    space-zombie-bingo  Space Zombie Bingo!           50.0          0.0   
1       the_green_grass      The Green Grass            0.0          0.0   
2             love_lies           Love, Lies           43.0          0.0   
3  the_sore_losers_1997          Sore Losers           60.0          0.0   
4  dinosaur_island_2002      Dinosaur Island           70.0          0.0   

  rating ratingContents                          genre  \
0    NaN                        Comedy, Horror, Sci-fi   
1    NaN                                         Drama   
2    NaN                                         Drama   
3    NaN                    Action, Mystery & thriller   
4    NaN                 Fantasy, Adventure, Animation   

                        director                                  writer  
0                  George Ormrod           

In [5]:
# Cell 3: Preprocess and Aggregate Reviews Data
print("\n--- Preprocessing and Aggregating Reviews Data ---")

if not reviews_df.empty and 'id' in reviews_df.columns:
    # Select relevant columns
    reviews_cols = ['id', 'reviewText', 'scoreSentiment']
    reviews_cols_exist = [col for col in reviews_cols if col in reviews_df.columns]
    reviews_df_processed = reviews_df[reviews_cols_exist].copy() # Use .copy() to avoid SettingWithCopyWarning

    if 'reviewText' in reviews_df_processed.columns:
        reviews_df_processed.loc[:, 'reviewText'] = reviews_df_processed['reviewText'].fillna('')
    else:
        reviews_df_processed.loc[:, 'reviewText'] = '' # Create column if it doesn't exist

    # Aggregate review text for each movie
    if 'reviewText' in reviews_df_processed.columns and 'id' in reviews_df_processed.columns:
        aggregated_reviews = reviews_df_processed.groupby('id')['reviewText'].apply(lambda x: ' '.join(x)).reset_index()
        aggregated_reviews.rename(columns={'reviewText': 'aggregatedReviewText'}, inplace=True)
        print("\nAggregated Reviews Head:")
        print(aggregated_reviews.head())
    else:
        print("Could not aggregate reviews: 'id' or 'reviewText' column missing in reviews_df_processed.")
        aggregated_reviews = pd.DataFrame({'id': [], 'aggregatedReviewText': []})

else:
    print("Reviews DataFrame is empty or 'id' column is missing. Skipping review aggregation.")
    aggregated_reviews = pd.DataFrame({'id': [], 'aggregatedReviewText': []})

print(f"\nAggregated Reviews Shape: {aggregated_reviews.shape}")


--- Preprocessing and Aggregating Reviews Data ---

Aggregated Reviews Head:
              id                               aggregatedReviewText
0       $5_a_day  $5 a Day isn't perfect, but it does examine so...
1  009_re_cyborg  Despite its good looks, there's no escaping 00...
2         00_mhz  ...a stylishly minimalist plot, executed beaut...
3        0814255  The Lightning Thief is good juvenile entertain...
4        0878835  Formless and complicated.  [Full review in Spa...

Aggregated Reviews Shape: (69263, 2)


In [6]:
# Cell 4: Merge DataFrames
print("\n--- Merging DataFrames ---")

if not movies_df.empty and 'id' in movies_df.columns:
    if not aggregated_reviews.empty and 'id' in aggregated_reviews.columns:
        # Merge movies_df with aggregated_reviews
        merged_df = pd.merge(movies_df, aggregated_reviews, on='id', how='left')
        # Fill NaNs in 'aggregatedReviewText' that might result from movies without reviews
        if 'aggregatedReviewText' in merged_df.columns:
            merged_df['aggregatedReviewText'] = merged_df['aggregatedReviewText'].fillna('')
        else:
             merged_df['aggregatedReviewText'] = '' # Add column if merge didn't create it
    else:
        print("Aggregated reviews are empty or missing 'id'. Using only movies_df.")
        merged_df = movies_df.copy()
        merged_df['aggregatedReviewText'] = '' # Add empty column if no reviews
else:
    print("Movies DataFrame is empty or missing 'id'. Cannot proceed with merging.")
    merged_df = pd.DataFrame() # Empty dataframe

if not merged_df.empty:
    print("\nMerged DataFrame Head:")
    print(merged_df.head())
    print(f"\nMerged DataFrame Shape: {merged_df.shape}")
else:
    print("Merged DataFrame is empty.")


--- Merging DataFrames ---

Merged DataFrame Head:
                     id                title  audienceScore  tomatoMeter  \
0    space-zombie-bingo  Space Zombie Bingo!           50.0          0.0   
1       the_green_grass      The Green Grass            0.0          0.0   
2             love_lies           Love, Lies           43.0          0.0   
3  the_sore_losers_1997          Sore Losers           60.0          0.0   
4  dinosaur_island_2002      Dinosaur Island           70.0          0.0   

  rating ratingContents                          genre  \
0    NaN                        Comedy, Horror, Sci-fi   
1    NaN                                         Drama   
2    NaN                                         Drama   
3    NaN                    Action, Mystery & thriller   
4    NaN                 Fantasy, Adventure, Animation   

                        director                                  writer  \
0                  George Ormrod              George Ormrod,John S

In [7]:
# Cell 5: Feature Engineering - Text Cleaning Function
print("\n--- Defining Text Cleaning Function ---")

def clean_text(text):
    """A simple function to clean text data."""
    if isinstance(text, list): # Handles cases where genre might be a list of strings
        text = ' '.join(map(str, text)) # Ensure all elements are strings before joining
    if not isinstance(text, str): # Ensure text is a string
        text = str(text)
    text = text.lower() # Lowercase
    text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with single space
    text = re.sub(r'[^a-z0-9\s]', '', text) # Remove special characters (alphanumeric and spaces only)
    return text.strip()

print("clean_text function defined.")
# Test the function
print(f"Test clean_text('Action|Adventure!'): {clean_text('Action|Adventure!')}")
print(f"Test clean_text(123): {clean_text(123)}")


--- Defining Text Cleaning Function ---
clean_text function defined.
Test clean_text('Action|Adventure!'): actionadventure
Test clean_text(123): 123


In [8]:
# Cell 6: Feature Engineering - Apply Cleaning and Create Content Soup
print("\n--- Applying Cleaning and Creating Content Soup ---")

if not merged_df.empty:
    # Apply cleaning to relevant text columns
    text_cols_to_clean = ['title', 'genre', 'director', 'writer', 'ratingContents', 'aggregatedReviewText']
    for col in text_cols_to_clean:
        if col in merged_df.columns:
             merged_df[col] = merged_df[col].apply(clean_text)
        else:
            print(f"Warning: Column '{col}' not found in merged_df for cleaning.")

    # Create the "content soup" - a single string combining relevant features
    def create_soup(x):
        # Ensure all components are strings and handle potential missing columns gracefully
        genre = str(x.get('genre', '')) * 3 # Weight genre
        director = str(x.get('director', '')) * 2 # Weight director
        writer = str(x.get('writer', ''))
        rating_contents = str(x.get('ratingContents', ''))
        reviews = str(x.get('aggregatedReviewText', ''))
        # title = str(x.get('title', '')) # Title can be added if desired for soup

        return f"{genre} {director} {writer} {rating_contents} {reviews}".strip()

    merged_df['soup'] = merged_df.apply(create_soup, axis=1)

    print("\nContent Soup Example (first few movies):")
    for i in range(min(3, len(merged_df))): # Show for first 3 or fewer if less data
        title_display = merged_df['title'].iloc[i] if 'title' in merged_df.columns else 'N/A'
        soup_display = merged_df['soup'].iloc[i][:200] if 'soup' in merged_df.columns else 'N/A'
        print(f"Movie: {title_display}")
        print(f"Soup: {soup_display}...")
        print("-" * 20)
else:
    print("Merged DataFrame is empty. Skipping feature engineering.")


--- Applying Cleaning and Creating Content Soup ---

Content Soup Example (first few movies):
Movie: space zombie bingo
Soup: comedy horror scificomedy horror scificomedy horror scifi george ormrodgeorge ormrod george ormrodjohn sabotta...
--------------------
Movie: the green grass
Soup: dramadramadrama tiffany edwardstiffany edwards tiffany edwards...
--------------------
Movie: love lies
Soup: dramadramadrama park heungsikheungsik parkpark heungsikheungsik park ha youngjoonjeon yunsusong hyejin  though let down by its routine love triangle narrative love lies has a lot going for it and is a...
--------------------


In [9]:
# Cell 7: TF-IDF Vectorization
print("\n--- TF-IDF Vectorization ---")

# Initialize tfidf_matrix to None outside the if blocks
# This ensures the variable exists even if vectorization is skipped.
tfidf_matrix = None

if not merged_df.empty:
    if 'soup' in merged_df.columns:
        print("Processing 'soup' column for TF-IDF...")
        # Filter out rows where 'soup' is empty or NaN before vectorizing
        soup_series = merged_df['soup'].dropna()
        soup_series = soup_series[soup_series != '']

        if not soup_series.empty:
            print(f"Found {len(soup_series)} non-empty 'soup' entries for TF-IDF.")
            tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=1) # min_df=1 to handle small vocab
            try:
                tfidf_matrix = tfidf.fit_transform(soup_series)
                print("\nTF-IDF Matrix Shape:")
                print(tfidf_matrix.shape) # (number of movies with non-empty soup, number of unique words/phrases)
            except ValueError as e:
                print(f"Error during TF-IDF fitting: {e}")
                print("This might happen if the vocabulary is empty after processing (e.g., all soups are empty or only stop words).")
        else:
            print("No non-empty 'soup' data available for TF-IDF after cleaning and filtering.")
    else:
        print("Cannot perform TF-IDF: 'soup' column is missing in merged_df.")
else:
    print("Cannot perform TF-IDF: merged_df is empty.")


--- TF-IDF Vectorization ---
Processing 'soup' column for TF-IDF...
Found 142815 non-empty 'soup' entries for TF-IDF.

TF-IDF Matrix Shape:
(142815, 9129733)


In [None]:
# Cell 8 (Modified): Cosine Similarity Calculation
import numpy as np # Make sure numpy is imported if you use np.nan_to_num

print("\n--- Cosine Similarity Calculation ---")

cosine_sim = None # Initialize

# Check if tfidf_matrix is valid and has both rows (movies) and columns (features)
if tfidf_matrix is not None and tfidf_matrix.shape[0] > 0 and tfidf_matrix.shape[1] > 0:
    try:
        cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

        # Optional: Handle potential NaN values if any document vectors were all zeros.
        # Scikit-learn's cosine_similarity often handles this by returning 0 for similarity
        # with a zero vector. If you encounter NaNs and they cause issues later (e.g., in sorting),
        # you can convert them to zeros:
        # cosine_sim = np.nan_to_num(cosine_sim) # Uncomment if needed

        print("\nCosine Similarity Matrix Shape:")
        print(cosine_sim.shape)
    except Exception as e:
        print(f"Error calculating cosine similarity: {e}")
elif tfidf_matrix is not None:
    # This case handles when tfidf_matrix exists but is not suitable (e.g., no movies or no features)
    print(f"TF-IDF matrix is not suitable for similarity calculation. Shape: {tfidf_matrix.shape}. Skipping cosine similarity.")
    print("This can happen if there are no documents with valid 'soup' or if the vocabulary extracted is empty.")
else: # This case handles when tfidf_matrix itself is None
    print("TF-IDF matrix is None (likely not created successfully in Cell 7). Skipping cosine similarity.")




--- Cosine Similarity Calculation ---


In [None]:
# Cell 9: Recommendation Function Definition
print("\n--- Defining Recommendation Function ---")

import pandas as pd # Ensure pandas is imported

# Create a Series for reverse mapping of movie titles to indices
# This mapping should be based on the movies that were actually included in the tfidf_matrix
# (i.e., those with non-empty soup)
indices_map = pd.Series(dtype='float64') # Initialize with dtype to avoid issues with empty Series

if not merged_df.empty and 'title' in merged_df.columns and 'soup' in merged_df.columns:
    # Filter merged_df to include only movies that contributed to tfidf_matrix
    # These are movies with non-empty soup
    df_for_indices = merged_df[merged_df['soup'].fillna('') != ''].copy()
    df_for_indices.reset_index(drop=True, inplace=True) # Reset index to match tfidf_matrix rows

    if not df_for_indices.empty:
        # Ensure titles used for mapping are the cleaned ones from df_for_indices
        # Use .iloc if title is not unique, though 'id' would be better for non-unique titles
        if df_for_indices['title'].duplicated().any():
            print("Warning: Duplicate movie titles exist in the filtered data. Using first occurrence for recommendations.")
        indices_map = pd.Series(df_for_indices.index, index=df_for_indices['title']).drop_duplicates()
        print(f"Indices map created with {len(indices_map)} entries.")
    else:
        print("No data with non-empty soup to create indices map.")
else:
    print("Merged DataFrame, 'title', or 'soup' column missing. Cannot create indices map.")


def get_recommendations(title, cosine_sim_matrix=cosine_sim, data=merged_df, idx_map=indices_map):
    """
    Generates movie recommendations based on cosine similarity.
    Args:
        title (str): The title of the movie to get recommendations for.
        cosine_sim_matrix (np.array): The precomputed cosine similarity matrix.
        data (pd.DataFrame): The DataFrame containing original movie data (used for displaying results).
                            It's assumed this 'data' df aligns with the original full movie list.
        idx_map (pd.Series): A mapping from cleaned movie titles to their DataFrame indices
                             (indices in the cosine_sim_matrix).
    Returns:
        pd.DataFrame: A DataFrame of recommended movies with their scores.
    """
    title_cleaned = clean_text(title)

    if cosine_sim_matrix is None:
        print("Error: Cosine similarity matrix is not available.")
        return pd.DataFrame()
    if data.empty: # Check if the original data (merged_df) is empty
        print("Error: Main data for recommendations (merged_df) is empty.")
        return pd.DataFrame()
    if idx_map.empty:
        print("Error: Indices map is empty.")
        return pd.DataFrame()

    if title_cleaned not in idx_map:
        print(f"Error: Movie '{title}' (cleaned: '{title_cleaned}') not found in the recommendation index.")
        possible_matches = [t for t in idx_map.index if title_cleaned in t or t in title_cleaned]
        if possible_matches:
            print(f"Did you mean one of these? {possible_matches[:5]}")
        return pd.DataFrame()

    # Get the index of the movie that matches the title in the TF-IDF matrix
    matrix_idx = idx_map[title_cleaned]

    # Get pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim_matrix[matrix_idx]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get scores of the 10 most similar movies (excluding the movie itself, which will have score 1.0)
    # sim_scores[0] will be the movie itself.
    sim_scores = sim_scores[1:11]

    # Get movie indices from the TF-IDF matrix perspective
    tfidf_movie_indices = [i[0] for i in sim_scores]

    # recommendation_data_source should be the same df that was used to create the indices_map and tfidf_matrix
    recommendation_data_source = merged_df[merged_df['soup'].fillna('') != ''].reset_index(drop=True)

    # Columns to display in recommendations
    display_cols = ['title', 'genre', 'director', 'audienceScore', 'tomatoMeter']
    # Ensure these columns exist in the recommendation_data_source
    actual_display_cols = [col for col in display_cols if col in recommendation_data_source.columns]

    # Ensure we only try to iloc if tfidf_movie_indices are valid for recommendation_data_source
    valid_indices = [idx for idx in tfidf_movie_indices if idx < len(recommendation_data_source)]
    if not valid_indices:
        print("No valid movie indices found for recommendations after filtering.")
        return pd.DataFrame()


    recommendations = recommendation_data_source[actual_display_cols].iloc[valid_indices].copy()
    # Ensure similarity scores align with the potentially filtered valid_indices
    recommendations['similarityScore'] = [s[1] for i, s in enumerate(sim_scores) if tfidf_movie_indices[i] in valid_indices]


    return recommendations

print("get_recommendations function defined.")

In [3]:
# Cell 10: Test Recommendations
print("\n--- Testing Recommendations ---")

if cosine_sim is not None and not merged_df.empty and not indices_map.empty:
    # Try to find a movie that exists in the indices_map for testing
    test_movie_title = ""
    if not indices_map.empty:
        test_movie_title = indices_map.index[0] # Get the first movie title from the map
        print(f"\nAttempting to get recommendations for: '{test_movie_title}' (using cleaned version from dataset)")

        recommendations = get_recommendations(test_movie_title, cosine_sim_matrix=cosine_sim, data=merged_df, idx_map=indices_map)

        if not recommendations.empty:
            print(recommendations)
        else:
            print(f"Could not get recommendations for '{test_movie_title}'. Check if it's in 'indices_map' and if 'soup' was generated.")
    else:
        print("No titles available in `indices_map` for testing.")

    print("\n--- Try another example (you might need to change the title) ---")
    # For dummy data, let's try 'Dummy Movie 2' if it exists in indices_map
    another_test_title = "Dummy Movie 2" # Example, change if not in your data (or 'Another Dummy Film')

    # Check if the title (after cleaning) is in the indices_map
    if clean_text(another_test_title) in indices_map:
        print(f"Attempting to get recommendations for: '{another_test_title}'")
        recommendations_dk = get_recommendations(another_test_title, cosine_sim_matrix=cosine_sim, data=merged_df, idx_map=indices_map)
        if not recommendations_dk.empty:
            print(recommendations_dk)
        else:
            print(f"Could not get recommendations for '{another_test_title}'.")
    else:
        print(f"'{another_test_title}' (cleaned: '{clean_text(another_test_title)}') not in indices_map. Try a title from your actual dataset or one of these from the dummy set (if available): {list(indices_map.index[:3])}")

else:
    print("Cannot test recommendations: cosine similarity matrix, merged_df, or indices_map is not available/empty.")
    if cosine_sim is None:
        print("Reason: cosine_sim is None.")
    if merged_df.empty:
        print("Reason: merged_df is empty.")
    if 'indices_map' not in globals() or indices_map.empty :
         print("Reason: indices_map is not defined or empty.")


--- Testing Recommendations ---
Cannot test recommendations: cosine similarity matrix, merged_df, or indices_map is not available/empty.
Reason: cosine_sim is None.


NameError: name 'merged_df' is not defined

In [4]:
# Cell 11: Further Potential Enhancements (Informational)

print("\n--- Further Potential Enhancements ---")
print("1. More Sophisticated Text Cleaning: Lemmatization, stemming, handling specific phrases.")
print("2. Advanced NLP: Use Word Embeddings (Word2Vec, GloVe, Sentence Transformers) instead of TF-IDF for richer semantic understanding.")
print("3. Incorporate More Features: Use numerical features like 'audienceScore' and 'tomatoMeter' directly in the similarity calculation (after scaling), or use them to re-rank TF-IDF based recommendations.")
print("4. Handling Duplicates & IDs: More robust handling of duplicate movie titles by primarily using movie 'id' for mapping and lookups, rather than 'title'. The current 'indices_map' uses titles which might not be unique.")
print("5. User Interface: Build a simple UI (e.g., using Streamlit or Flask) to interact with the recommender.")
print("6. Evaluation: Implement methods to evaluate the quality of recommendations (this is complex for content-based systems without explicit user feedback).")
print("7. Scalability: For very large datasets, explore approximate nearest neighbor algorithms (e.g., Annoy, Faiss) instead of exact cosine similarity.")
print("8. Parameter Tuning: Experiment with TF-IDF parameters (min_df, max_df, ngram_range) and soup creation weights.")



--- Further Potential Enhancements ---
1. More Sophisticated Text Cleaning: Lemmatization, stemming, handling specific phrases.
2. Advanced NLP: Use Word Embeddings (Word2Vec, GloVe, Sentence Transformers) instead of TF-IDF for richer semantic understanding.
3. Incorporate More Features: Use numerical features like 'audienceScore' and 'tomatoMeter' directly in the similarity calculation (after scaling), or use them to re-rank TF-IDF based recommendations.
4. Handling Duplicates & IDs: More robust handling of duplicate movie titles by primarily using movie 'id' for mapping and lookups, rather than 'title'. The current 'indices_map' uses titles which might not be unique.
5. User Interface: Build a simple UI (e.g., using Streamlit or Flask) to interact with the recommender.
6. Evaluation: Implement methods to evaluate the quality of recommendations (this is complex for content-based systems without explicit user feedback).
7. Scalability: For very large datasets, explore approximate near