In [None]:
!pip install nltk



In [None]:
!pip install pandas nltk scikit-learn



In [None]:
# -*- coding: utf-8 -*-
# Add encoding declaration for potentially non-ascii characters in comments/code

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import heapq # For efficient summarization sentence selection
import sys # To potentially exit if downloads fail

# --- Configuration ---
# Path of the subtitle CSV that process_subtitle_file() reads.
INPUT_CSV_FILE = 'movies_subtitles.csv'  # <--- CHANGE THIS if needed
# Path the per-movie analysis table is written to.
OUTPUT_CSV_FILE = 'movie_analysis_output.csv' # <--- CHANGE THIS (Optional output file)
# Column whose values identify a movie; subtitle rows are grouped by it.
MOVIE_ID_COLUMN = 'imdb_id'          # <--- CHANGE THIS if your column name is different
# Column holding the subtitle text of each row.
SUBTITLE_TEXT_COLUMN = 'text' # <--- CHANGE THIS if your column name is different

# Upper bound, in characters, on a generated overview.
MAX_OVERVIEW_CHARS = 1000
# Default number of keywords extracted per movie.
NUM_KEYWORDS = 10

# --- Download necessary NLTK data (if not already downloaded) ---
def _ensure_nltk_resource(resource_path, package, extra_help=()):
    """Verify that an NLTK data package is available, downloading it if missing.

    Args:
        resource_path: Path handed to ``nltk.data.find`` (e.g. 'tokenizers/punkt').
        package: Identifier handed to ``nltk.download`` (e.g. 'punkt').
        extra_help: Additional troubleshooting lines printed when the resource
            is still missing after the download attempt.

    Exits the interpreter through ``sys.exit`` when the resource cannot be
    located even after the download attempt.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print(f"Downloading NLTK '{package}'...")
        nltk.download(package, quiet=True)
    else:
        print(f"NLTK '{package}' resource found.")
        return

    # Re-check after the download: nltk.download() reports failure by
    # returning False rather than raising, so look the resource up again.
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print("\n--- !!! ERROR !!! ---")
        print(f"Failed to download or locate the NLTK '{package}' resource even after download attempt.")
        for help_line in extra_help:
            print(help_line)
        print("Exiting script.")
        print("---------------------\n")
        sys.exit(f"Required NLTK resource '{package}' missing.")
    print(f"'{package}' downloaded successfully.")


_NLTK_TROUBLESHOOTING = (
    "Please check your internet connection and NLTK setup.",
    "You might need to manually download NLTK data (e.g., nltk.download('all')).",
    "See: https://www.nltk.org/data.html",
)

_ensure_nltk_resource('tokenizers/punkt', 'punkt', _NLTK_TROUBLESHOOTING)
# Recent NLTK releases load the sentence/word tokenizer tables from
# 'punkt_tab'; without it sent_tokenize/word_tokenize raise LookupError for
# every document even though 'punkt' itself is present.
_ensure_nltk_resource('tokenizers/punkt_tab', 'punkt_tab', _NLTK_TROUBLESHOOTING)
_ensure_nltk_resource('corpora/stopwords', 'stopwords')

# Define STOP_WORDS *after* ensuring NLTK data is available
try:
    STOP_WORDS = set(stopwords.words('english'))
except Exception as e:
    # Deliberately best-effort: the analysis still runs without stopwords,
    # only the keyword quality degrades.
    print("\n--- !!! ERROR !!! ---")
    print(f"Failed to load NLTK stopwords. Error: {e}")
    print("Proceeding without stopwords, keyword quality may be affected.")
    print("---------------------\n")
    STOP_WORDS = set() # Use empty set if loading fails

# --- Genre Keywords (Heuristic - Expand as needed) ---
# Maps a genre name to the lowercase keywords that count as evidence for it in
# identify_genres_heuristic(). Single-word entries are matched against token
# counts; entries containing a space (e.g. 'time travel') are counted as
# substrings of the cleaned text and weighted double. Genres with an empty
# list are skipped. Some keywords appear under several genres (e.g. 'love',
# 'escape', 'story', 'life', 'science') and therefore score for each of them.
GENRE_KEYWORDS = {
    'Action': ['fight', 'gun', 'chase', 'explosion', 'kill', 'attack', 'shoot', 'run', 'escape', 'mission', 'weapon', 'battle', 'war'],
    'Comedy': ['funny', 'laugh', 'joke', 'haha', 'stupid', 'crazy', 'hilarious', 'idiot', 'comedian', 'fun', 'silly'],
    'Drama': ['sad', 'cry', 'feelings', 'sorry', 'relationship', 'family', 'life', 'death', 'serious', 'story', 'love', 'lost', 'hope'],
    'Sci-Fi': ['space', 'alien', 'robot', 'future', 'planet', 'ship', 'laser', 'time travel', 'science', 'galaxy', 'android', 'tech', 'technology', 'ai'],
    'Horror': ['scared', 'fear', 'ghost', 'monster', 'scream', 'blood', 'die', 'haunted', 'terror', 'nightmare', 'killer', 'dark', 'evil'],
    'Romance': ['love', 'kiss', 'heart', 'date', 'beautiful', 'together', 'forever', 'darling', 'marry', 'sweet', 'couple', 'wedding'],
    'Thriller': ['suspense', 'danger', 'nervous', 'plot', 'secret', 'mystery', 'trap', 'risk', 'threat', 'tense', 'escape', 'spy', 'agent'],
    'Fantasy': ['magic', 'wizard', 'dragon', 'sword', 'kingdom', 'elf', 'quest', 'myth', 'legend', 'creature', 'king', 'queen', 'prince', 'princess'],
    'Animation': [], # No keywords: hard to detect from dialogue alone, so never reported
    'Documentary': ['real', 'story', 'life', 'world', 'people', 'history', 'fact', 'interview', 'evidence', 'nature', 'science'] # Also difficult to detect from text
}
# Add more genres and keywords for better accuracy

# --- Helper Functions ---

def preprocess_text(text):
    """Normalize raw subtitle text into lowercase alphanumeric words.

    Strips SRT cue timings, bracketed timestamps, markup tags, upper-case
    speaker labels at the start of a line, bracketed annotations containing a
    colon, music symbols and all remaining punctuation, then collapses
    whitespace.

    Args:
        text: Raw subtitle text. Any non-string value is treated as empty.

    Returns:
        The cleaned text: lowercase ASCII letters and digits separated by
        single spaces, or "" when nothing is left.
    """
    if not isinstance(text, str):
        return ""
    # Remove timestamps like 00:00:15,203 --> 00:00:18,163
    text = re.sub(r'\d{1,2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{1,2}:\d{2}:\d{2},\d{3}', '', text)
    # Remove simple timestamps like [00:15] or {00:15} (seconds optional)
    text = re.sub(r'[\[\{\(]\s*\d{1,2}:\d{2}(:\d{2})?\s*[\]\}\)]', '', text)
    # Remove typical subtitle formatting like <i>, <b>, font tags etc.
    text = re.sub(r'<[/ BUI]?.*?>', '', text, flags=re.IGNORECASE)
    # Remove upper-case speaker labels such as "MAN:" at the start of a line.
    # This has to run BEFORE lowercasing, otherwise the case-sensitive class
    # can never match. Only spaces/tabs are allowed inside the label so it
    # cannot swallow a preceding all-caps line.
    text = re.sub(r'^[ \t]*[A-Z][A-Z \t]*:', '', text, flags=re.MULTILINE)
    # Remove bracketed annotations that contain a colon, e.g. [MAN: sighs]
    text = re.sub(r'[\[\{\(][^\]\}\)]*?:.*?[\]\}\)]', '', text)
    text = text.lower()
    # Remove music notes or symbols if present
    text = re.sub(r'[♪♫#*]', '', text)
    # Turn runs like '---' or '===' into a word break instead of gluing words
    text = re.sub(r'[-=]{2,}', ' ', text)
    # General cleaning: remove remaining non-alphanumeric, non-space chars
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def generate_overview_extractive(full_text, max_chars=MAX_OVERVIEW_CHARS):
    """Build an extractive overview from the highest-scoring sentences.

    Every sentence is scored by summing the document-wide frequency of its
    non-stopword tokens. The best-scoring sentences (at most 20, and at most
    about half of the document) are emitted in their original order until the
    character budget is used up.

    Args:
        full_text: Raw subtitle text of one movie.
        max_chars: Maximum length of the returned overview.

    Returns:
        The overview string. "" for missing/very short input; the leading
        ``max_chars`` characters of the raw text if sentence tokenization
        fails.
    """
    if not isinstance(full_text, str) or len(full_text.strip()) < 10:
        return ""

    try:
        sentences = sent_tokenize(full_text)
    except LookupError as missing_resource:
        # A required NLTK data package is absent: fall back to the raw text.
        print("\n--- NLTK Tokenizer Warning ---")
        print(f"Sentence tokenization failed for an entry due to missing NLTK resource: {missing_resource}")
        print("This might happen if the 'punkt' download is incomplete or corrupted.")
        print("Skipping summary generation for this entry.")
        print("Consider running 'nltk.download(\"punkt\", force=True)' or 'nltk.download(\"all\")' manually.")
        print("----------------------------\n")
        return full_text[:max_chars].strip()
    except Exception as error:
        print(f"Warning: Sentence tokenization failed unexpectedly. Error: {error}. Skipping summary.")
        return full_text[:max_chars].strip()

    if not sentences:
        return ""

    cleaned_document = preprocess_text(full_text)
    if not cleaned_document:
        # Cleaning removed everything: return the start of the first sentence.
        return sentences[0][:max_chars]

    frequencies = Counter(
        token
        for token in word_tokenize(cleaned_document)
        if len(token) > 1 and token not in STOP_WORDS
    )
    if not frequencies:
        # No significant words to score with.
        return sentences[0][:max_chars]

    # One score per sentence, position-aligned with `sentences`.
    scores = [
        sum(
            frequencies[token]
            for token in word_tokenize(preprocess_text(candidate))
            if token in frequencies
        )
        for candidate in sentences
    ]

    # With at least one sentence this is always >= 1.
    k = min(len(sentences) // 2 + 1, 20, len(sentences))

    try:
        # Pick the k best positions, then restore document order.
        chosen = sorted(heapq.nlargest(k, range(len(scores)), key=scores.__getitem__))
    except Exception as error:
        print(f"Warning: Error during sentence selection for summary: {error}. Using first sentences.")
        chosen = list(range(min(k, len(sentences))))

    pieces = []
    used = 0
    for position in chosen:
        candidate = sentences[position].strip()
        if not candidate:
            continue
        cost = len(candidate) + (1 if pieces else 0)  # +1 for the joining space
        if used + cost > max_chars:
            if not pieces:
                # Even the first sentence is too long: keep a truncated copy.
                pieces.append(candidate[:max_chars])
            break
        pieces.append(candidate)
        used += cost

    summary = " ".join(pieces)
    if not summary and sentences:
        summary = sentences[0][:max_chars].strip()

    return summary.strip()[:max_chars]


def _keywords_by_frequency(processed_text, num_keywords):
    """Return the most common tokens that are not stopwords and are longer than two characters."""
    tokens = word_tokenize(processed_text)
    counts = Counter(w for w in tokens if w not in STOP_WORDS and len(w) > 2)
    return [word for word, _ in counts.most_common(num_keywords)]


def extract_keywords_tfidf(full_text, num_keywords=NUM_KEYWORDS):
    """Extract the top keywords (unigrams and bigrams) of one document.

    The cleaned text is ranked with ``TfidfVectorizer``. Because the
    vectorizer is fitted on this single document, every term has the same IDF,
    so the ranking is effectively by normalized term frequency. Texts shorter
    than five words, or texts on which the vectorizer raises ``ValueError``
    (e.g. only stop words), fall back to a plain word-frequency count.

    Args:
        full_text: Raw subtitle text of one movie.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A list of at most ``num_keywords`` keywords, best first. An empty list
        for missing input, a non-positive ``num_keywords`` or any unexpected
        error; this function never raises.
    """
    try:
        if not full_text or not isinstance(full_text, str):
            return []
        # Guard explicitly: slicing with a zero count would otherwise return
        # every feature instead of none.
        if num_keywords <= 0:
            return []

        processed_text = preprocess_text(full_text)
        if len(processed_text.split()) < 5:
            # Too little text for the vectorizer to be meaningful.
            return _keywords_by_frequency(processed_text, num_keywords)

        try:
            # min_df must stay 1 when fitting on a single document.
            vectorizer = TfidfVectorizer(stop_words='english',
                                         ngram_range=(1, 2),
                                         max_features=2000,
                                         min_df=1)
            tfidf_matrix = vectorizer.fit_transform([processed_text])
        except ValueError as e:
            print(f"Warning: TF-IDF keyword extraction failed. Error: {e}. Falling back to word count.")
            # Still inside the outer try, so a tokenizer failure in the
            # fallback is reported below instead of escaping to the caller.
            return _keywords_by_frequency(processed_text, num_keywords)

        feature_names = vectorizer.get_feature_names_out()
        if len(feature_names) == 0:
            return _keywords_by_frequency(processed_text, num_keywords)

        scores = tfidf_matrix.toarray().flatten()
        top_indices = scores.argsort()[::-1][:num_keywords]
        return [feature_names[i] for i in top_indices]
    except Exception as e:
        print(f"An unexpected error occurred during keyword extraction: {e}")
        return [] # Return empty list on unexpected error


def identify_genres_heuristic(full_text, threshold_multiplier=0.0005, min_keyword_matches=3):
    """Guess up to three genres from how often genre keywords occur.

    For every genre in ``GENRE_KEYWORDS`` the keyword occurrences in the
    cleaned text are summed (multi-word keywords are counted as substrings and
    weighted double) and divided by the total token count.

    Args:
        full_text: Raw subtitle text of one movie.
        threshold_multiplier: Per-keyword factor; a genre with at least
            ``min_keyword_matches`` hits must score above
            ``len(keywords) * threshold_multiplier``.
        min_keyword_matches: Number of hits from which the threshold applies.

    Returns:
        One to three genre names, best first, or ``["Unknown"]`` when nothing
        matches, the input is empty, or an error occurs. Never raises.
    """
    try:
        if not full_text or not isinstance(full_text, str):
            return ["Unknown"]

        processed_text = preprocess_text(full_text)
        if not processed_text:
            return ["Unknown"]

        words = word_tokenize(processed_text)
        total_words = len(words)
        if total_words == 0:
            return ["Unknown"]

        word_counts = Counter(words)

        genre_scores = {}
        for genre, keywords in GENRE_KEYWORDS.items():
            if not keywords:
                continue
            score = 0
            matches = 0
            for keyword in keywords:
                if ' ' in keyword:
                    # Phrases cannot be looked up per token; count substrings.
                    occurrences = processed_text.count(keyword)
                    weight = 2
                else:
                    occurrences = word_counts[keyword]  # Counter yields 0 if absent
                    weight = 1
                score += occurrences * weight
                matches += occurrences

            if matches == 0:
                continue

            normalized_score = score / total_words
            if matches >= min_keyword_matches:
                passes_threshold = normalized_score > len(keywords) * threshold_multiplier
            else:
                # Genres with few but present keywords are accepted as-is.
                # NOTE(review): this lets a genre with 1-2 hits pass while one
                # with more hits can fail the threshold - confirm intended.
                passes_threshold = True

            if passes_threshold:
                genre_scores[genre] = normalized_score

        if not genre_scores:
            return ["Unknown"]

        sorted_genres = sorted(genre_scores.items(), key=lambda item: item[1], reverse=True)
        top_score = sorted_genres[0][1]

        # A third genre is reported only if the THIRD score is close enough to
        # the best one. Testing the second score here would make the
        # two-genre branch unreachable whenever three candidates exist.
        if len(sorted_genres) >= 3 and sorted_genres[2][1] > top_score * 0.4:
            num_genres_to_return = 3
        elif len(sorted_genres) >= 2 and sorted_genres[1][1] > top_score * 0.5:
            num_genres_to_return = 2
        else:
            num_genres_to_return = 1

        return [genre for genre, _ in sorted_genres[:num_genres_to_return]]
    except Exception as e:
        print(f"An unexpected error occurred during genre identification: {e}")
        return ["Unknown"]


# --- Main Processing Function ---

def process_subtitle_file(input_path, output_path, id_col, text_col):
    """Read a subtitle CSV, analyze every movie and save the results as CSV.

    Rows are grouped by ``id_col`` and their ``text_col`` values are joined
    with spaces into one text per movie. For each movie an overview, keywords
    and genres are computed; a failure for one movie is reported and recorded
    as an error row without stopping the run.

    Args:
        input_path: Path of the CSV to read.
        output_path: Path the result CSV is written to.
        id_col: Name of the column identifying the movie. It is also used as
            the name of the identifier column in the result.
        text_col: Name of the column holding the subtitle text.

    Returns:
        A DataFrame with the columns ``id_col``, 'overview', 'keywords' and
        'genres', or None if the file cannot be read, a required column is
        missing, or no movie remains after grouping. A failure to write the
        output file is only reported; the DataFrame is still returned.
    """
    print(f"Reading CSV file: {input_path}...")
    try:
        df = pd.read_csv(input_path, on_bad_lines='warn', engine='python') # Try python engine for flexibility
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_path}")
        return None
    except Exception as e:
        print(f"Error reading CSV: {e}")
        # The default encoding failed; latin-1 can decode any byte sequence.
        try:
            print("Attempting to read CSV with latin-1 encoding...")
            df = pd.read_csv(input_path, on_bad_lines='warn', encoding='latin-1', engine='python')
        except Exception as e2:
            print(f"Error reading CSV with latin-1 encoding as well: {e2}")
            return None

    if id_col not in df.columns or text_col not in df.columns:
        print(f"Error: Missing required columns '{id_col}' or '{text_col}' in the CSV.")
        print(f"Available columns: {df.columns.tolist()}")
        return None

    print(f"Initial rows loaded: {len(df)}")
    df.dropna(subset=[id_col, text_col], inplace=True)
    print(f"Rows after dropping NA in key columns: {len(df)}")

    # Non-string cells (e.g. numbers) must become text before joining.
    df[text_col] = df[text_col].apply(lambda x: str(x) if pd.notnull(x) else '')

    print(f"Combining subtitles for each '{id_col}'...")
    try:
        combined_df = df.groupby(id_col)[text_col].agg(' '.join).reset_index()
        combined_df.rename(columns={text_col: 'full_subtitles'}, inplace=True)
    except Exception as e:
        print(f"Error during grouping and combining subtitles: {e}")
        return None

    total = len(combined_df)
    print(f"Found {total} unique movies.")
    if total == 0:
        print("No movie data found after grouping. Check your ID column and data.")
        return None

    print("Analyzing subtitles (this may take a while)...")

    movies = zip(combined_df[id_col], combined_df['full_subtitles'])
    # Track explicitly whether the progress bar is active instead of probing
    # sys.modules, which also reports tqdm when it was imported elsewhere.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        print("Optional dependency 'tqdm' not found. Progress bar disabled. Install with: pip install tqdm")
        use_progress_bar = False
    else:
        movies = tqdm(movies, total=total, desc="Analyzing Movies")
        use_progress_bar = True
    progress_interval = max(1, total // 20)

    results = []
    for processed_count, (movie_id, full_text) in enumerate(movies, start=1):
        # Keep one bad movie from aborting the whole run.
        try:
            overview = generate_overview_extractive(full_text, MAX_OVERVIEW_CHARS)
            keywords = extract_keywords_tfidf(full_text, NUM_KEYWORDS)
            genres = identify_genres_heuristic(full_text)
        except Exception as e:
            print(f"\n--- ERROR processing movie ID: {movie_id} ---")
            print(f"An unexpected error occurred: {e}")
            print("Skipping analysis for this movie.")
            print("----------------------------------------------\n")
            overview = "Analysis Error"
            keywords = []
            genres = ["Error"]

        results.append({
            # Key by the id column that was actually requested, not by the
            # module-level default.
            id_col: movie_id,
            'overview': overview,
            'keywords': ', '.join(keywords),
            'genres': ', '.join(genres),
        })

        if not use_progress_bar and (
            processed_count % progress_interval == 0 or processed_count == total
        ):
            print(f"  Processed {processed_count}/{total} movies...")

    print("Analysis complete.")

    if not results:
        print("Warning: No results were generated from the analysis.")
        return None

    output_df = pd.DataFrame(results)

    try:
        output_df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"Results saved to: {output_path}")
    except Exception as e:
        print(f"Error saving results to CSV: {e}")

    return output_df

# --- Execution ---
if __name__ == "__main__":
    # Run the full pipeline with the module-level configuration.
    processed_data = process_subtitle_file(
        input_path=INPUT_CSV_FILE,
        output_path=OUTPUT_CSV_FILE,
        id_col=MOVIE_ID_COLUMN,
        text_col=SUBTITLE_TEXT_COLUMN,
    )

    if processed_data is None:
        print("Processing failed or produced no results.")
    else:
        print("\n--- Sample Output (First 5 rows) ---")
        # Widen the column display so long overviews/keywords stay readable.
        with pd.option_context('display.max_colwidth', 100):
            preview = processed_data.head()
            print(preview)
        print("\n-------------------------------------")

NLTK 'punkt' resource found.
NLTK 'stopwords' resource found.
Reading CSV file: movies_subtitles.csv...



  df = pd.read_csv(input_path, on_bad_lines='warn', engine='python') # Try python engine for flexibility


Initial rows loaded: 6942460
Rows after dropping NA in key columns: 6936991
Combining subtitles for each 'imdb_id'...
Found 3124 unique movies.
Analyzing subtitles (this may take a while)...


Analyzing Movies:   0%|          | 0/3124 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

This might happen if the 'punkt' download is incomplete or corrupted.
Skipping summary generation for this entry.
Consider running 'nltk.download("punkt", force=True)' or 'nltk.download("all")' manually.
----------------------------

An unexpected error occurred during genre identification: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/engl

In [None]:
# -*- coding: utf-8 -*-
# Add encoding declaration for potentially non-ascii characters in comments/code

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import heapq # For efficient summarization sentence selection
import sys # To potentially exit if downloads fail

# --- Configuration ---
# Same settings as the previous cell, repeated so this cell runs on its own.
# Input CSV with the subtitle rows, and the CSV the analysis is written to.
INPUT_CSV_FILE = 'movies_subtitles.csv'
OUTPUT_CSV_FILE = 'movie_analysis_output.csv'
# Column names for the movie identifier and for the subtitle text.
MOVIE_ID_COLUMN = 'imdb_id'
SUBTITLE_TEXT_COLUMN = 'text'
# Default overview length limit (characters) and keyword count per movie.
MAX_OVERVIEW_CHARS = 1000
NUM_KEYWORDS = 10


# --- Download necessary NLTK data (if not already downloaded) ---
def _require_nltk_resource(resource_path, package, download_notice, failure_details):
    """Ensure an NLTK data package is present, downloading it when missing.

    Looks the resource up, downloads ``package`` if the lookup fails, and
    looks it up again. If it is still missing, prints ``failure_details``
    inside an error banner and terminates through ``sys.exit``.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        pass
    else:
        print(f"NLTK '{package}' resource found.")
        return

    print(download_notice)
    nltk.download(package, quiet=True)
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print("\n--- !!! ERROR !!! ---")
        for detail in failure_details:
            print(detail)
        print("---------------------\n")
        sys.exit(f"Required NLTK resource '{package}' missing.")
    print(f"'{package}' downloaded successfully.")


_require_nltk_resource(
    'tokenizers/punkt',
    'punkt',
    "Downloading NLTK 'punkt' tokenizer...",
    (
        "Failed to download or locate the NLTK 'punkt' resource even after download attempt.",
        "Please check your internet connection and NLTK setup.",
        "You might need to manually download NLTK data (e.g., nltk.download('all')).",
        "See: https://www.nltk.org/data.html",
        "Exiting script.",
    ),
)

_require_nltk_resource(
    'corpora/stopwords',
    'stopwords',
    "Downloading NLTK 'stopwords'...",
    (
        "Failed to download or locate the NLTK 'stopwords' resource.",
        "Exiting script.",
    ),
)

# 'punkt_tab' is the resource the tokenizer reported as missing at runtime.
_require_nltk_resource(
    'tokenizers/punkt_tab',
    'punkt_tab',
    "Downloading NLTK 'punkt_tab' resource (needed by 'punkt' tokenizer for some cases)...",
    (
        "Failed to download or locate the NLTK 'punkt_tab' resource even after download attempt.",
        "This resource is sometimes required by the 'punkt' tokenizer.",
        "Please check your internet connection and NLTK setup.",
        "If the issue persists, you might need to investigate NLTK's data directory or try 'nltk.download(\"all\")'.",
        "Exiting script as tokenization might fail unpredictably.",
    ),
)


# STOP_WORDS can only be built once the corpus above is known to be present.
try:
    STOP_WORDS = set(stopwords.words('english'))
except Exception as load_error:
    # Best effort: continue with no stopword filtering.
    STOP_WORDS = set()
    print("\n--- !!! ERROR !!! ---")
    print(f"Failed to load NLTK stopwords. Error: {load_error}")
    print("Proceeding without stopwords, keyword quality may be affected.")
    print("---------------------\n")

# --- Genre Keywords (Heuristic - Expand as needed) ---
# Maps a genre name to the lowercase keywords that count as evidence for it in
# identify_genres_heuristic(). Entries containing a space (e.g. 'time travel')
# are counted as substrings of the cleaned text and weighted double; genres
# with an empty list are skipped. Several keywords are shared between genres.
GENRE_KEYWORDS = {
    'Action': ['fight', 'gun', 'chase', 'explosion', 'kill', 'attack', 'shoot', 'run', 'escape', 'mission', 'weapon', 'battle', 'war'],
    'Comedy': ['funny', 'laugh', 'joke', 'haha', 'stupid', 'crazy', 'hilarious', 'idiot', 'comedian', 'fun', 'silly'],
    'Drama': ['sad', 'cry', 'feelings', 'sorry', 'relationship', 'family', 'life', 'death', 'serious', 'story', 'love', 'lost', 'hope'],
    'Sci-Fi': ['space', 'alien', 'robot', 'future', 'planet', 'ship', 'laser', 'time travel', 'science', 'galaxy', 'android', 'tech', 'technology', 'ai'],
    'Horror': ['scared', 'fear', 'ghost', 'monster', 'scream', 'blood', 'die', 'haunted', 'terror', 'nightmare', 'killer', 'dark', 'evil'],
    'Romance': ['love', 'kiss', 'heart', 'date', 'beautiful', 'together', 'forever', 'darling', 'marry', 'sweet', 'couple', 'wedding'],
    'Thriller': ['suspense', 'danger', 'nervous', 'plot', 'secret', 'mystery', 'trap', 'risk', 'threat', 'tense', 'escape', 'spy', 'agent'],
    'Fantasy': ['magic', 'wizard', 'dragon', 'sword', 'kingdom', 'elf', 'quest', 'myth', 'legend', 'creature', 'king', 'queen', 'prince', 'princess'],
    'Animation': [],
    'Documentary': ['real', 'story', 'life', 'world', 'people', 'history', 'fact', 'interview', 'evidence', 'nature', 'science']
}

# --- Helper Functions ---

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\d{1,2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{1,2}:\d{2}:\d{2},\d{3}', '', text)
    text = re.sub(r'[\[\{\(]\s*\d{1,2}:\d{2}(:\d{2})?\s*[\]\}\)]', '', text)
    text = re.sub(r'<[/ BUI]?.*?>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'^[\s\t]*[A-Z\s]+:', '', text, flags=re.MULTILINE)
    text = re.sub(r'[\[\{\(][^\]\}\)]*?:.*?[\]\}\)]', '', text)
    text = re.sub(r'[♪♫#*]', '', text)
    text = re.sub(r'[-=]{2,}', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def generate_overview_extractive(full_text, max_chars=MAX_OVERVIEW_CHARS):
    """Build an extractive overview from the highest-scoring sentences.

    Sentences are scored by the summed document-wide frequency of their
    non-stopword tokens; the best ones (at most 20, and at most about half of
    the document) are emitted in original order within ``max_chars``.

    Returns "" for missing or very short input, and the leading ``max_chars``
    characters of the raw text when sentence tokenization fails.
    """
    if not isinstance(full_text, str) or len(full_text.strip()) < 10:
        return ""

    try:
        sentences = sent_tokenize(full_text)
    except LookupError as missing_resource:
        print("\n--- NLTK Tokenizer Warning (generate_overview_extractive) ---")
        print(f"Sentence tokenization failed for an entry due to missing NLTK resource: {missing_resource}")
        print("This might happen if an NLTK download (e.g. 'punkt' or 'punkt_tab') is incomplete or corrupted.")
        print("Skipping summary generation for this entry.")
        print("Ensure all required NLTK data was downloaded at script start.")
        print("--------------------------------------------------------------\n")
        return full_text[:max_chars].strip()
    except Exception as error:
        print(f"Warning: Sentence tokenization failed unexpectedly in generate_overview_extractive. Error: {error}. Skipping summary.")
        return full_text[:max_chars].strip()

    if not sentences:
        return ""

    cleaned_document = preprocess_text(full_text)
    if not cleaned_document:
        # Cleaning removed everything: return the start of the first sentence.
        return sentences[0][:max_chars]

    frequencies = Counter(
        token
        for token in word_tokenize(cleaned_document)
        if len(token) > 1 and token not in STOP_WORDS
    )
    if not frequencies:
        return sentences[0][:max_chars]

    # One score per sentence, position-aligned with `sentences`.
    scores = [
        sum(
            frequencies[token]
            for token in word_tokenize(preprocess_text(candidate))
            if token in frequencies
        )
        for candidate in sentences
    ]

    # With at least one sentence this is always >= 1.
    k = min(len(sentences) // 2 + 1, 20, len(sentences))

    try:
        # Pick the k best positions, then restore document order.
        chosen = sorted(heapq.nlargest(k, range(len(scores)), key=scores.__getitem__))
    except Exception as error:
        print(f"Warning: Error during sentence selection for summary: {error}. Using first sentences.")
        chosen = list(range(min(k, len(sentences))))

    pieces = []
    used = 0
    for position in chosen:
        candidate = sentences[position].strip()
        if not candidate:
            continue
        cost = len(candidate) + (1 if pieces else 0)  # +1 for the joining space
        if used + cost > max_chars:
            if not pieces:
                # Even the first sentence is too long: keep a truncated copy.
                pieces.append(candidate[:max_chars])
            break
        pieces.append(candidate)
        used += cost

    summary = " ".join(pieces)
    if not summary and sentences:
        summary = sentences[0][:max_chars].strip()
    return summary.strip()[:max_chars]

def _keywords_by_frequency(processed_text, num_keywords):
    """Return the most common tokens that are not stopwords and are longer than two characters."""
    tokens = word_tokenize(processed_text)
    counts = Counter(w for w in tokens if w not in STOP_WORDS and len(w) > 2)
    return [word for word, _ in counts.most_common(num_keywords)]


def extract_keywords_tfidf(full_text, num_keywords=NUM_KEYWORDS):
    """Extract the top keywords (unigrams and bigrams) of one document.

    The cleaned text is ranked with ``TfidfVectorizer``; as it is fitted on a
    single document, all terms share one IDF and the ranking is effectively
    by normalized term frequency. Texts shorter than five words, or texts on
    which the vectorizer raises ``ValueError``, fall back to a plain
    word-frequency count.

    Args:
        full_text: Raw subtitle text of one movie.
        num_keywords: Maximum number of keywords to return.

    Returns:
        A list of at most ``num_keywords`` keywords, best first. An empty list
        for missing input, a non-positive ``num_keywords``, a missing NLTK
        resource or any unexpected error; this function never raises.
    """
    try:
        if not full_text or not isinstance(full_text, str):
            return []
        # Guard explicitly: slicing with a zero count would otherwise return
        # every feature instead of none.
        if num_keywords <= 0:
            return []

        processed_text = preprocess_text(full_text)
        if len(processed_text.split()) < 5:
            return _keywords_by_frequency(processed_text, num_keywords)

        try:
            # min_df must stay 1 when fitting on a single document.
            vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=2000, min_df=1)
            tfidf_matrix = vectorizer.fit_transform([processed_text])
        except ValueError as e:
            print(f"Warning: TF-IDF keyword extraction failed. Error: {e}. Falling back to word count.")
            # Still inside the outer try: a LookupError raised by the
            # tokenizer in the fallback is handled below instead of escaping.
            return _keywords_by_frequency(processed_text, num_keywords)

        feature_names = vectorizer.get_feature_names_out()
        if len(feature_names) == 0:
            return _keywords_by_frequency(processed_text, num_keywords)

        scores = tfidf_matrix.toarray().flatten()
        top_indices = scores.argsort()[::-1][:num_keywords]
        return [feature_names[i] for i in top_indices]
    except LookupError as e:
        print("\n--- NLTK Tokenizer Warning (extract_keywords_tfidf) ---")
        print(f"Word tokenization failed for an entry due to missing NLTK resource: {e}")
        print("This might happen if an NLTK download (e.g. 'punkt' or 'punkt_tab') is incomplete or corrupted.")
        print("Skipping keyword generation for this entry.")
        print("Ensure all required NLTK data was downloaded at script start.")
        print("------------------------------------------------------------\n")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during keyword extraction: {e}")
        return []

def identify_genres_heuristic(full_text, threshold_multiplier=0.0005, min_keyword_matches=3):
    """Guess up to three genres for a movie from keyword hits in its subtitles.

    The text is run through ``preprocess_text`` and ``word_tokenize``; every
    genre in ``GENRE_KEYWORDS`` is then scored. A single-word keyword adds its
    token count to the score. A keyword containing a space is treated as a
    phrase: it adds twice its substring count in the preprocessed text. The
    score is divided by the total number of tokens.

    Args:
        full_text: Combined subtitle text of one movie.
        threshold_multiplier: Multiplied by the number of keywords of a genre
            to get the minimum normalized score that genre must exceed once
            it has at least ``min_keyword_matches`` hits.
        min_keyword_matches: Number of hits from which the score threshold
            applies. A genre with fewer (but more than zero) hits is accepted
            as a candidate without the threshold check.

    Returns:
        A list of one to three genre names, best score first. A third genre is
        included when its score exceeds 40% of the best score; otherwise a
        second genre is included when its score exceeds 50% of the best score.
        ``["Unknown"]`` is returned for empty/non-string input, when no genre
        qualifies, or when tokenization or anything else fails.
    """
    try:
        if not full_text or not isinstance(full_text, str):
            return ["Unknown"]
        processed_text = preprocess_text(full_text)
        if not processed_text:
            return ["Unknown"]
        words = word_tokenize(processed_text)  # May raise LookupError (missing NLTK data)
        total_words = len(words)
        if total_words == 0:
            return ["Unknown"]
        word_counts = Counter(words)

        genre_scores = {}
        for genre, keywords_list in GENRE_KEYWORDS.items():
            if not keywords_list:
                continue
            score = 0
            matches = 0
            for keyword in keywords_list:
                if ' ' in keyword:
                    # Phrases cannot be looked up per token; count them in the
                    # text itself and weight each occurrence double.
                    hits = processed_text.count(keyword)
                    weight = 2
                else:
                    hits = word_counts.get(keyword, 0)
                    weight = 1
                score += hits * weight
                matches += hits
            normalized_score = score / total_words
            if matches >= min_keyword_matches:
                passes_threshold = normalized_score > len(keywords_list) * threshold_multiplier
            else:
                passes_threshold = matches > 0 and normalized_score > 0
            if passes_threshold:
                genre_scores[genre] = normalized_score

        if not genre_scores:
            return ["Unknown"]

        sorted_genres = sorted(genre_scores.items(), key=lambda item: item[1], reverse=True)
        top_score = sorted_genres[0][1]
        # Judge the third genre by its OWN score. Testing the second genre here
        # made the two-genre outcome unreachable whenever three or more genres
        # qualified, and let an arbitrarily weak third genre through.
        if len(sorted_genres) >= 3 and sorted_genres[2][1] > top_score * 0.4:
            num_genres_to_return = 3
        elif len(sorted_genres) >= 2 and sorted_genres[1][1] > top_score * 0.5:
            num_genres_to_return = 2
        else:
            num_genres_to_return = 1
        return [genre for genre, _ in sorted_genres[:num_genres_to_return]]
    except LookupError as e:  # Tokenizer data missing: report and degrade gracefully
        print(f"\n--- NLTK Tokenizer Warning (identify_genres_heuristic) ---")
        print(f"Word tokenization failed for an entry due to missing NLTK resource: {e}")
        print("This might happen if an NLTK download (e.g. 'punkt' or 'punkt_tab') is incomplete or corrupted.")
        print("Skipping genre identification for this entry.")
        print("Ensure all required NLTK data was downloaded at script start.")
        print("--------------------------------------------------------------\n")
        return ["Unknown"]
    except Exception as e:
        print(f"An unexpected error occurred during genre identification: {e}")
        return ["Unknown"]

# --- Main Processing Function ---
# Reads the subtitle CSV, joins each movie's subtitle lines, analyzes them, and saves one row per movie.
def process_subtitle_file(input_path, output_path, id_col, text_col):
    """Analyze every movie in a subtitle CSV and write one summary row per movie.

    Rows with a missing id or text are dropped, the remaining subtitle lines
    are joined with spaces per ``id_col`` value, and each movie's combined
    text is passed to ``generate_overview_extractive``,
    ``extract_keywords_tfidf`` and ``identify_genres_heuristic``.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the result CSV is written to (UTF-8, no index).
        id_col: Name of the column identifying the movie. The id column of
            the result carries the same name.
        text_col: Name of the column holding the subtitle text.

    Returns:
        A DataFrame with the columns ``id_col``, ``'overview'``,
        ``'keywords'`` and ``'genres'`` (the last two as comma-separated
        strings), or ``None`` when the file cannot be read, a required column
        is missing, grouping fails, or no movie remains. A failure to write
        ``output_path`` is reported but the DataFrame is still returned.
    """
    print(f"Reading CSV file: {input_path}...")
    try:
        df = pd.read_csv(input_path, on_bad_lines='warn', engine='python')
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_path}")
        return None
    except Exception as e:
        print(f"Error reading CSV: {e}")
        # Second chance: latin-1 can decode any byte sequence.
        try:
            print("Attempting to read CSV with latin-1 encoding...")
            df = pd.read_csv(input_path, on_bad_lines='warn', encoding='latin-1', engine='python')
        except Exception as e2:
            print(f"Error reading CSV with latin-1 encoding as well: {e2}")
            return None

    if id_col not in df.columns or text_col not in df.columns:
        print(f"Error: Missing required columns '{id_col}' or '{text_col}' in the CSV.")
        print(f"Available columns: {df.columns.tolist()}")
        return None

    print(f"Initial rows loaded: {len(df)}")
    df.dropna(subset=[id_col, text_col], inplace=True)
    print(f"Rows after dropping NA in key columns: {len(df)}")
    # ' '.join below needs strings; nulls were already removed above.
    df[text_col] = df[text_col].map(str)

    print(f"Combining subtitles for each '{id_col}'...")
    try:
        combined_df = df.groupby(id_col)[text_col].agg(' '.join).reset_index()
        combined_df.rename(columns={text_col: 'full_subtitles'}, inplace=True)
    except Exception as e:
        print(f"Error during grouping and combining subtitles: {e}")
        return None

    total_movies = len(combined_df)
    print(f"Found {total_movies} unique movies.")
    if total_movies == 0:
        print("No movie data found after grouping. Check your ID column and data.")
        return None

    print("Analyzing subtitles (this may take a while)...")
    movie_rows = zip(combined_df[id_col], combined_df['full_subtitles'])
    try:
        from tqdm.auto import tqdm
    except ImportError:
        tqdm = None
        print("Optional dependency 'tqdm' not found. Progress bar disabled. Install with: pip install tqdm")
    # Decide on the progress style from the import result itself rather than
    # from sys.modules, which may hold a 'tqdm' entry even if the import failed.
    use_progress_bar = tqdm is not None
    if use_progress_bar:
        iterator = tqdm(movie_rows, total=total_movies, desc="Analyzing Movies")
    else:
        iterator = movie_rows
    progress_interval = max(1, total_movies // 20)

    results = []
    for processed_count, (movie_id, full_text) in enumerate(iterator, start=1):
        try:
            overview = generate_overview_extractive(full_text, MAX_OVERVIEW_CHARS)
            keywords = extract_keywords_tfidf(full_text, NUM_KEYWORDS)
            genres = identify_genres_heuristic(full_text)
        except Exception as e:  # Per-movie safety net: one bad entry must not stop the run
            print(f"\n--- ERROR processing movie ID: {movie_id} ---")
            print(f"An unexpected error occurred: {e}")
            print("Skipping analysis for this movie.")
            print("----------------------------------------------\n")
            overview = "Analysis Error"
            keywords = []
            genres = ["Error"]
        results.append({
            # Use the caller's column name, not the module-level default,
            # so the output matches the input for any id_col.
            id_col: movie_id,
            'overview': overview,
            'keywords': ', '.join(keywords),
            'genres': ', '.join(genres),
        })
        if not use_progress_bar:
            if processed_count % progress_interval == 0 or processed_count == total_movies:
                print(f"  Processed {processed_count}/{total_movies} movies...")

    print("Analysis complete.")
    if not results:
        print("Warning: No results were generated from the analysis.")
        return None

    output_df = pd.DataFrame(results)
    try:
        output_df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"Results saved to: {output_path}")
    except Exception as e:
        print(f"Error saving results to CSV: {e}")
    return output_df

# --- Execution ---
if __name__ == "__main__":
    # Run the whole subtitle analysis with the configured file and column names.
    processed_data = process_subtitle_file(
        input_path=INPUT_CSV_FILE,
        output_path=OUTPUT_CSV_FILE,
        id_col=MOVIE_ID_COLUMN,
        text_col=SUBTITLE_TEXT_COLUMN,
    )
    if processed_data is None:
        print("Processing failed or produced no results.")
    else:
        print("\n--- Sample Output (First 5 rows) ---")
        # Show up to 100 characters per cell for this preview only.
        with pd.option_context('display.max_colwidth', 100):
            print(processed_data.head())
        print("\n-------------------------------------")

NLTK 'punkt' resource found.
NLTK 'stopwords' resource found.
NLTK 'punkt_tab' resource found.
Reading CSV file: movies_subtitles.csv...
Initial rows loaded: 10358496
Rows after dropping NA in key columns: 10346661
Combining subtitles for each 'imdb_id'...
Found 4665 unique movies.
Analyzing subtitles (this may take a while)...


Analyzing Movies:   0%|          | 0/4665 [00:00<?, ?it/s]

Analysis complete.
Results saved to: movie_analysis_output.csv

--- Sample Output (First 5 rows) ---
     imdb_id  \
0  tt0002130   
1  tt0005044   
2  tt0005059   
3  tt0005060   
4  tt0007338   

                                                                                              overview  \
0  I'll never get to sleep. Hold the line... <i>Well,</i> <i>you know what to expect now.</i> She d...   
1  Don't pretend you don't know\nwhat I'm talking about. I don't know what the\nfuck you're talking...   
2  Whether you got a white skate on one\nfoot and a black on the other it's... What's the... what's...   
3  Αnd then we'll follow you,\ndark-haired cigarette girls, and whisper words of love\nin every ear...   
4  Well with all due respect, Mr. Holmes,\nYou know John very well. - I don't think that's... - I d...   

                                                              keywords  \
0            im, ill, dont, youre, just, look, thats, paul, come, shes   
1            dont

In [None]:
import pandas as pd
import sys

# --- Configuration ---
# File names used by the merge step below; all are passed to merge_csv_files().
METADATA_FILE = 'movies_meta.csv'  # CSV with the per-movie metadata (left side of the merge)
ANALYSIS_FILE = 'movie_analysis_output.csv'  # CSV with the analysis results (right side of the merge)
OUTPUT_FILE = 'merged_movie_data.csv'  # Where the merged table is written
MERGE_COLUMN = 'imdb_id' # The common column to join on

# --- Main Merge Function ---
def _prepare_merge_frame(frame, merge_col, label):
    """Drop rows of ``frame`` whose ``merge_col`` is null and report duplicate keys."""
    missing_key = frame[merge_col].isna()
    missing_total = int(missing_key.sum())
    if missing_total:
        print(f"  Dropping {missing_total} {label} row(s) with a missing '{merge_col}'.")
        frame = frame.loc[~missing_key]
    repeated_total = int(frame[merge_col].duplicated().sum())
    if repeated_total:
        print(f"  Note: {repeated_total} repeated '{merge_col}' value(s) in {label}; "
              "each repeat adds an extra row per match to the merged output.")
    return frame


def merge_csv_files(meta_path, analysis_path, output_path, merge_col):
    """
    Inner-join a metadata CSV with an analysis CSV and save the result.

    Both files are read with ``merge_col`` forced to string so that ids keep
    their exact text (e.g. leading zeros) and compare equal across files.
    Rows without a value in ``merge_col`` are removed before joining, because
    pandas would otherwise match null keys of one file against null keys of
    the other and produce rows pairing unrelated records.

    Args:
        meta_path: Path of the metadata CSV (left side of the join).
        analysis_path: Path of the analysis CSV (right side of the join).
        output_path: Path the merged CSV is written to (UTF-8, no index).
        merge_col: Name of the column present in both files to join on.

    Returns:
        The merged DataFrame. An empty merge result is returned as is and not
        written to disk. ``None`` is returned when a file cannot be loaded,
        ``merge_col`` is missing from either file, or saving fails.
    """
    print(f"Attempting to load metadata file: {meta_path}")
    try:
        # Read metadata, treat merge column as string to avoid type issues
        df_meta = pd.read_csv(meta_path, dtype={merge_col: str})
        print(f"  Successfully loaded metadata: {len(df_meta)} rows, {len(df_meta.columns)} columns.")
    except FileNotFoundError:
        print(f"Error: Metadata file not found at '{meta_path}'")
        return None
    except Exception as e:
        print(f"Error loading metadata file '{meta_path}': {e}")
        return None

    print(f"Attempting to load analysis file: {analysis_path}")
    try:
        # Read analysis results, treat merge column as string
        df_analysis = pd.read_csv(analysis_path, dtype={merge_col: str})
        print(f"  Successfully loaded analysis results: {len(df_analysis)} rows, {len(df_analysis.columns)} columns.")
    except FileNotFoundError:
        print(f"Error: Analysis file not found at '{analysis_path}'")
        return None
    except Exception as e:
        print(f"Error loading analysis file '{analysis_path}': {e}")
        return None

    # --- Validate Merge Column ---
    if merge_col not in df_meta.columns:
        print(f"Error: Merge column '{merge_col}' not found in metadata file '{meta_path}'.")
        print(f"Available columns: {df_meta.columns.tolist()}")
        return None
    if merge_col not in df_analysis.columns:
        print(f"Error: Merge column '{merge_col}' not found in analysis file '{analysis_path}'.")
        print(f"Available columns: {df_analysis.columns.tolist()}")
        return None

    # --- Clean Merge Keys ---
    df_meta = _prepare_merge_frame(df_meta, merge_col, "metadata")
    df_analysis = _prepare_merge_frame(df_analysis, merge_col, "analysis")

    # --- Perform the Merge ---
    print(f"\nMerging the two dataframes on column: '{merge_col}'...")

    # 'inner' keeps only ids present in BOTH files. Alternatives:
    # 'left'  - all metadata rows, analysis columns NaN where unmatched.
    # 'right' - all analysis rows, metadata columns NaN where unmatched.
    # 'outer' - all rows from both, NaN where unmatched.
    merged_df = pd.merge(
        left=df_meta,
        right=df_analysis,
        on=merge_col,
        how='inner',
    )

    print(f"Merge complete. Resulting dataframe has {len(merged_df)} rows and {len(merged_df.columns)} columns.")

    if len(merged_df) == 0:
        print("Warning: The merged dataframe is empty. This means no common")
        print(f"'{merge_col}' values were found between the two files using an 'inner' merge.")
        print("Consider using 'how='left'' or 'how='outer'' if you expect unmatched rows.")
        # Hand back the empty frame without writing a file.
        return merged_df

    # --- Save the Result ---
    try:
        merged_df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"\nSuccessfully saved merged data to: {output_path}")
    except Exception as e:
        print(f"Error saving merged file to '{output_path}': {e}")
        return None

    return merged_df

# --- Execution ---
if __name__ == "__main__":
    print("--- Starting CSV Merge Script ---")
    final_df = merge_csv_files(
        meta_path=METADATA_FILE,
        analysis_path=ANALYSIS_FILE,
        output_path=OUTPUT_FILE,
        merge_col=MERGE_COLUMN,
    )

    if final_df is None:
        print("\n--- Merge Script Failed ---")
    else:
        print("\n--- Merge Script Finished ---")
        print("Showing first 5 rows of the merged data:")
        # Preview with every column shown and up to 80 characters per cell.
        preview_options = ('display.max_colwidth', 80, 'display.max_columns', None)
        with pd.option_context(*preview_options):
            print(final_df.head())

--- Starting CSV Merge Script ---
Attempting to load metadata file: movies_meta.csv
  Successfully loaded metadata: 4690 rows, 24 columns.
Attempting to load analysis file: movie_analysis_output.csv
  Successfully loaded analysis results: 3124 rows, 4 columns.

Merging the two dataframes on column: 'imdb_id'...
Merge complete. Resulting dataframe has 3128 rows and 27 columns.

Successfully saved merged data to: merged_movie_data.csv

--- Merge Script Finished ---
Showing first 5 rows of the merged data:
   adult  \
0  False   
1  False   
2  False   
3  False   
4  False   

                                                             belongs_to_collection  \
0  {'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lV...   
1                                                                              NaN   
2  {'id': 645, 'name': 'James Bond Collection', 'poster_path': '/HORpg5CSkmeQlA...   
3                                                                         

In [None]:
import pandas as pd
import sys

# --- Configuration ---
# Both values are passed to count_unique_in_column() in the execution block.
INPUT_FILE = 'merged_movie_data.csv' # The merged CSV file
COLUMN_TO_COUNT = 'homepage'        # The column containing homepage URLs

# --- Function to count unique values ---
def count_unique_in_column(filepath, column_name):
    """
    Report how many distinct non-null values one column of a CSV file holds.

    Args:
        filepath: Path of the CSV file to load.
        column_name: Name of the column to inspect.

    Returns:
        The number of distinct non-null values, or None when the file cannot
        be loaded or the column does not exist.
    """
    print(f"Attempting to load file: {filepath}")
    try:
        frame = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at '{filepath}'")
        return None
    except Exception as load_error:
        print(f"Error loading file '{filepath}': {load_error}")
        return None
    else:
        row_total, column_total = frame.shape
        print(f"  Successfully loaded file: {row_total} rows, {column_total} columns.")

    # Guard: the requested column has to exist before anything is counted.
    available_columns = frame.columns
    if column_name not in available_columns:
        print(f"Error: Column '{column_name}' not found in the file.")
        print(f"Available columns are: {available_columns.tolist()}")
        return None

    # nunique() skips nulls by default, so only real values are counted.
    # To look at the values themselves, inspect
    # frame[column_name].dropna().unique()[:10].
    distinct_total = frame[column_name].nunique()

    print(f"\nAnalysis complete for column: '{column_name}'")
    print(f"Number of unique non-null values: {distinct_total}")
    return distinct_total

# --- Execution ---
if __name__ == "__main__":
    print(f"--- Counting unique values in column '{COLUMN_TO_COUNT}' ---")
    result = count_unique_in_column(filepath=INPUT_FILE, column_name=COLUMN_TO_COUNT)

    # None signals that loading failed or the column was missing.
    if result is None:
        print("\n--- Script Failed or Column Not Found ---")
    else:
        print("\n--- Script Finished ---")

--- Counting unique values in column 'homepage' ---
Attempting to load file: merged_movie_data.csv
  Successfully loaded file: 3128 rows, 27 columns.

Analysis complete for column: 'homepage'
Number of unique non-null values: 664

--- Script Finished ---
