In [None]:
import pandas as pd
import numpy as np
import unicodedata
import re

def normalize_title(title):
    """Collapse a title into a compact, lowercase key for matching.

    Processing order:
      1. Anything that is not a ``str`` (None, NaN, pd.NA, numbers, lists,
         ...) yields "".
      2. NFKD-normalize and lowercase. NFKD only *decomposes* accented
         letters into base letter + combining mark ("é" -> "e" + U+0301);
         the marks themselves are dropped in step 5.
      3. Drop one leading English article ("the", "a", "an"), ignoring any
         whitespace in front of it. The article must be followed by
         whitespace, so "Another" and a bare "A" are left alone.
      4. Delete "(...)" and "[...]" groups together with their contents.
      5. Delete every remaining character that is not a letter or digit:
         whitespace, punctuation, hyphens, underscores, apostrophes and
         combining marks.

    Examples:
        "The Matrix"            -> "matrix"
        "Spider-Man"            -> "spiderman"
        "Avatar (2009)"         -> "avatar"
        "Don't Stop Me Now"     -> "dontstopmenow"
        "Café"                  -> "cafe"

    Args:
        title: The raw title value (typically a DataFrame cell).

    Returns:
        The normalized key, or "" for non-string input.
    """
    # NaN, None and pd.NA are not str instances, so this single check covers
    # missing values as well, and it never raises on list/array cells the way
    # a truth-test on pd.isna(...) would.
    if not isinstance(title, str):
        return ""

    title = unicodedata.normalize('NFKD', title).lower()

    # Allow leading whitespace before the article; otherwise "  The Matrix"
    # would keep its "the" because the pattern is anchored at position 0.
    title = re.sub(r'^\s*(?:the|a|an)\s+', '', title)

    # Bracketed annotations such as years or edition names are dropped whole.
    title = re.sub(r'\([^)]*\)|\[[^\]]*\]', '', title)

    # Everything that is not alphanumeric is removed in one pass. "_" counts
    # as a word character for \w, so it is listed explicitly.
    return re.sub(r'[\W_]+', '', title)

def clean_data(input_file, output_file):
    """Load an Excel sheet, clean it, write it out as CSV and return it.

    Cleaning steps:
      * drop columns in which every value is missing;
      * replace the literal string 'n/a' with NaN;
      * if a 'title' column exists, add 'title_normalized' computed with
        ``normalize_title``;
      * convert the known financial columns (when present) to numeric,
        turning values that cannot be parsed into NaN;
      * fill missing values in the known text columns (when present) with
        'Unknown'.

    Args:
        input_file: Path (or other source accepted by ``pd.read_excel``) of
            the workbook to load.
        output_file: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created first.

    Returns:
        The cleaned DataFrame that was written to ``output_file``.
    """
    import os
    from pathlib import Path

    # dropna returns a new frame, so no explicit copy of the loaded data is
    # needed before modifying it.
    df_cleaned = pd.read_excel(input_file).dropna(axis=1, how='all')

    # Replace 'n/a' with NaN
    df_cleaned = df_cleaned.replace('n/a', np.nan)

    # Normalize titles if column exists
    if 'title' in df_cleaned.columns:
        df_cleaned['title_normalized'] = df_cleaned['title'].apply(normalize_title)

    # Convert financial columns to numeric
    financial_cols = ['international_box_office', 'domestic_box_office',
                      'worldwide_box_office', 'production_budget', 'opening_weekend']
    for col in financial_cols:
        if col in df_cleaned.columns:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

    # Fill missing text values
    text_cols = ['genre', 'keywords', 'creative_type']
    for col in text_cols:
        if col in df_cleaned.columns:
            df_cleaned[col] = df_cleaned[col].fillna('Unknown')

    # to_csv refuses to write into a directory that does not exist, so make
    # sure the target folder is there. File-like targets are left untouched.
    if isinstance(output_file, (str, os.PathLike)):
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    # Save to CSV
    df_cleaned.to_csv(output_file, index=False)
    return df_cleaned

# Script entry point: clean each source workbook into its CSV counterpart.
if __name__ == "__main__":
    for workbook, csv_target in (
        ('data/sales.xlsx', 'cleanedData/sales_cleaned.csv'),
        ('data/metaClean43Brightspace.xlsx', 'cleanedData/metadata_cleaned.csv'),
    ):
        clean_data(workbook, csv_target)

Cleaning release_date column...
Cleaning RelDate column...
