In [None]:
import pandas as pd
import numpy as np
import unicodedata
import re

# =============================================================================
# STEP 1: TITLE NORMALIZATION FUNCTION
# =============================================================================
# --- TASK: Define function to standardize movie titles for matching ---
def normalize_title(title):
    """Reduce a movie title to a compact key suitable for matching.

    The result is lowercase, has no leading English article, no
    parenthetical/bracketed text, no accents, no punctuation and no
    whitespace, e.g. "The Dark Knight" -> "darkknight" and
    "Avatar (2009)" -> "avatar".

    Args:
        title: The raw title. Any value that is not a ``str`` (NaN, None,
            numbers, lists, ...) is treated as missing.

    Returns:
        The normalized title, or "" when the input is not a string.
    """
    # --- LOGIC: Handle invalid inputs ---
    # A plain type check covers NaN/None/pd.NA (none of them are str) and,
    # unlike pd.isna(), never yields an element-wise array for list-like
    # cells, which would raise "truth value is ambiguous" in a boolean test.
    if not isinstance(title, str):
        return ""

    # --- LOGIC: Decompose accents and convert to lowercase ---
    # NFKD only *splits* an accented character into its base letter plus a
    # combining mark ("é" -> "e" + U+0301). The mark itself is dropped by the
    # final clean-up step below, because combining marks are not word characters.
    title = unicodedata.normalize('NFKD', title).lower()

    # --- LOGIC: Remove a leading English article ---
    # Leading whitespace is tolerated so that "  The Matrix" and "The Matrix"
    # produce the same key. The article must be followed by whitespace, so
    # titles such as "A-Team" keep their first word.
    title = re.sub(r'^\s*(the|a|an)\s+', '', title)

    # --- LOGIC: Remove parenthetical content ---
    # Text enclosed in (...) or [...] is deleted, e.g. "Avatar (2009)".
    title = re.sub(r'\([^)]*\)|\[[^\]]*\]', '', title)

    # --- LOGIC: Keep only letters and digits ---
    # Hyphens, underscores, colons, apostrophes, remaining punctuation,
    # combining marks and all whitespace are removed in a single pass. Since
    # the key contains no spaces at all, replacing separators with spaces
    # first would not change the result.
    return re.sub(r'[\W_]+', '', title)

# =============================================================================
# STEP 2: DATA CLEANUP ORCHESTRATION FUNCTION
# =============================================================================
# --- TASK: Main function to clean, standardize, and save the data ---
def clean_data(input_file, output_file):
    """Load an Excel file, apply the standard clean-up steps and save it as CSV.

    Steps, in order:
      1. Drop columns in which every cell is missing.
      2. Replace cells equal to the string 'n/a' with NaN.
      3. Add 'title_normalized' (via normalize_title) when a 'title' column exists.
      4. Coerce the known financial columns to numeric; unparseable cells become NaN.
      5. Fill missing values in the known descriptive columns with 'Unknown'.
      6. Write the result to ``output_file`` as CSV without the index.

    Args:
        input_file: Source passed to ``pd.read_excel``.
        output_file: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, its parent directory is created if missing.

    Returns:
        The cleaned DataFrame that was written out.
    """
    # Local import: only needed to prepare the output directory.
    import os

    # --- OPERATION: Load data ---
    df_cleaned = pd.read_excel(input_file)

    # --- OPERATION: Remove empty columns ---
    # dropna returns a new DataFrame, so no explicit copy of the loaded frame is needed.
    df_cleaned = df_cleaned.dropna(axis=1, how='all')

    # --- OPERATION: Standardize missing values ---
    # Only cells that are exactly 'n/a' are converted; other spellings are left as-is.
    df_cleaned = df_cleaned.replace('n/a', np.nan)

    # --- OPERATION: Normalize movie titles ---
    if 'title' in df_cleaned.columns:
        df_cleaned['title_normalized'] = df_cleaned['title'].apply(normalize_title)

    # --- OPERATION: Convert financial columns to numbers ---
    # errors='coerce' turns any value that cannot be parsed as a number into NaN.
    financial_cols = ['international_box_office', 'domestic_box_office',
                      'worldwide_box_office', 'production_budget', 'opening_weekend']
    for col in financial_cols:
        if col in df_cleaned.columns:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

    # --- OPERATION: Fill missing text values ---
    text_cols = ['genre', 'keywords', 'creative_type']
    for col in text_cols:
        if col in df_cleaned.columns:
            df_cleaned[col] = df_cleaned[col].fillna('Unknown')

    # --- OPERATION: Save to CSV ---
    # to_csv raises OSError when the target directory does not exist, so make
    # sure it is there first. File-like targets (buffers) are left untouched.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df_cleaned.to_csv(output_file, index=False)
    return df_cleaned

# --- USAGE: Run cleanup for both files ---
# When run as a script, each (input workbook, output CSV) pair below is passed
# through clean_data in the order listed: the sales file first, then the metadata file.
if __name__ == "__main__":
    cleanup_jobs = (
        ('data/sales.xlsx', 'cleanedData/sales_cleaned.csv'),
        ('data/metaClean43Brightspace.xlsx', 'cleanedData/metadata_cleaned.csv'),
    )
    for source_path, target_path in cleanup_jobs:
        clean_data(source_path, target_path)

Cleaning release_date column...
Cleaning RelDate column...
