# In [None]:  (notebook cell prompt left over from the export)
# Cell 1: Import necessary libraries
import pandas as pd
import re

# Cell 2: Load the Excel files
# Point 'extract.xlsx' and 'gold.xlsx' at the real workbooks before running.
try:
    df_extract = pd.read_excel('extract.xlsx')
    df_gold = pd.read_excel('gold.xlsx')
except FileNotFoundError:
    # At least one workbook is missing: swap in a small synthetic data set
    # (for BOTH frames) so the remaining cells can still be exercised.
    print("One or both Excel files not found. Creating dummy data for demonstration.")
    data_extract = {
        'filepath': [
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/2510600056001201252_f.jpeg',
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/1234567890123456789_f.jpeg',
            # File name without a numeric id.
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/nodatafoundhere_f.jpeg',
            # Missing file path.
            None,
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/3333333333333333333_f.jpeg',
        ],
        'payee_name': [
            'John Doe!',
            'Jane Smith.',
            'Missing Gold Match',
            'No Filepath Name',
            'Same Name Co.',
        ],
        'amount_numeric': [750, 1200.50, 500, 200, 999.99],
        'micr_code': [
            '123456CITYBANKBRANCH1SAVINGS',
            '987654CITY2BANK2BRANCH2CURRENT',
            '000000XXXXYYYY000ZZZZ',
            '111',
            'ABCDEFG123HIJKL',
        ],
    }
    df_extract = pd.DataFrame(data=data_extract)

    data_gold = {
        'TRANSACTION_DATE': ['16/04/2025', '17/04/2025', '18/04/2025', '19/04/2025'],
        # Plain integers; the first and last ids also appear in the extract file names.
        'INSTRUMENT_ID': [
            2510600056001201252,
            9876543210987654321,
            12345,
            3333333333333333333,
        ],
        'FLOW_TYPE': ['INWARD', 'OUTWARD', 'INWARD', 'OUTWARD'],
        # One missing instrument number to exercise the null handling.
        'SCAN_INSTRUMENT_NUMBER': [123456, 987654, pd.NA, 999000],
        'SCAN_PAYEE_BANK_CITY_CODE': ['CITY', 'CITY2', 'CITY3', 'CITYX'],
        'SCAN_PAYEE_BANK_CODE': ['BANK', 'BANK2', 'BANK3', 'BANKY'],
        'SCAN_PAYEE_BANK_BRANCH_CODE': ['BRANCH', 'BRANCH2', 'BRANCH3', 'BRANCHZ'],
        'SCAN_MICR_ACNO': [1, 0, 123, 0],
        'SCAN_INSTRUMENT_TYPE': ['SAVINGS', 'CURRENT', 'SAVINGS', 'TYPEA'],
        'PRES_NAME': [' John Doe ', 'Jane  Smith', 'Another Name Inc.', 'Same Name Co.'],
        'CAR_AMOUNT': [750.00, 1250.00, 600.00, 999.99],
    }
    df_gold = pd.DataFrame(data=data_gold)
else:
    print("Successfully loaded 'extract.xlsx' and 'gold.xlsx'.")

# Preview the first rows of both frames.
print("Excel 1 (Extract) Head:", df_extract.head(), sep="\n")
print("\nExcel 2 (Gold) Head:", df_gold.head(), sep="\n")

# Cell 3: Preprocess df_extract
# Extract INSTRUMENT_ID from filepath
def extract_instrument_id(filepath):
    """Return the instrument id embedded in an image file name.

    The id is the run of ASCII digits that makes up the whole file-name stem
    in front of the ``_f.jpeg`` suffix, e.g.
    ``.../2510600056001201252_f.jpeg`` -> ``'2510600056001201252'``.

    Args:
        filepath: Path of the scanned image, or a missing value
            (None / NaN / pd.NA). Non-string values are converted with str().

    Returns:
        The digits as a string (leading zeros are kept), or None when the
        path is missing or its file name is not ``<digits>_f.jpeg``.
    """
    if pd.isna(filepath):
        return None
    # The digits must be the complete file name, so anchor them either at a
    # path separator ('/' or '\') or at the very start of the value (bare
    # file name). Surrounding whitespace from spreadsheet cells is ignored.
    match = re.search(r'(?:^|[/\\])([0-9]+)_f\.jpeg$', str(filepath).strip())
    return match.group(1) if match else None

# Pull the id out of every file path and store it as a number; rows whose
# path is missing or carries no id end up as NaN.
# NOTE(review): once any row is NaN the column is presumably float64, which
# cannot represent every 19-digit id exactly - confirm this is acceptable.
df_extract['INSTRUMENT_ID_EXTRACT'] = pd.to_numeric(
    df_extract['filepath'].apply(extract_instrument_id),
    errors='coerce',
)

print("\nDF Extract after INSTRUMENT_ID extraction:", df_extract.head(), sep="\n")

# Cell 4: Preprocess df_gold
# Construct micr_code in df_gold
def _micr_text(value):
    """Render one MICR component as text, writing integral floats without '.0'."""
    # A numeric column that contains blanks is read as float, so 123456
    # arrives as 123456.0 and would otherwise be rendered as '123456.0'.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _account_number_text(value):
    """Render SCAN_MICR_ACNO as integer text; '' means the value is zero (omit)."""
    try:
        if isinstance(value, str):
            text = value.strip()
            try:
                # Exact for arbitrarily long digit strings.
                number = int(text)
            except ValueError:
                # Decimal notation such as '12.0'.
                number = int(float(text))
        else:
            # int() directly: going through float() first would corrupt
            # integers above 2**53.
            number = int(value)
    except (TypeError, ValueError, OverflowError):
        # Not a number (e.g. 'N/A', inf): keep the raw text.
        return str(value)
    return str(number) if number else ""


def construct_micr(row):
    """Build the MICR string for one gold row by concatenating its components.

    Components are joined without separator in this order:
    SCAN_INSTRUMENT_NUMBER, SCAN_PAYEE_BANK_CITY_CODE, SCAN_PAYEE_BANK_CODE,
    SCAN_PAYEE_BANK_BRANCH_CODE, SCAN_MICR_ACNO, SCAN_INSTRUMENT_TYPE.
    Missing components (None / NaN / pd.NA) are skipped. The account number
    is additionally skipped when it is zero and is otherwise written as an
    integer (fractions are truncated); text that is not numeric is kept
    unchanged.

    Args:
        row: Mapping-like row (pandas Series or dict) exposing the six
            ``SCAN_*`` keys listed above.

    Returns:
        The concatenated MICR string; empty when every component is missing.
    """
    parts = []
    for column in (
        'SCAN_INSTRUMENT_NUMBER',
        'SCAN_PAYEE_BANK_CITY_CODE',
        'SCAN_PAYEE_BANK_CODE',
        'SCAN_PAYEE_BANK_BRANCH_CODE',
    ):
        value = row[column]
        if pd.notna(value):
            parts.append(_micr_text(value))

    account_number = row['SCAN_MICR_ACNO']
    if pd.notna(account_number):
        # Contributes '' (nothing) for a zero account number.
        parts.append(_account_number_text(account_number))

    instrument_type = row['SCAN_INSTRUMENT_TYPE']
    if pd.notna(instrument_type):
        parts.append(_micr_text(instrument_type))
    return "".join(parts)

# Build the gold-side MICR string and make the gold id numeric so it can be
# compared with the numeric id extracted from the file paths; ids that cannot
# be parsed become NaN.
df_gold['micr_code_gold'] = df_gold.apply(construct_micr, axis=1)
df_gold['INSTRUMENT_ID'] = pd.to_numeric(df_gold['INSTRUMENT_ID'], errors='coerce')

print("\nDF Gold after micr_code_gold construction:")
print(df_gold[['INSTRUMENT_ID', 'micr_code_gold']].head())

# Cell 5: Merge the DataFrames
# Left join: every extract row is kept and gold columns are filled in where
# the ids agree. pandas pairs null keys with each other, so gold rows without
# a usable id are excluded from the right side; otherwise every extract row
# lacking an id would be "matched" to them.
df_merged = pd.merge(
    df_extract,
    df_gold[df_gold['INSTRUMENT_ID'].notna()],
    left_on='INSTRUMENT_ID_EXTRACT',
    right_on='INSTRUMENT_ID',
    how='left',
    suffixes=('_extract', '_gold'),
)

print("\nMerged DataFrame Head:")
print(df_merged.head())


# Cell 6: Define comparison functions

def compare_names(name1, name2):
    """Compare two payee names after normalisation.

    Normalisation lower-cases the text, removes punctuation and symbols
    (including underscores), and collapses runs of whitespace to one space.
    Letters and digits of any script are kept, so names that differ only in
    a non-ASCII letter are not collapsed into the same string.

    Args:
        name1: Name from the extract file (may be missing).
        name2: Name from the gold file (may be missing).

    Returns:
        A ``(match, details)`` tuple: ``match`` is True only when both names
        are present and equal after normalisation; ``details`` is a
        human-readable explanation of the outcome.
    """
    # A missing value on either side is never treated as a match.
    if pd.isna(name1) and pd.isna(name2):
        return False, "Both names are NaN"
    if pd.isna(name1) or pd.isna(name2):
        return False, f"One name is NaN (Extract: '{name1}', Gold: '{name2}')"

    def _normalize(name):
        text = str(name).strip().lower()
        # \w is Unicode-aware for str patterns: drop everything that is
        # neither a letter/digit nor whitespace, plus '_' (which \w includes).
        text = re.sub(r'[^\w\s]|_', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    norm_name1 = _normalize(name1)
    norm_name2 = _normalize(name2)

    match = norm_name1 == norm_name2
    return match, f"Normalized Extract: '{norm_name1}', Normalized Gold: '{norm_name2}' (Match: {match})"

def compare_amounts(amount1, amount2, tolerance=0.001):
    """Return True when two amounts are numerically equal within a tolerance.

    Args:
        amount1: Amount from the extract file (number or numeric string).
        amount2: Amount from the gold file (number or numeric string).
        tolerance: Exclusive upper bound on the absolute difference that
            still counts as equal. Defaults to 0.001.

    Returns:
        True when both values are present, convertible to float and differ
        by less than ``tolerance``; False otherwise (including when either
        value is missing or is not a number).
    """
    # A missing value on either side is never a match.
    if pd.isna(amount1) or pd.isna(amount2):
        return False
    try:
        return abs(float(amount1) - float(amount2)) < tolerance
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: objects float() rejects
        # outright (e.g. a date cell). Either way the amounts do not match.
        return False

def compare_micr(micr1, micr2):
    """Return True when two MICR codes are equal, ignoring case and outer whitespace.

    Both values are converted to text first. A missing value (None / NaN /
    pd.NA) on either side - or on both - yields False.
    """
    if pd.isna(micr1) or pd.isna(micr2):
        return False
    left, right = (str(code).strip().lower() for code in (micr1, micr2))
    return left == right

# Cell 7: Apply comparison functions and calculate match scores
# Defaults that stay in place when a column pair is not available.
df_merged['payee_name_match_status'] = False
df_merged['payee_name_match_details'] = ''
df_merged['amount_match_status'] = False
df_merged['micr_match_status'] = False

# Columns each side contributes to the comparison (kept for reference).
essential_extract_cols = ['payee_name', 'amount_numeric', 'micr_code']
essential_gold_cols = ['PRES_NAME', 'CAR_AMOUNT', 'micr_code_gold'] # micr_code_gold is created above

# A field is only compared when both of its columns survived the merge.
has_payee_cols = 'payee_name' in df_merged.columns and 'PRES_NAME' in df_merged.columns
has_amount_cols = 'amount_numeric' in df_merged.columns and 'CAR_AMOUNT' in df_merged.columns
has_micr_cols = 'micr_code' in df_merged.columns and 'micr_code_gold' in df_merged.columns

# Each field is compared in one pass over its two columns and written back as
# a whole column, instead of iterrows() with one .loc write per cell.
if has_payee_cols:
    name_results = [
        compare_names(extract_name, gold_name)
        for extract_name, gold_name in zip(df_merged['payee_name'], df_merged['PRES_NAME'])
    ]
    df_merged['payee_name_match_status'] = pd.Series(
        [matched for matched, _ in name_results], index=df_merged.index, dtype=bool
    )
    df_merged['payee_name_match_details'] = pd.Series(
        [detail for _, detail in name_results], index=df_merged.index, dtype=object
    )
else:
    df_merged['payee_name_match_details'] = "Payee name columns not found or not merged"

if has_amount_cols:
    df_merged['amount_match_status'] = pd.Series(
        [
            compare_amounts(extract_amount, gold_amount)
            for extract_amount, gold_amount in zip(df_merged['amount_numeric'], df_merged['CAR_AMOUNT'])
        ],
        index=df_merged.index,
        dtype=bool,
    )

if has_micr_cols:
    df_merged['micr_match_status'] = pd.Series(
        [
            compare_micr(extract_micr, gold_micr)
            for extract_micr, gold_micr in zip(df_merged['micr_code'], df_merged['micr_code_gold'])
        ],
        index=df_merged.index,
        dtype=bool,
    )

# Per-row tally: how many fields were compared and how many of them matched.
df_merged['fields_to_compare'] = 0
df_merged['fields_matched'] = 0

if has_payee_cols:
    df_merged['fields_to_compare'] += 1
    df_merged['fields_matched'] += df_merged['payee_name_match_status'].astype(int)
if has_amount_cols:
    df_merged['fields_to_compare'] += 1
    df_merged['fields_matched'] += df_merged['amount_match_status'].astype(int)
if has_micr_cols:
    df_merged['fields_to_compare'] += 1
    df_merged['fields_matched'] += df_merged['micr_match_status'].astype(int)

# Start as float: the percentages assigned below are fractional, and writing
# floats into an integer column is a deprecated incompatible-dtype set.
df_merged['match_percentage'] = 0.0
# Calculate only where fields_to_compare is greater than 0 (avoids 0/0).
mask_compare = df_merged['fields_to_compare'] > 0
df_merged.loc[mask_compare, 'match_percentage'] = (
    df_merged.loc[mask_compare, 'fields_matched']
    / df_merged.loc[mask_compare, 'fields_to_compare']
    * 100
)

print("\nDataFrame with Match Status and Percentage:")
display_cols = ['INSTRUMENT_ID_EXTRACT', 'payee_name', 'PRES_NAME', 'payee_name_match_status', 'payee_name_match_details',
                'amount_numeric', 'CAR_AMOUNT', 'amount_match_status',
                'micr_code', 'micr_code_gold', 'micr_match_status',
                'fields_matched', 'fields_to_compare', 'match_percentage']
# Show only the columns that actually exist in df_merged.
display_cols = [col for col in display_cols if col in df_merged.columns]
print(df_merged[display_cols].head())

# Cell 8: Select relevant columns for the output and save to Excel
output_columns = [
    # Extract side: source path and the key derived from it, then the data.
    'filepath', 'INSTRUMENT_ID_EXTRACT',
    'payee_name', 'amount_numeric', 'micr_code',
    # Gold side: key, compared fields and the constructed MICR string.
    'INSTRUMENT_ID', 'PRES_NAME', 'CAR_AMOUNT', 'micr_code_gold',
    # Gold MICR components, kept so the constructed string can be verified.
    'SCAN_INSTRUMENT_NUMBER', 'SCAN_PAYEE_BANK_CITY_CODE', 'SCAN_PAYEE_BANK_CODE',
    'SCAN_PAYEE_BANK_BRANCH_CODE', 'SCAN_MICR_ACNO', 'SCAN_INSTRUMENT_TYPE',
    # Comparison outcome.
    'payee_name_match_status', 'payee_name_match_details',
    'amount_match_status',
    'micr_match_status',
    'fields_matched', 'fields_to_compare', 'match_percentage',
]

# Keep the order above but drop anything the merged frame does not contain.
final_output_columns = [name for name in output_columns if name in df_merged.columns]
df_output = df_merged[final_output_columns]

output_file_name = 'comparison_results.xlsx'
try:
    df_output.to_excel(output_file_name, index=False)
except Exception as e:
    # Best effort: report the failure and let the notebook carry on.
    print(f"Error saving to Excel: {e}")
else:
    print(f"\nComparison results saved to '{output_file_name}'")

# Cell 9: Display overall statistics (optional)
def _print_field_accuracy(label, noun, extract_col, gold_col, status_col):
    """Print the match accuracy of one field over rows where both values are present."""
    both_present = df_merged[extract_col].notna() & df_merged[gold_col].notna()
    comparable = df_merged[both_present]
    if comparable.empty:
        print(f"{label} Match Accuracy: No comparable {noun} pairs found.")
        return
    matches = comparable[status_col].sum()
    accuracy = matches / len(comparable) * 100
    print(f"{label} Match Accuracy: {accuracy:.2f}% ({matches}/{len(comparable)} comparable pairs)")


# The merge can yield more rows than the extract when one id matches several gold rows.
total_records_merged = len(df_merged)
total_extract_records = len(df_extract)
print("\n--- Overall Statistics ---")
print(f"Total Records in Extract File: {total_extract_records}")
print(f"Total Records after Merging (based on INSTRUMENT_ID): {total_records_merged}")

if total_records_merged == 0 or 'match_percentage' not in df_merged.columns:
    print("No records were merged or no comparable fields found to calculate statistics.")
else:
    perfect_matches = int((df_merged['match_percentage'] == 100).sum())
    print(f"Records with 100% Match (among merged and comparable records): {perfect_matches}")

    total_fields_compared = df_merged['fields_to_compare'].sum()
    if total_fields_compared > 0:
        overall_field_match_accuracy = df_merged['fields_matched'].sum() / total_fields_compared * 100
        print(f"Overall Field Match Accuracy (for comparable fields in merged records): {overall_field_match_accuracy:.2f}%")
    else:
        print("Overall Field Match Accuracy: No fields were marked for comparison.")

    # One report per field, limited to fields whose column pair exists.
    field_reports = (
        (has_payee_cols, "Payee Name", "payee name", 'payee_name', 'PRES_NAME', 'payee_name_match_status'),
        (has_amount_cols, "Amount", "amount", 'amount_numeric', 'CAR_AMOUNT', 'amount_match_status'),
        (has_micr_cols, "MICR Code", "MICR code", 'micr_code', 'micr_code_gold', 'micr_match_status'),
    )
    for enabled, *report_args in field_reports:
        if enabled:
            _print_field_accuracy(*report_args)

print("--- End of Notebook ---")