In [None]:
# Cell 1: Import necessary libraries
import math
import re

import numpy as np # Import numpy
import pandas as pd
# tqdm is optional: when it cannot be imported, a pass-through with the same
# name is defined so loops wrapped in tqdm(...) still run, just without a bar.
try:
    from tqdm.auto import tqdm  # For progress bars
except ImportError:
    TQDM_AVAILABLE = False
    print("Warning: tqdm library not found. Progress bars will not be shown. To enable them, please install tqdm: pip install tqdm")

    def tqdm(iterable, *args, **kwargs):
        """Stand-in for tqdm: return *iterable* unchanged and ignore every option."""
        return iterable
else:
    TQDM_AVAILABLE = True


# Cell 2: Load the Excel files
# Replace 'extract.xlsx' and 'gold.xlsx' with the actual file paths
try:
    df_extract = pd.read_excel('extract.xlsx')
    df_gold = pd.read_excel('gold.xlsx')
    print("Successfully loaded 'extract.xlsx' and 'gold.xlsx'.")
except FileNotFoundError:
    print("One or both Excel files not found. Using dummy data for demonstration.")
    # Small in-memory stand-ins for both sheets. Both frames are replaced, even
    # if only one of the two files was missing.
    data_extract = {
        'filepath': [
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/2510600056001201252_f.jpeg',
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/1234567890123456789_f.jpeg',
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/9999999999999999999_f.jpeg',
            # Added a no-match record
            '/tmp/job_e6b7dd30-4f60-4f8b-8331-2458fea95c06_eh6vxyz3/zip_0_new/GenAI-OUT-F-IMG-16042025-30000/0000000000000000000_f.jpeg',
        ],
        'payee_name': [
            'John Doe!',
            'Jane Smith.',
            'Another Person',
            'No Match Name',
        ],
        'amount_numeric': [750, 1200.50, 500, 100],
        'micr_code': [
            '123456CITYBANKBRANCH001SAVINGS',
            '987654CITY2BANK2BRANCH2002CURRENT',
            '111222CITY3BANK3BRANCH3003SAVINGS',
            'NOMATCHMICR',
        ],
    }
    df_extract = pd.DataFrame(data_extract)

    data_gold = {
        'TRANSACTION_DATE': ['16/04/2025', '17/04/2025', '18/04/2025'],
        'INSTRUMENT_ID': [
            2510600056001201252,
            9876543210987654321,
            9999999999999999999,
        ],
        'FLOW_TYPE': ['INWARD', 'OUTWARD', 'INWARD'],
        'SCAN_INSTRUMENT_NUMBER': [123456, 987654, 111222],
        'SCAN_PAYEE_BANK_CITY_CODE': ['CITY', 'CITY2', 'CITY3'],
        'SCAN_PAYEE_BANK_CODE': ['BANK', 'BANK2', 'BANK3'],
        'SCAN_PAYEE_BANK_BRANCH_CODE': ['BRANCH', 'BRANCH2', 'BRANCH3'],
        'SCAN_MICR_ACNO': [1, 0, 3],
        'SCAN_INSTRUMENT_TYPE': ['SAVINGS', 'CURRENT', 'SAVINGS'],
        'PRES_NAME': [' John Doe ', 'Jane  Smith', 'Another Person '],
        'CAR_AMOUNT': [750.00, 1250.00, 500.00],
    }
    df_gold = pd.DataFrame(data_gold)

# Show the first few rows of whichever frames ended up loaded.
for heading, frame in (
    ("\nExcel 1 (Extract) Head:", df_extract),
    ("\nExcel 2 (Gold) Head:", df_gold),
):
    print(heading)
    print(frame.head())

# Cell 3: Preprocess df_extract
# Extract INSTRUMENT_ID from filepath
def extract_instrument_id(filepath):
    """Return the file-name stem that precedes ``_f.jpeg`` in *filepath*.

    *filepath* is converted with ``str`` before matching. ``None`` is returned
    when *filepath* is missing according to ``pd.isna`` or when it does not end
    in ``/<stem>_f.jpeg``. The stem is returned as text, not as a number.
    """
    if pd.isna(filepath):
        return None
    found = re.search(r'/([^/]+)_f\.jpeg$', str(filepath))
    return found.group(1) if found else None

# Pull the ID text out of every path, then coerce it to a number; anything that
# cannot be parsed (including paths that yielded None) becomes NaN.
# NOTE(review): if any row ends up NaN the column presumably becomes float64,
# which cannot hold 19-digit IDs exactly - confirm whether keys need to stay text.
extracted_id_text = df_extract['filepath'].apply(extract_instrument_id)
df_extract['INSTRUMENT_ID_EXTRACT'] = pd.to_numeric(extracted_id_text, errors='coerce')

print("\nDF Extract after INSTRUMENT_ID extraction:", df_extract.head(), sep="\n")

# Cell 4: Preprocess df_gold
# Construct micr_code in df_gold
def _micr_part(value):
    """Render one MICR component as text, writing whole-valued floats without '.0'."""
    # A numeric column with any blank cell is upcast to float by pandas, so
    # 123456 arrives as 123456.0; str() alone would yield '123456.0'.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def construct_micr(row):
    """Concatenate the ``SCAN_*`` fields of *row* into a single MICR string.

    The parts are joined with no separator in this order: instrument number,
    bank city code, bank code, bank branch code, account number, instrument
    type. A part is skipped when its value is missing per ``pd.notna``; the
    account number is also skipped when it equals 0 and is otherwise
    truncated to an integer before being appended.

    Args:
        row: Mapping-like object (for example a DataFrame row) indexable by
            the six ``SCAN_*`` column names.

    Returns:
        The concatenated string, or ``""`` when every part is skipped.

    Raises:
        KeyError: If one of the ``SCAN_*`` keys is absent from *row*.
        ValueError: If a non-missing account number cannot be converted to int.
    """
    ordered_fields = (
        'SCAN_INSTRUMENT_NUMBER',
        'SCAN_PAYEE_BANK_CITY_CODE',
        'SCAN_PAYEE_BANK_CODE',
        'SCAN_PAYEE_BANK_BRANCH_CODE',
        'SCAN_MICR_ACNO',
        'SCAN_INSTRUMENT_TYPE',
    )
    parts = []
    for field in ordered_fields:
        value = row[field]
        if pd.isna(value):
            continue
        if field == 'SCAN_MICR_ACNO':
            # Zero is treated as "no account segment" rather than as the digit 0.
            if value == 0:
                continue
            value = int(value)
        parts.append(_micr_part(value))
    return "".join(parts)

# Build the gold-side MICR string row by row, then coerce the join key to a
# number (values that cannot be parsed become NaN).
gold_micr_codes = df_gold.apply(construct_micr, axis=1)
df_gold['micr_code_gold'] = gold_micr_codes
df_gold['INSTRUMENT_ID'] = pd.to_numeric(df_gold['INSTRUMENT_ID'], errors='coerce')

print("\nDF Gold after micr_code_gold construction:", df_gold.head(), sep="\n")

# Cell 5: Merge the DataFrames
# Gold rows whose INSTRUMENT_ID is null are left out of the join: pandas matches
# null keys against each other, so an extract row whose ID could not be parsed
# (NaN after coercion) would otherwise be paired with them and scored as if it
# had found its gold record. Every extract row is still kept by the left join.
gold_with_id = df_gold[df_gold['INSTRUMENT_ID'].notna()]
df_merged = pd.merge(
    df_extract,
    gold_with_id,
    left_on='INSTRUMENT_ID_EXTRACT',
    right_on='INSTRUMENT_ID',
    how='left',
    suffixes=('_extract', '_gold'),
)

print("\nMerged DataFrame Head:")
print(df_merged.head())

# Cell 6: Define comparison functions
def compare_names(name1, name2):
    """Compare two names after normalising case, punctuation and spacing.

    Each name is converted with ``str``, trimmed and lower-cased; every
    character other than ``a-z``, ``0-9`` and whitespace is removed; runs of
    whitespace collapse to a single space.

    Returns:
        A ``(matched, details)`` tuple. ``details`` shows both normalised
        forms, or states that a name was NaN, in which case ``matched`` is
        ``False``.
    """
    if pd.isna(name1) or pd.isna(name2):
        return False, "One or both names are NaN"

    def _normalise(raw):
        text = str(raw).strip().lower()
        text = re.sub(r'[^a-z0-9\s]', '', text)
        return re.sub(r'\s+', ' ', text).strip()

    extract_form = _normalise(name1)
    gold_form = _normalise(name2)
    details = f"Normalized Extract: '{extract_form}', Normalized Gold: '{gold_form}'"
    return extract_form == gold_form, details

def compare_amounts(amount1, amount2, *, rel_tol=1e-13, abs_tol=1e-9):
    """Return True when both amounts are present and numerically equal.

    Both values are converted with ``float`` and compared with
    ``math.isclose`` rather than ``==``, so amounts that differ only by
    binary floating-point representation error (``0.1 + 0.2`` versus ``0.3``)
    still match. The default tolerances are far below 0.01, so amounts that
    differ by 0.01 or more never match.

    Args:
        amount1: First amount; a number or numeric text.
        amount2: Second amount; a number or numeric text.
        rel_tol: Relative tolerance passed to ``math.isclose``.
        abs_tol: Absolute tolerance passed to ``math.isclose``.

    Returns:
        ``False`` when either value is missing per ``pd.isna`` or cannot be
        converted to ``float``; otherwise whether the two values are close.
    """
    if pd.isna(amount1) or pd.isna(amount2):
        return False
    try:
        left, right = float(amount1), float(amount2)
    except (TypeError, ValueError):
        # float() raises TypeError for unsupported objects and ValueError for
        # unparseable text; neither can be a match.
        return False
    return math.isclose(left, right, rel_tol=rel_tol, abs_tol=abs_tol)

def compare_micr(micr1, micr2):
    """Return True when both MICR codes are present and equal as trimmed text.

    Each code is converted with ``str`` and stripped of surrounding
    whitespace before the exact, case-sensitive comparison. A value that is
    missing per ``pd.isna`` makes the result ``False``.
    """
    either_missing = pd.isna(micr1) or pd.isna(micr2)
    if either_missing:
        return False
    left, right = (str(code).strip() for code in (micr1, micr2))
    return left == right

# Cell 7: Apply comparison functions and calculate match scores
if TQDM_AVAILABLE:
    # NOTE(review): nothing in this cell calls a progress_* method; the
    # registration is kept in case other cells rely on it.
    tqdm.pandas(desc="Processing records for comparison")

# Which comparisons are possible depends only on the columns present, so this
# is decided once here instead of being re-checked for every row.
merged_columns = df_merged.columns
has_name_columns = 'payee_name' in merged_columns and 'PRES_NAME' in merged_columns
has_amount_columns = 'amount_numeric' in merged_columns and 'CAR_AMOUNT' in merged_columns
has_micr_columns = 'micr_code' in merged_columns and 'micr_code_gold' in merged_columns

# Per-row results are gathered in lists and attached after the loop. Writing to
# df_merged while iterating df_merged.iterrows() is something the pandas
# documentation warns against, and one pass also yields fields_to_compare.
name_statuses, name_details = [], []
amount_statuses, micr_statuses = [], []
matched_counts, comparable_counts = [], []

print("\nApplying comparisons:")
# Use tqdm directly on iterrows
for _, row in tqdm(df_merged.iterrows(), total=df_merged.shape[0], desc="Comparing records"):
    name_matched, name_detail = False, ''
    amount_matched = False
    micr_matched = False
    comparable = 0  # fields where both the extract and the gold value are present

    # Compare Payee Name
    if not has_name_columns:
        name_detail = "Payee name columns not found"
    elif pd.notna(row['payee_name']) and pd.notna(row['PRES_NAME']):
        name_matched, name_detail = compare_names(row['payee_name'], row['PRES_NAME'])
        comparable += 1
    else:
        name_detail = "One or both names missing for comparison"

    # Compare Amount
    if has_amount_columns and pd.notna(row['amount_numeric']) and pd.notna(row['CAR_AMOUNT']):
        amount_matched = compare_amounts(row['amount_numeric'], row['CAR_AMOUNT'])
        comparable += 1

    # Compare MICR
    if has_micr_columns and pd.notna(row['micr_code']) and pd.notna(row['micr_code_gold']):
        micr_matched = compare_micr(row['micr_code'], row['micr_code_gold'])
        comparable += 1

    name_statuses.append(name_matched)
    name_details.append(name_detail)
    amount_statuses.append(amount_matched)
    micr_statuses.append(micr_matched)
    matched_counts.append(int(bool(name_matched)) + int(bool(amount_matched)) + int(bool(micr_matched)))
    comparable_counts.append(comparable)

# Explicit dtypes keep the columns bool / int64 / object even for an empty frame.
df_merged['payee_name_match_status'] = np.asarray(name_statuses, dtype=bool)
df_merged['payee_name_match_details'] = np.asarray(name_details, dtype=object)
df_merged['amount_match_status'] = np.asarray(amount_statuses, dtype=bool)
df_merged['micr_match_status'] = np.asarray(micr_statuses, dtype=bool)
df_merged['fields_matched'] = np.asarray(matched_counts, dtype='int64')
df_merged['fields_to_compare'] = np.asarray(comparable_counts, dtype='int64')

# Share of comparable fields that matched; rows with nothing to compare get 0
# instead of the NaN that 0/0 would produce.
df_merged['match_percentage'] = np.where(
    df_merged['fields_to_compare'] > 0,
    (df_merged['fields_matched'] / df_merged['fields_to_compare']) * 100,
    0,
)


print("\nDataFrame with Match Status and Percentage:")
print(df_merged[['INSTRUMENT_ID_EXTRACT', 'payee_name_match_status', 'amount_match_status', 'micr_match_status',
                 'fields_matched', 'fields_to_compare', 'match_percentage']].head())

# Cell 8: Select relevant columns for the output and save to Excel
# Desired report layout: extract-side fields, gold-side fields, the gold MICR
# components, then the comparison results.
output_columns = [
    'filepath',
    'INSTRUMENT_ID_EXTRACT',
    'payee_name',
    'amount_numeric',
    'micr_code',
    'INSTRUMENT_ID',
    'PRES_NAME',
    'CAR_AMOUNT',
    'micr_code_gold',
    'SCAN_INSTRUMENT_NUMBER',
    'SCAN_PAYEE_BANK_CITY_CODE',
    'SCAN_PAYEE_BANK_CODE',
    'SCAN_PAYEE_BANK_BRANCH_CODE',
    'SCAN_MICR_ACNO',
    'SCAN_INSTRUMENT_TYPE',
    'payee_name_match_status',
    'payee_name_match_details',
    'amount_match_status',
    'micr_match_status',
    'fields_matched',
    'fields_to_compare',
    'match_percentage',
]

# Keep only the columns that exist in the merged frame, in the order above.
present_columns = set(df_merged.columns)
final_output_columns = [name for name in output_columns if name in present_columns]
df_output = df_merged[final_output_columns]

output_file_name = 'comparison_results_final.xlsx'
try:
    df_output.to_excel(output_file_name, index=False)
    print(f"\nComparison results saved to '{output_file_name}'")
except Exception as exc:
    # Best effort: a failed save is reported and the remaining cells still run.
    print(f"Error saving to Excel: {exc}")

# Cell 9: Display overall statistics (optional)
total_records = len(df_merged)
# A record is a perfect match when it had at least one comparable field and
# every comparable field matched. Integer counts are compared directly so the
# result does not hinge on a float percentage being exactly 100.
perfect_matches_df = df_merged[df_merged['fields_to_compare'] > 0]
perfect_matches = int(
    (perfect_matches_df['fields_matched'] == perfect_matches_df['fields_to_compare']).sum()
)

overall_field_matches_sum = df_merged['fields_matched'].sum()
overall_fields_to_compare_sum = df_merged['fields_to_compare'].sum()

overall_match_accuracy = (overall_field_matches_sum / overall_fields_to_compare_sum * 100) if overall_fields_to_compare_sum > 0 else 0

print("\n--- Overall Statistics ---")
print(f"Total Records Processed: {total_records}")
print(f"Records with 100% Match (among comparable): {perfect_matches}")

if overall_fields_to_compare_sum > 0:
    print(f"Overall Field Match Accuracy: {overall_match_accuracy:.2f}% ({overall_field_matches_sum}/{overall_fields_to_compare_sum} fields)")
else:
    print("Overall Field Match Accuracy: Not applicable (no fields to compare or all comparable fields were NaN)")

# (report label, status column, extract column, gold column) for each field.
field_reports = (
    ("Payee Name", 'payee_name_match_status', 'payee_name', 'PRES_NAME'),
    ("Amount", 'amount_match_status', 'amount_numeric', 'CAR_AMOUNT'),
    ("MICR Code", 'micr_match_status', 'micr_code', 'micr_code_gold'),
)
for label, status_col, extract_col, gold_col in field_reports:
    if status_col not in df_merged.columns:
        continue
    # The status column can exist while a source column is missing; report
    # that instead of raising KeyError on the lookups below.
    if extract_col not in df_merged.columns or gold_col not in df_merged.columns:
        print(f"{label} Match Accuracy: Not applicable (source columns not found)")
        continue
    field_matches = df_merged[status_col].sum()
    # Only rows where both sides hold a value count as comparable pairs.
    field_comparable = int((df_merged[extract_col].notna() & df_merged[gold_col].notna()).sum())
    field_accuracy = (field_matches / field_comparable * 100) if field_comparable > 0 else 0
    print(f"{label} Match Accuracy: {field_accuracy:.2f}% ({field_matches}/{field_comparable} comparable pairs)")

print("--- End of Notebook ---")