# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
from pathlib import Path

# Function to extract INSTRUMENT_ID from filepath
def extract_instrument_id(filepath):
    """Return the instrument ID encoded at the start of a file name.

    The ID is the part of the base file name that precedes the first
    underscore, e.g. ``/scans/12345_front.png`` -> ``"12345"``.

    Args:
        filepath: Path to the file, as a ``str`` or ``os.PathLike``.

    Returns:
        The ID as a string, or ``None`` when the file name contains no
        underscore or when ``filepath`` is not a path at all (``None``, or
        the NaN that pandas produces for an empty cell).
    """
    # A missing cell arrives as None/NaN; os.path.basename would raise
    # TypeError on it, so treat any non-path value as "no ID".
    if not isinstance(filepath, (str, os.PathLike)):
        return None

    filename = os.path.basename(os.fspath(filepath))
    if '_' not in filename:
        return None
    # Only the text before the FIRST underscore is the ID.
    return filename.split('_')[0]

# Function to clean strings for comparison
def clean_string(text):
    """Normalise a cell value into a canonical string for equality checks.

    Normalisation steps, in order:
      1. Missing values (``None``/NaN) become the empty string.
      2. Integral floats are rendered without a fractional part, so a
         numeric cell holding ``123456.0`` compares equal to ``"123456"``
         instead of turning into ``"1234560"`` once the dot is removed.
      3. The value is converted to a lowercase string.
      4. Every character that is neither a word character nor whitespace
         is removed.
      5. Leading/trailing whitespace is dropped and internal whitespace
         runs are collapsed to a single space.

    Args:
        text: Any scalar cell value.

    Returns:
        The normalised string (possibly empty).
    """
    if pd.isna(text):
        return ""

    # Render whole-number floats as integers before stringifying.
    if isinstance(text, float) and text.is_integer():
        text = int(text)

    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    # Trim and collapse AFTER removing punctuation: dropping a character
    # such as "-" or "." can leave stray or doubled spaces behind.
    return ' '.join(text.split())

# Function to compare amounts
def compare_amounts(amount1, amount2):
    """Report whether two amounts are numerically equal.

    Both values are converted with ``float()`` and are considered equal
    when they differ by less than 0.01.

    Args:
        amount1: First amount (number or numeric string).
        amount2: Second amount (number or numeric string).

    Returns:
        ``1`` when the amounts are equal within the tolerance; ``0`` when
        they differ, when either one is missing, or when either one cannot
        be converted to a float.
    """
    # A missing value on either side can never count as a match.
    if any(pd.isna(value) for value in (amount1, amount2)):
        return 0

    try:
        difference = float(amount1) - float(amount2)
    except (ValueError, TypeError):
        # Non-numeric input is reported as a mismatch, not an error.
        return 0

    return int(abs(difference) < 0.01)

# Function to concatenate bank fields
def concatenate_bank_fields(row):
    """Join the payee bank city, bank and branch codes into one string.

    The three codes are concatenated in that order with no separator.
    Missing codes (``None``/NaN) contribute nothing. Whole-number floats
    are rendered without a fractional part, so a numeric cell holding
    ``400.0`` contributes ``"400"`` rather than ``"400.0"``.

    Args:
        row: Mapping-like object (e.g. a pandas row) indexable by the
            ``SCAN_PAYEE_BANK_*`` column names.

    Returns:
        The concatenated codes, or ``""`` when all three are missing.
    """
    columns = (
        'SCAN_PAYEE_BANK_CITY_CODE',
        'SCAN_PAYEE_BANK_CODE',
        'SCAN_PAYEE_BANK_BRANCH_CODE',
    )

    parts = []
    for column in columns:
        value = row[column]
        if pd.isna(value):
            continue
        # Avoid "400.0"-style text for codes that were stored as floats.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        parts.append(str(value))
    return ''.join(parts)

# Load the extracted values and the reference ("gold") values.
extract_df = pd.read_excel('extract.xlsx')
gold_df = pd.read_excel('Gold.xlsx')

# Per-field match flags, in the order they are reported.
MATCH_FIELDS = [
    'instrument_number_match',
    'payee_details_match',
    'micr_acno_match',
    'instrument_type_match',
    'payee_name_match',
    'amount_match',
]

# Column layout of the summary frame. Passing it explicitly keeps the
# columns present even when no extract row produces a result, so the
# summary below cannot fail with a KeyError.
RESULT_COLUMNS = [
    'extract_filepath',
    'extract_instrument_id',
    'gold_instrument_id',
    'match_found',
    *MATCH_FIELDS,
    'overall_match_percentage',
]


def _instrument_key(value):
    """Return a canonical string key for an instrument ID, or None if missing."""
    if pd.isna(value):
        return None
    # A numeric ID column may be parsed as float (e.g. 101.0); drop the ".0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


# Index the gold rows once by normalised ID. IDs taken from file names are
# strings, while read_excel may parse the INSTRUMENT_ID column as numbers;
# comparing the two directly with == would then never match. setdefault
# keeps the FIRST row for an ID that occurs more than once.
gold_position_by_id = {}
for position, raw_id in enumerate(gold_df['INSTRUMENT_ID']):
    key = _instrument_key(raw_id)
    if key is not None:
        gold_position_by_id.setdefault(key, position)

# One entry per processed extract row: `results` holds the match flags,
# `detailed_results` additionally holds the compared values for matched rows.
results = []
detailed_results = []

for _, extract_row in extract_df.iterrows():
    instrument_id = extract_instrument_id(extract_row['filepath'])

    # Rows whose file name carries no instrument ID are skipped entirely.
    if instrument_id is None:
        continue

    position = gold_position_by_id.get(_instrument_key(instrument_id))

    if position is None:
        # No gold row for this ID: every field counts as a mismatch.
        result_row = {
            'extract_filepath': extract_row['filepath'],
            'extract_instrument_id': instrument_id,
            'gold_instrument_id': None,
            'match_found': False,
            'instrument_number_match': 0,
            'payee_details_match': 0,
            'micr_acno_match': 0,
            'instrument_type_match': 0,
            'payee_name_match': 0,
            'amount_match': 0,
            'overall_match_percentage': 0
        }
        results.append(result_row)
        detailed_results.append(dict(result_row))
        continue

    gold_row = gold_df.iloc[position]

    # Gold stores the bank codes in three columns; the extract has them
    # as a single value, so compare against the concatenation.
    gold_bank_concat = concatenate_bank_fields(gold_row)

    instr_num_match = int(
        clean_string(extract_row['micr_scan_instrument_number'])
        == clean_string(gold_row['SCAN_INSTRUMENT_NUMBER'])
    )
    payee_details_match = int(
        clean_string(extract_row['micr_scan_payee_details'])
        == clean_string(gold_bank_concat)
    )
    micr_acno_match = int(
        clean_string(extract_row['micr_scan_micr_acno'])
        == clean_string(gold_row['SCAN_MICR_ACNO'])
    )
    instrument_type_match = int(
        clean_string(extract_row['micr_scan_instrument_type'])
        == clean_string(gold_row['SCAN_INSTRUMENT_TYPE'])
    )
    payee_name_match = int(
        clean_string(extract_row['payee_name'])
        == clean_string(gold_row['PRES_NAME'])
    )
    amount_match = compare_amounts(extract_row['amount_numeric'], gold_row['CAR_AMOUNT'])

    flags = [instr_num_match, payee_details_match, micr_acno_match,
             instrument_type_match, payee_name_match, amount_match]
    match_percentage = (sum(flags) / len(flags)) * 100

    result_row = {
        'extract_filepath': extract_row['filepath'],
        'extract_instrument_id': instrument_id,
        'gold_instrument_id': gold_row['INSTRUMENT_ID'],
        'match_found': True,
        'instrument_number_match': instr_num_match,
        'payee_details_match': payee_details_match,
        'micr_acno_match': micr_acno_match,
        'instrument_type_match': instrument_type_match,
        'payee_name_match': payee_name_match,
        'amount_match': amount_match,
        'overall_match_percentage': match_percentage
    }
    results.append(result_row)

    # Record the compared values in the same pass. Looking the extract row
    # up again by filepath afterwards would attach the wrong values when a
    # filepath occurs more than once.
    detailed_results.append({
        **result_row,
        'extract_instrument_number': extract_row['micr_scan_instrument_number'],
        'gold_instrument_number': gold_row['SCAN_INSTRUMENT_NUMBER'],

        'extract_payee_details': extract_row['micr_scan_payee_details'],
        'gold_payee_details': gold_bank_concat,

        'extract_micr_acno': extract_row['micr_scan_micr_acno'],
        'gold_micr_acno': gold_row['SCAN_MICR_ACNO'],

        'extract_instrument_type': extract_row['micr_scan_instrument_type'],
        'gold_instrument_type': gold_row['SCAN_INSTRUMENT_TYPE'],

        'extract_payee_name': extract_row['payee_name'],
        'gold_payee_name': gold_row['PRES_NAME'],

        'extract_amount': extract_row['amount_numeric'],
        'gold_amount': gold_row['CAR_AMOUNT']
    })

# Convert results to DataFrames
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
detailed_results_df = pd.DataFrame(detailed_results)

# Save results to Excel
detailed_results_df.to_excel('comparison_results.xlsx', index=False)

# Display summary. The explicit casts keep the arithmetic well-defined when
# a frame is empty (its columns are then object-typed); the averages print
# as "nan" in that case.
matched_mask = results_df['match_found'].astype(bool)
print(f"Total records processed: {len(extract_df)}")
print(f"Records with matching IDs: {int(matched_mask.sum())}")
print(f"Average match percentage: {results_df['overall_match_percentage'].astype(float).mean():.2f}%")

# Display field-wise match percentages (over rows that had a gold match)
matched_df = results_df[matched_mask]
for field in MATCH_FIELDS:
    match_pct = matched_df[field].astype(float).mean() * 100
    print(f"{field}: {match_pct:.2f}%")