In [None]:
# Excel Files Comparison for Field Matching

# Install required packages
!pip install pandas numpy fuzzywuzzy python-Levenshtein

# Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
from fuzzywuzzy import fuzz
import string
import matplotlib.pyplot as plt

# Function to extract instrument id from filepath
def extract_instrument_id(filepath):
    """Pull the instrument ID out of a scanned-image file path.

    The ID is the run of digits that immediately precedes the ``_f.jpeg``
    suffix of the file name, e.g. ``scans/12345_f.jpeg`` -> ``'12345'``.

    Args:
        filepath: Path to the image file; may be missing (None/NaN).

    Returns:
        The digits as a string, or None when the path is missing or the
        file name does not end in ``<digits>_f.jpeg``.
    """
    if pd.isna(filepath):
        return None
    # Only the last path component matters; directory names are ignored.
    base_name = os.path.basename(filepath)
    found = re.search(r'(\d+)_f\.jpeg$', base_name)
    return found.group(1) if found else None

# Function to clean string for comparison
def clean_string(s):
    """Normalise a value into a lowercase, punctuation-free string.

    Args:
        s: Any scalar value; missing values (None/NaN) are accepted.

    Returns:
        ``''`` for missing values, otherwise the lowercased text with all
        ``string.punctuation`` characters removed and surrounding
        whitespace stripped.
    """
    if pd.isna(s):
        return ''
    # Whole-number floats would otherwise stringify as e.g. '12.0' and turn
    # into '120' once the decimal point is removed as punctuation.
    if isinstance(s, float) and s.is_integer():
        s = int(s)
    text = str(s).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Strip last: removing punctuation can expose new leading/trailing
    # whitespace (e.g. '- abc' -> ' abc').
    return text.strip()

# Function to compare amounts
def compare_amounts(amount1, amount2):
    """Report whether two amounts are numerically equal within 0.01.

    Args:
        amount1: First amount; a number, a numeric string, or missing.
        amount2: Second amount; same accepted forms as ``amount1``.

    Returns:
        True when both amounts are missing, or when both convert to float
        and differ by less than 0.01. False when exactly one is missing or
        when either value cannot be converted to a float.
    """
    first_missing = pd.isna(amount1)
    second_missing = pd.isna(amount2)
    if first_missing and second_missing:
        return True
    if first_missing or second_missing:
        return False

    try:
        first = float(amount1)
        second = float(amount2)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric input counts as a mismatch; unrelated errors such as
        # KeyboardInterrupt are no longer swallowed.
        return False

    # Small tolerance absorbs floating point representation error.
    return abs(first - second) < 0.01

# Function to calculate fuzzy match score for strings
def fuzzy_match_score(s1, s2):
    """Score how closely two values match as text, on a 0-100 scale.

    Args:
        s1: First value; may be missing (None/NaN).
        s2: Second value; may be missing (None/NaN).

    Returns:
        100 when both values are missing or both are empty after
        ``clean_string``; 0 when only one of them is; otherwise the
        ``fuzz.token_sort_ratio`` of the two cleaned strings.
    """
    missing = (pd.isna(s1), pd.isna(s2))
    if all(missing):
        return 100
    if any(missing):
        return 0

    left, right = clean_string(s1), clean_string(s2)

    if not (left or right):
        return 100
    if not (left and right):
        return 0

    # Token sort ratio is insensitive to word order differences.
    return fuzz.token_sort_ratio(left, right)

# Function to compare two records and calculate match percentages
def _bank_code_text(value):
    """Render one gold bank-code component as text ('' when missing)."""
    if pd.isna(value):
        return ''
    # A whole-number float would otherwise render as e.g. '400.0'.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def compare_records(extract_row, gold_row):
    """Compare one extract record against its gold record, field by field.

    Args:
        extract_row: Mapping with the extract fields
            ``micr_scan_instrument_number``, ``micr_scan_payee_details``,
            ``micr_scan_micr_acno``, ``micr_scan_instrument_type``,
            ``payee_name`` and ``amount_numeric``.
        gold_row: Mapping with the gold fields ``SCAN_INSTRUMENT_NUMBER``,
            ``SCAN_PAYEE_BANK_CITY_CODE``, ``SCAN_PAYEE_BANK_CODE``,
            ``SCAN_PAYEE_BANK_BRANCH_CODE``, ``SCAN_MICR_ACNO``,
            ``SCAN_INSTRUMENT_TYPE``, ``PRES_NAME`` and ``CAR_AMOUNT``.

    Returns:
        Dict of 0-100 scores keyed ``instrument_number_match``,
        ``bank_details_match``, ``micr_acno_match``,
        ``instrument_type_match``, ``payee_name_match``, ``amount_match``,
        plus ``overall_match``, the mean of those six.
    """
    # The gold file stores the bank details as three separate codes; join
    # them (city + bank + branch) to compare with the single extract field.
    # Missing parts contribute nothing instead of the literal text 'nan'.
    gold_bank_details = ''.join(
        _bank_code_text(gold_row.get(column))
        for column in (
            'SCAN_PAYEE_BANK_CITY_CODE',
            'SCAN_PAYEE_BANK_CODE',
            'SCAN_PAYEE_BANK_BRANCH_CODE',
        )
    )

    amounts_agree = compare_amounts(
        extract_row.get('amount_numeric'),
        gold_row.get('CAR_AMOUNT'),
    )

    results = {
        'instrument_number_match': fuzzy_match_score(
            extract_row.get('micr_scan_instrument_number'),
            gold_row.get('SCAN_INSTRUMENT_NUMBER'),
        ),
        # When every gold code is missing, pass None so the field is treated
        # as missing, the same way the other fields treat NaN.
        'bank_details_match': fuzzy_match_score(
            extract_row.get('micr_scan_payee_details'),
            gold_bank_details or None,
        ),
        'micr_acno_match': fuzzy_match_score(
            extract_row.get('micr_scan_micr_acno'),
            gold_row.get('SCAN_MICR_ACNO'),
        ),
        'instrument_type_match': fuzzy_match_score(
            extract_row.get('micr_scan_instrument_type'),
            gold_row.get('SCAN_INSTRUMENT_TYPE'),
        ),
        'payee_name_match': fuzzy_match_score(
            extract_row.get('payee_name'),
            gold_row.get('PRES_NAME'),
        ),
        # Amounts either agree (100) or they do not (0).
        'amount_match': 100 if amounts_agree else 0,
    }

    # Overall score is the unweighted mean of the six field scores.
    results['overall_match'] = np.mean(list(results.values()))

    return results

# Input and output locations
# Replace these paths with your actual file paths
extract_file_path = 'extract.xlsx'
gold_file_path = 'gold.xlsx'
output_file_path = 'comparison_results.xlsx'

# Read both workbooks into DataFrames
print(f"Loading extract file from: {extract_file_path}")
extract_df = pd.read_excel(extract_file_path)

print(f"Loading gold file from: {gold_file_path}")
gold_df = pd.read_excel(gold_file_path)

print(f"Extract shape: {extract_df.shape}")
print(f"Gold shape: {gold_df.shape}")

# Preview the first few rows of each DataFrame to verify the data
for heading, frame in (
    ("\nExtract DataFrame first 3 rows:", extract_df),
    ("\nGold DataFrame first 3 rows:", gold_df),
):
    print(heading)
    display(frame.head(3))

# Derive the instrument ID of every extract row from its file path
extract_df['instrument_id'] = extract_df['filepath'].map(extract_instrument_id)

# Show the derived IDs next to the paths they came from
print("\nExtracted instrument IDs from filepath:")
display(
    extract_df[['filepath', 'instrument_id']]
    .head(3)
    .rename(columns={'instrument_id': 'extracted_instrument_id'})
)

def _instrument_key(value):
    """Return the text key used to look up a gold INSTRUMENT_ID value."""
    # A whole-number float would stringify as e.g. '12345.0', which can never
    # equal the digits-only IDs taken from the extract file names.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Create a dictionary for fast lookup in gold_df, keyed by instrument ID
gold_dict = {
    _instrument_key(row['INSTRUMENT_ID']): row.to_dict()
    for _, row in gold_df.iterrows()
}

# Score columns, in output order; extract rows without a gold record get 0
# for each of them
_SCORE_COLUMNS = (
    'instrument_number_match',
    'bank_details_match',
    'micr_acno_match',
    'instrument_type_match',
    'payee_name_match',
    'amount_match',
    'overall_match',
)

# Initialize list to store one result dict per extract row
results_data = []

# Process each row in extract_df
for idx, extract_row in extract_df.iterrows():
    if idx % 100 == 0:
        print(f"Processing row {idx} of {len(extract_df)}")

    extract_id = str(extract_row['instrument_id'])

    # Find the matching record in the gold data (None when there is none)
    gold_data = gold_dict.get(extract_id)

    if gold_data is not None:
        scores = compare_records(extract_row.to_dict(), gold_data)
        gold_id = gold_data['INSTRUMENT_ID']
        gold_name = gold_data['PRES_NAME']
        gold_amount = gold_data['CAR_AMOUNT']
    else:
        # No match found in gold data: flag the gold columns, zero all scores
        scores = dict.fromkeys(_SCORE_COLUMNS, 0)
        gold_id = gold_name = gold_amount = 'NO MATCH'

    # Identifiers first, then the match percentages
    results_data.append({
        'extract_filepath': extract_row['filepath'],
        'extract_instrument_id': extract_id,
        'gold_instrument_id': gold_id,
        'extract_payee_name': extract_row['payee_name'],
        'gold_pres_name': gold_name,
        'extract_amount': extract_row['amount_numeric'],
        'gold_amount': gold_amount,
        **scores,
    })

# Create a results DataFrame
results_df = pd.DataFrame(results_data)

# Show the first few detailed result rows
print("\nSample of detailed results:")
display(results_df.head())

# Count extract rows with and without a gold record
matched_rows = [
    r for r in results_data if r.get('gold_instrument_id') != 'NO MATCH'
]
unmatched_rows = [
    r for r in results_data if r.get('gold_instrument_id') == 'NO MATCH'
]

# Summary label -> results_df column whose mean it reports
average_columns = {
    'Average Instrument Number Match %': 'instrument_number_match',
    'Average Bank Details Match %': 'bank_details_match',
    'Average MICR Account Match %': 'micr_acno_match',
    'Average Instrument Type Match %': 'instrument_type_match',
    'Average Payee Name Match %': 'payee_name_match',
    'Average Amount Match %': 'amount_match',
    'Average Overall Match %': 'overall_match',
}

# Calculate overall statistics (averages are taken over every result row)
match_stats = {
    'Total Records': len(extract_df),
    'Records Matched': len(matched_rows),
    'Records Not Matched': len(unmatched_rows),
}
match_stats.update({
    label: results_df[column].mean()
    for label, column in average_columns.items()
})

# Single-row DataFrame holding the statistics
stats_df = pd.DataFrame([match_stats])

# Display statistics
print("\nSummary Statistics:")
display(stats_df)

# Write detailed results and statistics to separate sheets of one workbook
with pd.ExcelWriter(output_file_path) as writer:
    for sheet_name, frame in (
        ('Detailed Results', results_df),
        ('Summary Statistics', stats_df),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Comparison complete. Results saved to {output_file_path}")

# Bar chart of the average match percentage for each field
match_categories = ['Instrument Number', 'Bank Details', 'MICR Account',
                    'Instrument Type', 'Payee Name', 'Amount', 'Overall']
# Each category maps to the 'Average <category> Match %' entry of match_stats
match_values = [
    match_stats[f'Average {category} Match %']
    for category in match_categories
]

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(match_categories, match_values, color='skyblue')

# Write each bar's value just above it
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 1,
            f'{height:.1f}%', ha='center', va='bottom')

ax.set_title('Average Match Percentages by Field')
ax.set_xlabel('Field')
ax.set_ylabel('Match Percentage (%)')
ax.set_ylim(0, 105)  # Headroom above 100 so the value labels fit
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Pie chart of extract rows with vs. without a gold record
labels = ['Matched Records', 'Unmatched Records']
sizes = [match_stats['Records Matched'], match_stats['Records Not Matched']]
colors = ['#66b3ff', '#ff9999']
explode = (0.1, 0)  # pull the first (matched) slice out for emphasis

fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(sizes, explode=explode, labels=labels, colors=colors,
       autopct='%1.1f%%', shadow=True, startangle=90)
ax.axis('equal')  # Equal aspect ratio keeps the pie circular
ax.set_title('Proportion of Records Matched')
fig.tight_layout()
plt.show()