In [None]:
import pandas as pd
import re

def normalize_string(s):
    """
    Return a canonical form of *s* for loose equality checks.

    String input is trimmed, lower-cased, and stripped of every character
    outside ``a-z``, ``A-Z`` and ``0-9``. Any non-string input is handed back
    untouched.
    """
    # Guard clause: only real strings are normalized.
    if not isinstance(s, str):
        return s

    lowered = s.strip().lower()
    return re.sub(r'[^a-zA-Z0-9]', '', lowered)

def compare_amounts(amount1, amount2):
    """
    Report whether two amounts are numerically equal.

    Both values are converted with ``float()`` first, so ``10``, ``10.0`` and
    ``"10"`` all compare equal to one another. NaN never compares equal,
    not even to another NaN.

    Returns:
        True when both values convert to float and the floats are equal;
        False otherwise, including when either value cannot be converted.
    """
    try:
        return float(amount1) == float(amount2)
    except (TypeError, ValueError):
        # ValueError: text that is not a number, e.g. "abc".
        # TypeError: values float() does not accept at all, e.g. None.
        return False

def extract_filename(filepath):
    """
    Return the final path component of *filepath* without its extension.

    Only the last dot-separated suffix is removed, so ``"a/b/x.y.png"`` yields
    ``"x.y"``. Both forward slashes and backslashes are accepted as directory
    separators. A name with no extension (``"README"``) or with only a leading
    dot (``".env"``) is returned unchanged rather than collapsing to an empty
    string.

    Returns:
        The extension-less filename, or None when *filepath* is not a string.
    """
    if not isinstance(filepath, str):
        return None

    # Normalize Windows separators so the basename split works for either style.
    basename = filepath.replace('\\', '/').rsplit('/', 1)[-1]
    stem, _, _ = basename.rpartition('.')
    # rpartition yields an empty stem when there is no dot (or only a leading
    # one); in that case the whole basename is the filename.
    return stem or basename

# Load the Excel files
try:
    df_extract = pd.read_excel("extract.xlsx")  # Replace with your actual file path
    df_gold = pd.read_excel("Gold.xlsx")      # Replace with your actual file path
except FileNotFoundError as e:
    print(f"Error: {e}")
    # Stop with a non-zero status so callers can tell the run failed.
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist in every run mode.
    raise SystemExit(1)

# Add a new column to df_extract to store the extracted filename
df_extract['extracted_filename'] = df_extract['filepath'].apply(extract_filename)

# Gold columns whose values are concatenated, in this order, to form the payee
# bank details that are compared with micr_scan_payee_details.
_PAYEE_CODE_COLUMNS = (
    'SCAN_PAYEE_BANK_CITY_CODE',
    'SCAN_PAYEE_BANK_CODE',
    'SCAN_PAYEE_BANK_BRANCH_CODE',
)


def _cell_text(value):
    """Return a spreadsheet cell value as text, or None when the cell is blank."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return None
    # A numeric column containing blanks is read as float; drop the
    # artificial ".0" so 400.0 becomes "400" rather than "400.0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _payee_details_match(row_gold, row_extract):
    """Compare Gold's concatenated payee bank codes with the extracted details."""
    gold_parts = [_cell_text(row_gold[column]) for column in _PAYEE_CODE_COLUMNS]
    extract_details = _cell_text(row_extract['micr_scan_payee_details'])
    # A blank cell on either side cannot be a match. Joining the raw values
    # with `+` raised TypeError for a blank Gold code next to text codes, and
    # summed the codes when they were numeric.
    # NOTE(review): if the codes are stored as numbers in Excel, leading zeros
    # are already lost before this point -- confirm the columns are text.
    if extract_details is None or None in gold_parts:
        return False
    return normalize_string(''.join(gold_parts)) == normalize_string(extract_details)


# Index Gold once (INSTRUMENT_ID -> position of its first row) instead of
# scanning the whole frame for every extract row. setdefault keeps the first
# occurrence when an INSTRUMENT_ID is duplicated.
# NOTE(review): extracted filenames are strings, so a numeric INSTRUMENT_ID
# column will never match -- confirm the Gold column is text.
_gold_position_by_id = {}
for _position, _instrument_id in enumerate(df_gold['INSTRUMENT_ID']):
    _gold_position_by_id.setdefault(_instrument_id, _position)

# Initialize a list to store the comparison results as dictionaries
comparison_results = []

# Iterate through each row of df_extract
for _, row_extract in df_extract.iterrows():
    extracted_filename = row_extract['extracted_filename']
    # extract_filename() yields None for non-string filepaths; those never match.
    if isinstance(extracted_filename, str):
        gold_position = _gold_position_by_id.get(extracted_filename)
    else:
        gold_position = None

    if gold_position is None:
        # Handle the case where no matching INSTRUMENT_ID is found in df_gold
        comparison_results.append({
            "Filepath": row_extract['filepath'],
            "INSTRUMENT_ID": None,
            "Filename Match": False,
            "SCAN_INSTRUMENT_NUMBER Match": False,
            "Payee Bank Details Match": False,
            "SCAN_MICR_ACNO Match": False,
            "SCAN_INSTRUMENT_TYPE Match": False,
            "Payee Name Match": False,
            "Amount Match": False,
            "Error": "No matching INSTRUMENT_ID found in Gold data",
        })
        continue

    row_gold = df_gold.iloc[gold_position]

    # Perform the comparisons and store the results.
    # NOTE(review): the raw `==` checks below are type-sensitive (text "123"
    # is not equal to number 123) -- confirm both workbooks store these
    # fields with the same cell type.
    comparison_results.append({
        "Filepath": row_extract['filepath'],
        "INSTRUMENT_ID": row_gold['INSTRUMENT_ID'],  # Kept for reference
        "Filename Match": extracted_filename == row_gold['INSTRUMENT_ID'],
        "SCAN_INSTRUMENT_NUMBER Match": row_gold['SCAN_INSTRUMENT_NUMBER'] == row_extract['micr_scan_instrument_number'],
        "Payee Bank Details Match": _payee_details_match(row_gold, row_extract),
        "SCAN_MICR_ACNO Match": row_gold['SCAN_MICR_ACNO'] == row_extract['micr_scan_micr_acno'],
        "SCAN_INSTRUMENT_TYPE Match": row_gold['SCAN_INSTRUMENT_TYPE'] == row_extract['micr_scan_instrument_type'],
        "Payee Name Match": normalize_string(row_gold['PRES_NAME']) == normalize_string(row_extract['payee_name']),
        "Amount Match": compare_amounts(row_gold['CAR_AMOUNT'], row_extract['amount_numeric']),
    })

# Build the results table from the per-row comparison dictionaries
df_results = pd.DataFrame(comparison_results)

# For every "... Match" column, the percentage of rows whose value is truthy
match_percentage = {
    column: (df_results[column].sum() / len(df_results)) * 100
    for column in df_results.columns
    if column.endswith("Match")
}

# Report one line per comparison, formatted to two decimals
print("Match Percentages:")
for column, percentage in match_percentage.items():
    print(f"{column}: {percentage:.2f}%")

# Persist the full row-by-row results next to the inputs (index column omitted)
df_results.to_excel("comparison_results.xlsx", index=False)
print("Comparison results saved to comparison_results.xlsx")
