# In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path

# Cell 1: Load the data
print("Loading Excel files...")

# Input workbooks: edit these two paths to point at your own files
old_excel_path = "old_data.xlsx"
new_excel_path = "new_data.xlsx"

# Read both workbooks (old first, then new) into their own DataFrames
df_old, df_new = (
    pd.read_excel(workbook) for workbook in (old_excel_path, new_excel_path)
)

# Report the (rows, columns) of each input as a quick sanity check
print(f"Old data shape: {df_old.shape}")
print(f"New data shape: {df_new.shape}")

# Cell 2: Extract unique identifiers from filepaths
# The identifier is stored in a 'file_id' column on both dataframes and is
# the key the OLD and NEW rows are later merged on.
print("Extracting file identifiers...")

def extract_file_identifier(filepath):
    """Return the last two non-empty path segments of ``filepath`` joined by '/'.

    The result is used as a location-independent key, so the same file stored
    under different root directories (or exported with a different path
    separator) maps to the same identifier.

    Args:
        filepath: Path string using '/' or '\\' as separator. Any non-string
            value (e.g. NaN from an empty spreadsheet cell) is rejected.

    Returns:
        The last two segments joined with '/', a single segment if the path
        has only one, '' if the string has no non-blank segment, or None when
        ``filepath`` is not a string.
    """
    if not isinstance(filepath, str):
        return None

    # Treat Windows-style separators like POSIX ones; otherwise a
    # backslash-separated path is never split and the whole path becomes the id.
    normalised = filepath.replace('\\', '/')

    # Trim each segment so stray spreadsheet whitespace cannot make two
    # otherwise identical identifiers differ, then drop the empty ones
    # (leading/trailing/double separators).
    trimmed = (segment.strip() for segment in normalised.split('/'))
    segments = [segment for segment in trimmed if segment]

    return '/'.join(segments[-2:])

# Derive the 'file_id' key column on both inputs from their 'filepath' column
for frame in (df_old, df_new):
    frame['file_id'] = frame['filepath'].apply(extract_file_identifier)

# Show the first few identifiers from each side for a visual check
print("Sample file identifiers from OLD data:")
old_sample = df_old['file_id'].head()
print(old_sample)
print("\nSample file identifiers from NEW data:")
new_sample = df_new['file_id'].head()
print(new_sample)

# Cell 3: Merge the dataframes based on file_id
print("Merging dataframes...")

# pandas matches null merge keys against each other and pairs every
# combination of rows that share a key, so missing or repeated identifiers
# silently inflate the merged row count (and the match statistics computed
# from it). Report them up front; the merge itself is left unchanged.
for source_label, source_df in (("OLD", df_old), ("NEW", df_new)):
    source_ids = source_df['file_id']
    missing_count = int(source_ids.isna().sum())
    repeated_count = int(source_ids.dropna().duplicated(keep=False).sum())
    if missing_count:
        print(f"Warning: {missing_count} {source_label} row(s) have no file_id")
    if repeated_count:
        print(
            f"Warning: {repeated_count} {source_label} row(s) share a file_id "
            "with another row"
        )

# Rename columns to indicate their source; 'file_id' keeps its name because
# it is the shared merge key
df_old_renamed = df_old.rename(columns={col: f"{col} OLD" for col in df_old.columns if col != 'file_id'})
df_new_renamed = df_new.rename(columns={col: f"{col} NEW" for col in df_new.columns if col != 'file_id'})

# Outer merge keeps rows that exist on only one side (their other side is NaN)
merged_df = pd.merge(df_old_renamed, df_new_renamed, on='file_id', how='outer')

print(f"Merged dataframe shape: {merged_df.shape}")

# Cell 4: Define the fields to compare
# Base column names; each one is looked up as "<name> OLD" and "<name> NEW"
# in the merged dataframe.
comparison_fields = [
    'bank_name', 'bank_branch', 'account_number',
    'date', 'payee_name',
    'amount_words', 'amount_numeric', 'currency',
    'issuer_name', 'micr_code', 'IFSC',
]

# Cell 5: Create comparison columns
# One "<field> match OLD NEW" column holding 'Y' or 'N' is added for each
# compared field whose OLD and NEW columns both exist.
print("Creating comparison columns...")

# Function to compare values
def compare_values(old_val, new_val):
    """Return 'Y' when two cell values match and 'N' when they do not.

    Matching rules:
      * two missing values (None / NaN / NaT) match each other;
      * a missing value never matches a present one;
      * otherwise values are compared as trimmed, case-insensitive strings,
        with whole-number floats written without the trailing ".0" so that
        ``100`` and ``100.0`` (the same number read with different dtypes)
        are treated as equal.

    Args:
        old_val: Scalar cell value from the OLD sheet.
        new_val: Scalar cell value from the NEW sheet.

    Returns:
        'Y' or 'N'.
    """
    old_missing = pd.isna(old_val)
    new_missing = pd.isna(new_val)
    if old_missing and new_missing:
        return 'Y'  # Both missing: nothing to disagree about
    if old_missing or new_missing:
        return 'N'  # Present on one side only

    def _canonical(value):
        # pandas upcasts an integer column to float as soon as it contains a
        # blank cell, so the same number may arrive as 100 on one side and
        # 100.0 on the other; render whole-number floats as integers.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip().lower()

    try:
        return 'Y' if _canonical(old_val) == _canonical(new_val) else 'N'
    except Exception:
        # Best effort: a value that cannot be rendered as text is a mismatch
        return 'N'

# Create comparison columns
for field in comparison_fields:
    old_col = f"{field} OLD"
    new_col = f"{field} NEW"

    # Skip if either column is missing
    if old_col not in merged_df.columns or new_col not in merged_df.columns:
        print(f"Skipping {field} comparison: Column(s) missing")
        continue

    match_col = f"{field} match OLD NEW"
    # Walk the two columns in lockstep instead of DataFrame.apply(axis=1),
    # which builds a full Series for every row just to read two cells.
    merged_df[match_col] = [
        compare_values(old_val, new_val)
        for old_val, new_val in zip(merged_df[old_col], merged_df[new_col])
    ]

# Cell 6: Rearrange columns as per requested output format
print("Rearranging columns...")

# Leading columns: Instrument ID and filepath from the OLD sheet, when present
desired_columns = [
    col
    for col in ('Instrument ID OLD', 'filepath OLD')
    if col in merged_df.columns
]

# Then, field by field, every related column that actually exists, in the
# order: OLD value, NEW value, match flag, OLD validity, OLD and NEW confidence
for field in comparison_fields:
    candidates = (
        f"{field} OLD",
        f"{field} NEW",
        f"{field} match OLD NEW",
        f"{field}_valid OLD",
        f"{field}_conf OLD",
        f"{field}_conf NEW",
    )
    desired_columns.extend(col for col in candidates if col in merged_df.columns)

# Keep only the selected columns, in the selected order
final_df = merged_df[desired_columns]

print(f"Final dataframe shape: {final_df.shape}")
print(f"Final columns: {len(final_df.columns)}")

# Cell 7: Save the result to a new Excel file
output_path = "combined_comparison.xlsx"
print(f"Saving result to {output_path}...")

# index=False keeps the dataframe's row index out of the workbook
final_df.to_excel(output_path, index=False)

print(f"Process completed. File saved to {output_path}")

# Cell 8: Verify the output format
print("First few rows of the output:")
# A bare `final_df.head()` is only rendered when it is the last expression of
# a notebook cell; here more code follows, so print it explicitly.
print(final_df.head())

# Cell 9: Field matching statistics
print("Match statistics:")

# Every flag column produced by the comparison step carries this marker
match_columns = [name for name in final_df.columns if 'match OLD NEW' in name]

for col in match_columns:
    flags = final_df[col].dropna()  # null flags are left out of the totals
    total = len(flags)
    matches = int((flags == 'Y').sum())
    if total > 0:
        percent_match = matches / total * 100
    else:
        percent_match = 0  # avoid dividing by zero on an empty column
    print(f"{col}: {matches}/{total} matches ({percent_match:.2f}%)")