# In [33]:
# Added important and non-important column definitions.

import time
import os
import re
import logging
from datetime import datetime
import pandas as pd
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from google.cloud.bigquery import SchemaField
import traceback
import sys

# Configure root logging once for the whole script: INFO level, timestamped lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Active BigQuery target (production). PROJECT_ID is also interpolated directly
# into the SQL built by load_table_from_bigquery.
PROJECT_ID = 'fynd-jio-impetus-prod'       # GCP project that owns the dataset
DATASET_ID = 'Impetus_dev_prod'                 # Dataset holding master and prefixed tables
PREFIXES = ['procuro_', 'costing_engine_', 'scan_pack_', 'pigeon_']  # Table-name prefixes of downstream systems
# Non-production alternative (swap with the three assignments above to use it):
# PROJECT_ID = 'fynd-jio-impetus-non-prod'
# DATASET_ID = 'Impetus_dev_sit'
# PREFIXES = ['procuro_', 'costing_engine_', 'scan_pack_', 'pigeon_']



# Module-level error log list.
# NOTE(review): presumably collects the error_logs_m dicts returned by the
# check functions below; its consumer is outside this section - confirm.
ERROR_LOG_M = []

# Mapping of base table names to their key columns in master and target tables.
#
# Each entry is keyed by the base table name and may contain:
#   'master_key'     - key column name in the master table.
#   'master_table'   - optional explicit master table name; when absent,
#                      find_common_tables_with_master_hub falls back to
#                      'master_hub_<base name>'.
#   'targets'        - {table prefix: key column name in '<prefix><base name>'}.
#   'active_filter'  - {'column', 'value'} pair.
#                      NOTE(review): presumably restricts rows to active records;
#                      the consumer is outside this section - confirm.
#   'perform_checks' - boolean flag.
#                      NOTE(review): per the inline notes below, False appears to
#                      mean "key comparison only" - confirm against the caller.
#   'column_mapping' - optional {master column: target column} renames, as
#                      accepted by find_mismatches.
#
# Commented-out entries are disabled configurations kept for reference.
BASE_TABLES = {
    # 'brand': {
    #     'master_key': 'code',
    #     'targets': {
    #         'procuro_': 'code',
    #         'costing_engine_': 'code'
    #     },
    #     'active_filter': {
    #         'column': 'is_active',
    #         'value': True
    #     },
    #     'perform_checks': True  # Default behavior
    # },
    # 'brand_pm_mapping': {
    #     'master_key': 'pm_id',
    #     'targets': {
    #         'costing_engine_': 'pm_id'
    #     },
    #     'perform_checks': True
    # },
    # 'brick': {
    #     'master_key': 'brick_code',
    #     'targets': {
    #         'costing_engine_': 'code'
    #     },
    #     'perform_checks': True
    # },
    # 'coe_bom_element_type_mapping': {
    #     'master_key': 'coe_name',
    #     'targets': {
    #         'costing_engine_': 'coe_name'
    #     },
    #     'perform_checks': True
    # },
    # 'event_log': {
    #     'master_key': 'user_id',
    #     'targets': {
    #         'costing_engine_': 'user_id'
    #     },
    #     'perform_checks': True
    # },
    'supplier': {
        'master_key': 'supplier_code',
        'targets': {
            'procuro_': 'supplier_code',
            'costing_engine_': 'supplier_code'
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    'vendor_details': {  # Compared against the supplier master rather than a same-named master table
        'master_key': 'supplier_code',  # Using supplier_code as the key
        'master_table': 'master_hub_supplier',  # Specify the master table explicitly
        'targets': {
            'scan_pack_': 'vendor_code'  # Target key column has a different name than the master key
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    # 'hsn_tax_mapping': {  # Base table for HSN codes
    #     'master_key': 'hsn_code',  # Assuming 'hsn_code' is the key column
    #     'master_table': 'master_hub_hsn',
    #     'targets': {
    #         'procuro_': 'hsn_code',
    #     },
    #     'perform_checks': False  # Only perform key comparisons
    # },
    # 'config_buyer_brand_mapping': {
    #     'master_key': 'id',
    #     'master_table': 'master_hub_buyer_brand_mapping',
    #     'targets': {
    #         'costing_engine_': 'id'
    #     },
    #     'column_mapping': {  # Mapping of master columns to target columns
    #         'brand_id': 'brand_code'
    #     },
    #     'perform_checks': True
    # },
}

# Non-important columns, keyed by base table name. find_mismatches skips any
# column passed to it through its non_imp_columns argument.
non_imp_columns = {
    'supplier': ['id', '_id', 'updated_at', 'created_at'],
    'vendor_details': ['id', '_id', 'updated_at', 'created_at']  # Add if applicable
}

# Important columns, keyed by base table name.
# NOTE(review): none of the base tables currently enabled in BASE_TABLES
# ('supplier', 'vendor_details') has an entry here; how a missing entry is
# handled is decided outside this section - confirm.
imp_columns = {
    'brand': ['name', 'id', 'slug', 'code'],
    'brick': ['name', 'id', 'brick_code', 'description', 'class_code'],
    'config_buyer_brand_mapping': ['buyer_email', 'brand_code', 'id', 'buyer_id', 'is_active', 'buyer_name'],
    'brand_pm_mapping': ['pm_id', 'brand_code', 'pm_email', 'is_active', 'pm_name', 'id'],
    'coe_bom_element_type_mapping': ['is_active','coe_id','id','coe_name','coe_approver_email','element_type']
    # Add more base tables and their important columns as needed
}

# Slack configuration
# SECURITY: a bot token is a credential and must not live in source control.
# It is read from the SLACK_TOKEN environment variable; any token that was
# previously committed to this file should be revoked and re-issued.
SLACK_TOKEN = os.environ.get("SLACK_TOKEN") or None
# Channel IDs are not secret, so the existing channel stays as the default and
# can be overridden with SLACK_CHANNEL. Previously used channel: C07UN19ETK5
SLACK_CHANNEL = os.environ.get("SLACK_CHANNEL") or "C08310RS2PK"


# Initialize Slack client (None disables Slack notifications).
if SLACK_TOKEN and SLACK_CHANNEL:
    slack_client = WebClient(token=SLACK_TOKEN)
    logging.info("Slack client initialized successfully.")
else:
    slack_client = None
    logging.warning("Slack token or channel not found. Slack notifications will be disabled.")

def get_bigquery_client(project_id):
    """
    Build a BigQuery client bound to the given project.

    Args:
        project_id (str): GCP project ID the client should operate in.

    Returns:
        bigquery.Client: The newly created client.

    Raises:
        Exception: Whatever client construction raised; it is logged and
            re-raised unchanged.
    """
    try:
        bq_client = bigquery.Client(project=project_id)
        logging.info("BigQuery client initialized successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize BigQuery client: {e}")
        raise
    return bq_client

def find_common_tables_with_master_hub(client, dataset_name, prefixes, base_tables):
    """
    Find, for each configured base table, the master table and the prefixed
    target tables that actually exist in the dataset.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset to search within.
        prefixes (list): Accepted for interface compatibility; not used. The
            prefixes that are checked come from each entry's 'targets' mapping.
        base_tables (dict): The BASE_TABLES-style dictionary. Each value may
            define 'master_table' (defaults to 'master_hub_<base name>') and
            'targets' ({prefix: key column}).

    Returns:
        dict: {base name: {'master_hub_': master table, prefix: target table, ...}}
            containing only base names whose master table exists and that have at
            least one existing target table. Returns {} if listing tables fails.
    """
    try:
        # Reference the dataset and collect the table IDs it contains.
        dataset_ref = client.dataset(dataset_name)
        # A set makes the repeated membership tests below O(1).
        table_names = {table.table_id for table in client.list_tables(dataset_ref)}
        logging.info(f"Found {len(table_names)} tables in dataset '{dataset_name}'.")

        common_tables_with_prefixes = {}
        for base_name, config in base_tables.items():
            master_table = config.get('master_table', f'master_hub_{base_name}')
            if master_table not in table_names:
                logging.warning(f"Master table '{master_table}' for base '{base_name}' not found in dataset.")
                continue

            matches = {'master_hub_': master_table}
            # Only the prefix is needed here; the key column is used by the callers.
            for prefix in config.get('targets', {}):
                target_table = f"{prefix}{base_name}"
                if target_table in table_names:
                    matches[prefix] = target_table

            # Keep only base names that have something to compare the master with.
            if len(matches) > 1:
                common_tables_with_prefixes[base_name] = matches

        logging.info(f"Identified {len(common_tables_with_prefixes)} common base names with 'master_hub_' and other specified prefixes.")
        return common_tables_with_prefixes

    except GoogleAPIError as e:
        # str(e) is used because the GoogleAPIError base class does not
        # guarantee a `.message` attribute.
        logging.error(f"Google API Error: {e}")
        return {}
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return {}

def get_table_schema(client, dataset_name, table_name):
    """
    Retrieve the schema of a specified BigQuery table.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        dict: A dictionary mapping column names to their data types
            (the `field_type` of each schema field). Returns {} if the table
            cannot be fetched.
    """
    try:
        table_ref = client.dataset(dataset_name).table(table_name)
        table = client.get_table(table_ref)
        schema = {field.name: field.field_type for field in table.schema}
        logging.info(f"Retrieved schema for table '{table_name}'.")
        return schema
    except GoogleAPIError as e:
        # str(e) is used because the GoogleAPIError base class does not
        # guarantee a `.message` attribute.
        logging.error(f"Failed to retrieve schema for table '{table_name}': {e}")
        return {}
    except Exception as e:
        logging.error(f"An unexpected error occurred while retrieving schema for table '{table_name}': {e}")
        return {}

def load_table_from_bigquery(client, dataset_name, table_name):
    """
    Load a table from BigQuery into a Pandas DataFrame.

    The query selects every column and row of
    `<PROJECT_ID>.<dataset_name>.<table_name>`; note that the project comes from
    the module-level PROJECT_ID constant, not from the client.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        pd.DataFrame: DataFrame containing the table data, or an empty
            DataFrame if the query fails.
    """
    try:
        query = f"SELECT * FROM `{PROJECT_ID}.{dataset_name}.{table_name}`"
        df = client.query(query).to_dataframe()
        logging.info(f"Loaded data from table '{table_name}' into DataFrame.")
        return df
    except GoogleAPIError as e:
        # str(e) is used because the GoogleAPIError base class does not
        # guarantee a `.message` attribute.
        logging.error(f"Failed to load table '{table_name}': {e}")
        return pd.DataFrame()
    except Exception as e:
        logging.error(f"An unexpected error occurred while loading table '{table_name}': {e}")
        return pd.DataFrame()

def standardize_dataframe(df, exclude_columns=None):
    """
    Return a copy of the DataFrame in which every non-excluded column is cast
    to string, stripped of surrounding whitespace and lowercased.

    Every column is converted, not only string columns. Because the cast uses
    `astype(str)`, missing values generally come out as literal strings such as
    'nan' or 'none' rather than staying null.

    Args:
        df (pd.DataFrame): The DataFrame to standardize. It is not modified.
        exclude_columns (list, optional): Columns to leave untouched.
            Defaults to None, meaning no column is excluded.

    Returns:
        pd.DataFrame: Standardized copy of `df`.
    """
    # None instead of a mutable [] default, so no list is shared between calls.
    excluded = () if exclude_columns is None else exclude_columns
    df_copy = df.copy()
    for col in df_copy.columns:
        if col in excluded:
            continue  # Skip standardizing this column
        df_copy[col] = df_copy[col].astype(str).str.strip().str.lower()
    logging.info("Standardized DataFrame for comparison.")
    return df_copy

def find_common_and_non_common_columns(df1, df2):
    """
    Identify common and unique columns between two DataFrames.

    The returned lists are de-duplicated and deterministic: common and
    df1-only columns follow df1's column order, df2-only columns follow df2's.

    Args:
        df1 (pd.DataFrame): First DataFrame.
        df2 (pd.DataFrame): Second DataFrame.

    Returns:
        tuple: (common_columns, df1_unique_columns, df2_unique_columns)
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    ordered_df1 = list(dict.fromkeys(df1.columns))
    ordered_df2 = list(dict.fromkeys(df2.columns))
    in_df1 = set(ordered_df1)
    in_df2 = set(ordered_df2)

    common_columns = [col for col in ordered_df1 if col in in_df2]
    df1_unique_columns = [col for col in ordered_df1 if col not in in_df2]
    df2_unique_columns = [col for col in ordered_df2 if col not in in_df1]
    logging.info(f"Found {len(common_columns)} common columns, {len(df1_unique_columns)} unique to first table, {len(df2_unique_columns)} unique to second table.")
    return common_columns, df1_unique_columns, df2_unique_columns

def find_mismatches(df_master, df_target, columns_to_check, master_key, target_key, table1, table2, duplicates_master, duplicates_target, non_imp_columns, column_mapping=None):
    """
    Identify value mismatches between master and target rows that share a key.

    Rows are matched with an inner join on the key after dropping duplicate keys
    on each side (first occurrence kept). Every non-key column is tagged with its
    table name before the join, so a master column can be compared with a
    differently named target column given in `column_mapping`.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
        df_target (pd.DataFrame): Target DataFrame (prefixed table).
        columns_to_check (list): Master column names to compare.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        table1 (str): Name of the source table.
        table2 (str): Name of the target table.
        duplicates_master (pd.DataFrame): Accepted for interface compatibility; not used.
        duplicates_target (pd.DataFrame): Accepted for interface compatibility; not used.
        non_imp_columns (list): Columns to exclude from the comparison.
        column_mapping (dict, optional): Mapping of master columns to target columns. Defaults to None.

    Returns:
        tuple: (mismatches, error_logs_m) - two parallel lists of dicts, ordered
            row by row and, within a row, in `columns_to_check` order. Both are
            empty if a key column is missing.
    """
    mismatches = []
    error_logs_m = []
    # Ensure key columns are present in both DataFrames
    if master_key not in df_master.columns or target_key not in df_target.columns:
        logging.error(f"Key columns '{master_key}' or '{target_key}' not found in the respective tables.")
        return mismatches, error_logs_m

    column_mapping = column_mapping or {}
    master_suffix = f'_{table1}'
    target_suffix = f'_{table2}'

    # Tag every non-key column explicitly. pandas' merge `suffixes` only touch
    # names present on BOTH sides, which would leave mapped columns (different
    # names on each side) unsuffixed and therefore never compared.
    master_side = df_master.rename(
        columns=lambda name: name if name == master_key else f"{name}{master_suffix}"
    ).drop_duplicates(subset=master_key)
    target_side = df_target.rename(
        columns=lambda name: master_key if name == target_key else f"{name}{target_suffix}"
    ).drop_duplicates(subset=master_key)

    merged_df = pd.merge(master_side, target_side, on=master_key, how='inner')

    logging.info(f"Merged DataFrame has {len(merged_df)} records for mismatch comparison.")

    # Resolve the column pairs once instead of once per row.
    comparisons = []
    for master_col in columns_to_check:
        if master_col.startswith('_boltic_') or master_col in non_imp_columns:
            continue  # Skip columns starting with '_boltic_' or non-important columns
        target_col = column_mapping.get(master_col, master_col)
        master_value_col = f"{master_col}{master_suffix}"
        target_value_col = f"{target_col}{target_suffix}"
        # The key column itself carries no suffix, so it is never compared.
        if master_value_col in merged_df.columns and target_value_col in merged_df.columns:
            comparisons.append((master_col, master_value_col, target_value_col))

    for _, row in merged_df.iterrows():
        key = row[master_key]
        for master_col, master_value_col, target_value_col in comparisons:
            val_master = row[master_value_col]
            val_target = row[target_value_col]

            master_missing = pd.isna(val_master)
            target_missing = pd.isna(val_target)
            if master_missing and target_missing:
                continue  # Both are missing, treat as equal
            if master_missing or target_missing or val_master != val_target:
                mismatches.append({
                    master_key: key,
                    'column': master_col,  # Report master column name
                    f'{table1}_value': val_master,
                    f'{table2}_value': val_target
                })
                error_logs_m.append({
                    'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'issue': 'mismatch',
                    'error_message': '',
                    'source_table': table1,
                    'target_table': table2,
                    'issue_column': master_col,
                    'unique_identifier': f'{master_key}: {key}'
                })

    logging.info(f"Found {len(mismatches)} mismatches between '{table1}' and '{table2}'.")
    return mismatches, error_logs_m

def find_duplicates(df, key_column, table_name):
    """
    Detect duplicate key_column entries in the DataFrame and describe how the
    duplicated rows differ from each other.

    For every key occurring more than once, the remaining columns (ignoring the
    key itself and any column starting with '_boltic_') are compared across the
    duplicated rows. A missing value next to a real value counts as a difference.
    Rows whose key is null are not reported, because groupby drops null keys by
    default.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        key_column (str): The key column to check for duplicates.
        table_name (str): Name of the table being checked.

    Returns:
        tuple: (duplicate_records_df, error_logs_m). The DataFrame has one row
            per duplicated key with columns [key_column, 'Difference in value'];
            it is empty (no columns) when nothing is duplicated or the key
            column is missing.
    """
    if key_column not in df.columns:
        logging.error(f"Key column '{key_column}' not found in DataFrame.")
        return pd.DataFrame(), []

    # Get all duplicate entries (keep=False to get all duplicates)
    duplicates_df = df[df.duplicated(subset=key_column, keep=False)]

    # Columns that never count as a difference.
    ignored_columns = [key_column] + [
        col for col in duplicates_df.columns
        if col != key_column and col.startswith('_boltic_')
    ]

    duplicate_records = []
    error_logs_m = []

    for key, group in duplicates_df.groupby(key_column):
        if len(group) <= 1:
            continue  # Not a duplicate

        comparable = group.drop(columns=ignored_columns)

        # A column differs when it holds more than one distinct value in the
        # group. Identical rows give exactly 1 per column, so testing "> 1" is
        # what detects "no difference" (a sum of 0 would require all-null data).
        distinct_counts = comparable.nunique(dropna=False)
        cols_with_diff = distinct_counts.index[distinct_counts > 1].tolist()
        if cols_with_diff:
            difference = "Difference in value of columns: " + ', '.join(cols_with_diff)
        else:
            difference = "No difference exists"

        duplicate_records.append({
            key_column: key,
            'Difference in value': difference
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'duplicate',
            'error_message': f'{difference}',
            'source_table': f'{table_name}',
            'target_table': '',
            'issue_column': '',
            'unique_identifier': f'{key_column}: {key}'
        })

    logging.info(f"Found {len(duplicate_records)} duplicate entries based on '{key_column}'.")
    return pd.DataFrame(duplicate_records), error_logs_m

def validate_data_types(schema_master, schema_target, master_key, table1_name, table2_name, columns_to_check):
    """
    Compare data types of specified columns between master and target schemas.

    Only columns that appear in `columns_to_check` and in both schemas are
    compared. Issues are reported in `columns_to_check` order (each column at
    most once), so repeated runs produce identically ordered reports.

    Args:
        schema_master (dict): Schema of the master table ({column: type}).
        schema_target (dict): Schema of the target table ({column: type}).
        master_key (str): Accepted for interface compatibility; not used.
        table1_name (str): Name of the first table.
        table2_name (str): Name of the second table.
        columns_to_check (list): List of columns to validate data types.

    Returns:
        tuple: (data_type_issues_df, error_logs_m)
    """
    data_type_issues = []
    error_logs_m = []

    # dict.fromkeys de-duplicates while preserving the caller's order.
    for column in dict.fromkeys(columns_to_check):
        if column not in schema_master or column not in schema_target:
            continue
        type_master = schema_master[column]
        type_target = schema_target[column]
        if type_master == type_target:
            continue

        data_type_issues.append({
            'column_name': column,
            f'{table1_name}_data_type': type_master,
            f'{table2_name}_data_type': type_target
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'data_type_issues',
            'error_message': f'{table1_name}_data_type: {type_master} , {table2_name}_data_type: {type_target}',
            'source_table': table1_name,
            'target_table': table2_name,
            'issue_column': column,
            'unique_identifier': ''
        })

    logging.info(f"Found {len(data_type_issues)} data type issues.")
    return pd.DataFrame(data_type_issues), error_logs_m

# Compiled once at import time instead of being re-parsed for every row.
_GSTIN_PATTERN = re.compile(r'^[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z]{1}[A-Z0-9]{3}$')
_EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
_PINCODE_PATTERN = re.compile(r'^\d{6}$')
_MAX_ADDRESS_LENGTH = 100

# One entry per validated column:
# (column, is_invalid(stripped text), label in the report, message in the error
#  log, whether the report shows the stripped text instead of the raw value)
_FORMAT_RULES = (
    ('gstin', lambda text: not _GSTIN_PATTERN.match(text),
     'Invalid GSTIN format', 'Invalid GSTIN format', False),
    ('email', lambda text: not _EMAIL_PATTERN.match(text),
     'Invalid email format', 'Invalid email format', False),
    ('pincode', lambda text: not _PINCODE_PATTERN.match(text),
     'Pincode must be exactly 6 digits', 'Pincode must be exactly 6 digits', False),
    ('address', lambda text: len(text) > _MAX_ADDRESS_LENGTH,
     'Address exceeds 100 characters after stripping', 'Address exceeds 100 characters', True),
)


def _first_row_position_by_key(df_target, target_key):
    """Map each stripped, stringified target key to the position of its first row."""
    if target_key not in df_target.columns:
        return {}
    positions = {}
    for position, key in enumerate(df_target[target_key].astype(str).str.strip()):
        positions.setdefault(key, position)
    return positions


def validate_formats(df_master, df_target, key_column, target_key, target_table, master_table, columns_to_check):
    """
    Validate specific column formats in the master table and include the
    corresponding target table values.

    A rule is applied only when its column is listed in `columns_to_check` and
    exists in `df_master`. Values are stringified and stripped before testing:
      * gstin   - 2 digits, 5 uppercase letters, 4 digits, 1 uppercase letter,
                  3 uppercase alphanumerics.
      * email   - simple local@domain.tld pattern.
      * pincode - exactly 6 digits.
      * address - at most 100 characters.
    The patterns are case-sensitive. NOTE(review): if callers pass a lowercased
    (standardized) DataFrame, every GSTIN would be flagged - confirm which
    DataFrame is passed in.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        master_table (str): The name of the master table.
        columns_to_check (list): List of columns to validate formats.

    Returns:
        tuple: (format_issues_df, error_logs_m). The DataFrame has the columns
            [key_column, 'column', 'value', 'issue', '<target_table>_value'], one
            row per violation in master row order. The last column holds the
            value of the same column in the first target row with the same key,
            "Column not present" if the target lacks that column, or
            "'<target_key>' not present" if no target row has that key (or the
            target has no such key column).
    """
    target_value_column = f'{target_table}_value'
    report_columns = [key_column, 'column', 'value', 'issue', target_value_column]
    issue_rows = []
    error_logs_m = []

    active_rules = [
        rule for rule in _FORMAT_RULES
        if rule[0] in columns_to_check and rule[0] in df_master.columns
    ]

    # Built on the first violation only, then reused: avoids re-stringifying the
    # whole target key column for every offending row.
    target_positions = None

    if active_rules:
        for _, row in df_master.iterrows():
            key_value = str(row[key_column]).strip()

            for column, is_invalid, issue_label, log_message, report_stripped in active_rules:
                text = str(row[column]).strip()
                if not is_invalid(text):
                    continue

                if target_positions is None:
                    target_positions = _first_row_position_by_key(df_target, target_key)

                position = target_positions.get(key_value)
                if position is None:
                    target_value = f"'{target_key}' not present"
                elif column in df_target.columns:
                    target_value = df_target.iloc[position][column]
                else:
                    target_value = "Column not present"

                issue_rows.append({
                    key_column: key_value,
                    'column': column,
                    'value': text if report_stripped else row[column],
                    'issue': issue_label,
                    target_value_column: target_value
                })
                error_logs_m.append({
                    'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'issue': 'format_issue',
                    'error_message': log_message,
                    'source_table': master_table,
                    'target_table': '',
                    'issue_column': column,
                    'unique_identifier': f'{key_column}: {key_value}'
                })

    # Build the frame once; concatenating inside the loop is quadratic.
    format_issues = pd.DataFrame(issue_rows, columns=report_columns)

    logging.info(f"Found {len(format_issues)} format issues.")
    return format_issues, error_logs_m


def create_table(doc, data, column_names):
    """
    Render a sequence of records as a styled table in a Word document.

    Nothing is added when `data` is empty. Otherwise a header row is written
    from `column_names`, followed by one row per record; each cell holds the
    stripped string form of the record's value for that column, or an empty
    string when the record lacks the column.

    Args:
        doc (Document): The Word document object.
        data (list of dict): Records to populate the table; each must support `.get`.
        column_names (list): List of column names for the table headers.
    """
    if not data:
        return

    table = doc.add_table(rows=1, cols=len(column_names))
    table.style = 'Light List Accent 1'

    # Header row.
    for header_cell, title in zip(table.rows[0].cells, column_names):
        header_cell.text = title

    # One body row per record.
    for record in data:
        body_cells = table.add_row().cells
        for body_cell, title in zip(body_cells, column_names):
            body_cell.text = str(record.get(title, '')).strip()

    logging.info("Added table to the Word document.")


def add_non_matching_keys_section(doc, df1_only_keys, table1_name, df2_only_keys, table2_name, key_column_master, key_column_target):
    """
    Write the "keys found on one side only" section of the report.

    For each side that has unmatched keys, a level-2 heading with the count is
    added, followed by a single-column table of those keys. When neither side
    has unmatched keys, a single explanatory paragraph is added instead.

    Args:
        doc (Document): The Word document object.
        df1_only_keys (list): Records for keys present only in table1; each is
            indexed by `key_column_master`.
        table1_name (str): Name of the first table.
        df2_only_keys (list): Records for keys present only in table2; each is
            indexed by `key_column_target`.
        table2_name (str): Name of the second table.
        key_column_master (str): The key column in the master table.
        key_column_target (str): The key column in the target table.
    """
    if not df1_only_keys and not df2_only_keys:
        doc.add_paragraph("No non-matching keys found.")
        return

    if df1_only_keys:
        master_heading = f"'{key_column_master}' present only in '{table1_name}' and not in '{table2_name}' ({len(df1_only_keys)})"
        doc.add_heading(master_heading, level=2)
        master_rows = [{key_column_master: entry[key_column_master]} for entry in df1_only_keys]
        create_table(doc, master_rows, [key_column_master])

    if df2_only_keys:
        target_heading = f"'{key_column_target}' present only in '{table2_name}' and not in '{table1_name}' ({len(df2_only_keys)})"
        doc.add_heading(target_heading, level=2)
        target_rows = [{key_column_target: entry[key_column_target]} for entry in df2_only_keys]
        create_table(doc, target_rows, [key_column_target])

def add_table_of_contents(doc):
    """
    Append a paragraph holding a TOC field to the Word document.

    The field is assembled from raw XML elements appended to a single run, in
    this order: fldChar 'begin', the instrText carrying the TOC instruction,
    fldChar 'separate', fldChar 'end'.

    Args:
        doc (Document): The Word document object.
    """
    def _field_char(char_type):
        # Build one <w:fldChar> marker of the requested type.
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), char_type)
        return marker

    toc_run = doc.add_paragraph().add_run()

    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = 'TOC \\o "1-2" \\h \\z \\u'  # field switches; change to what you need

    field_parts = (
        _field_char('begin'),
        instruction,
        _field_char('separate'),
        _field_char('end'),
    )
    for part in field_parts:
        toc_run._r.append(part)

    logging.info("Added Table of Contents to the Word document.")

def _add_result_section(doc, rows, title, empty_heading, column_names):
    """
    Write one report section: a counted heading plus a table, or an "empty" heading.

    Args:
        doc (Document): The Word document object.
        rows (pd.DataFrame | list): Findings for this section. A DataFrame is
            converted to a list of row dictionaries before it is rendered.
        title (str): Heading used when findings exist; the row count is appended
            as " (N)".
        empty_heading (str): Heading used when there are no findings.
        column_names (list): Columns to render in the table, in order.
    """
    is_frame = isinstance(rows, pd.DataFrame)
    has_rows = (not rows.empty) if is_frame else bool(rows)

    if not has_rows:
        doc.add_heading(empty_heading, level=2)
        return

    doc.add_heading(f'{title} ({len(rows)})', level=2)
    records = rows.to_dict('records') if is_frame else rows
    create_table(doc, records, column_names)


def create_aggregated_document(all_results, base_name):
    """
    Creates a single Word document that presents all comparison results for a base table.

    The document starts with a title, a generation timestamp, a note on how to
    refresh the Table of Contents and the TOC field itself. Each comparison
    result then gets its own level-1 heading followed by one level-2 section
    per check, and ends with a page break. The file is saved in the current
    working directory.

    Args:
        all_results (list): List of comparison result dictionaries as returned
            by compare_tables. Falsy entries (compare_tables returns None for
            skipped comparisons) are ignored.
        base_name (str): The base name of the table.

    Returns:
        str: The filepath of the saved report.
    """
    doc = Document()
    doc.add_heading(f'{base_name.capitalize()} Tables Comparison Report', level=0)
    doc.add_paragraph(f'Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')

    # Add Instruction for TOC Update
    doc.add_paragraph(
        "📌 **Note:** To update the Table of Contents and make the links clickable, go to the ‘References’ tab and click ‘Update Table’ or press F9 in Windows and Fn+F9 in Mac after opening this document in Microsoft Word.",
        style='Intense Quote'
    )

    # Add Table of Contents
    doc.add_heading('Table of Contents', level=1)
    add_table_of_contents(doc)
    doc.add_page_break()

    for result in all_results:
        if not result:
            # A skipped comparison yields None; there is nothing to report for it.
            continue

        table1_name = result['table1_name']
        table2_name = result['table2_name']
        key_column_master = result['key_column_master']
        key_column_target = result['key_column_target']
        doc.add_heading(f'Comparison: {table1_name} vs {table2_name}', level=1)

        # (rows, heading when found, heading when empty, table columns) in report order
        sections = [
            (
                result['mismatches'],
                'Mismatches',
                "No mismatches found.",
                [key_column_master, 'column', f'{table1_name}_value', f'{table2_name}_value'],
            ),
            (
                result['null_values_master'],
                f'Null values in {table1_name}',
                f"No null values found in {table1_name}.",
                [key_column_master, 'column', table2_name],
            ),
            (
                result['null_values_target'],
                f'Null values in {table2_name}',
                f"No null values found in {table2_name}.",
                [key_column_target, 'column', table1_name],
            ),
            (
                result['duplicates_master'],
                f'Duplicate Keys in {table1_name}',
                # Uses the table name, like the target-side heading below.
                f"No duplicate keys found in {table1_name}.",
                [key_column_master, 'Difference in value'],
            ),
            (
                result['duplicates_target'],
                f'Duplicate Keys in {table2_name}',
                f"No duplicate keys found in {table2_name}.",
                [key_column_target, 'Difference in value'],
            ),
            (
                result['data_type_issues'],
                'Data Type Issues',
                "No data type issues found.",
                ['column_name', f'{table1_name}_data_type', f'{table2_name}_data_type'],
            ),
            (
                result['format_issues_master'],
                f'Format Issues in {table1_name}',
                f"No format issues found in {table1_name}.",
                [key_column_master, 'column', 'value', 'issue', f'{table2_name}_value'],
            ),
            (
                result['pincode_mapping_issues'],
                f'Pincode Mapping Issues in {table1_name}',
                "No pincode mapping issues found.",
                [key_column_master, 'pincode', 'state', 'city', 'issue', f'{table2_name}_details'],
            ),
            (
                result['df_master_only_keys'],
                f'Keys only in {table1_name}',
                f"No keys found only in {table1_name}.",
                [key_column_master],
            ),
            (
                result['df_target_only_keys'],
                f'Keys only in {table2_name}',
                f"No keys found only in {table2_name}.",
                [key_column_target],
            ),
        ]
        for rows, title, empty_heading, column_names in sections:
            _add_result_section(doc, rows, title, empty_heading, column_names)

        doc.add_page_break()  # Keep each comparison on its own page(s)

    # Save the aggregated document to the current directory
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_filename = f"{base_name}_comparison_report_aggregated_{timestamp}.docx"
    doc.save(report_filename)
    logging.info(f"Saved aggregated comparison report as '{report_filename}'.")

    return report_filename  # Return the filename for further processing

def send_slack_alert(message):
    """
    Post a text message to the configured Slack channel.

    Does nothing except log a warning when the Slack client is not set.
    Slack API failures are logged and not re-raised.

    Args:
        message (str): The message to send.
    """
    if slack_client:
        try:
            reply = slack_client.chat_postMessage(channel=SLACK_CHANNEL, text=message)
            logging.info(f"Message sent to {SLACK_CHANNEL}: {reply['ts']}")
        except SlackApiError as err:
            logging.error(f"Error sending message to Slack: {err.response['error']}")
        return

    logging.warning("Slack client is not initialized. Skipping Slack notification.")

def upload_file_to_slack(filepath, title=None):
    """
    Send a local file to the configured Slack channel via files_upload_v2.

    Logs a warning and returns when the Slack client is not set. All errors
    (Slack API errors and anything else raised while opening or uploading the
    file) are logged and not re-raised.

    Args:
        filepath (str): The path to the file to upload.
        title (str, optional): The title for the uploaded file. Defaults to the file's basename.
    """
    if not slack_client:
        logging.warning("Slack client is not initialized. Skipping file upload.")
        return

    try:
        with open(filepath, 'rb') as handle:
            file_name = os.path.basename(filepath)
            upload_result = slack_client.files_upload_v2(
                channel=SLACK_CHANNEL,
                file=handle,
                filename=file_name,  # keep the extension visible in Slack
                title=title or file_name,
                initial_comment=title or "File uploaded.",
            )

        # The call can return without raising and still report a failure.
        if not upload_result.get('ok'):
            logging.error(f"Failed to upload file to Slack: {upload_result}")
        else:
            file_permalink = upload_result['file']['permalink']
            logging.info(f"File uploaded to Slack channel '{SLACK_CHANNEL}': {file_permalink}")
    except SlackApiError as err:
        logging.error(f"Slack API Error during file upload: {err.response['error']}")
    except Exception as err:
        logging.error(f"Unexpected error during file upload: {err}")

def find_non_matching_keys(df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table):
    """
    Identify keys present in df_master but not in df_target and vice versa.

    Keys are compared as stripped strings. Each distinct key is reported once,
    however many rows carry it. Results are sorted so the report and the error
    log come out in the same order on every run.

    Args:
        df_master (pd.DataFrame): Source DataFrame.
        df_target (pd.DataFrame): Target DataFrame.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        duplicates_master (pd.DataFrame): Duplicate keys in master table.
            Accepted for interface compatibility; not used.
        duplicates_target (pd.DataFrame): Duplicate keys in target table.
            Accepted for interface compatibility; not used.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.

    Returns:
        tuple: (master_only_keys, target_only_keys, error_logs_m) where the
        first two are lists of single-entry dicts ({key_column: key}) and the
        third is a list of error-log dicts, master-only keys first.
    """
    keys_master = set(df_master[master_key].astype(str).str.strip())
    keys_target = set(df_target[target_key].astype(str).str.strip())

    # Set iteration order is arbitrary; sort (by string form, so a stray
    # non-string value cannot break the comparison) for reproducible output.
    master_only = sorted(keys_master - keys_target, key=str)
    target_only = sorted(keys_target - keys_master, key=str)

    logging.info(f"Found {len(master_only)} keys in source not in target and {len(target_only)} keys in target not in source.")

    # Convert to list of dictionaries for consistency with the other checks
    master_only_keys = [{master_key: key} for key in master_only]
    target_only_keys = [{target_key: key} for key in target_only]

    def _missing_key_error(key_column, key, present_in, missing_from):
        # One error-log entry for a key found in only one of the two tables.
        return {
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'missing_key',
            'error_message': f"Key '{key_column}' with value '{key}' is present only in '{present_in}' and missing in '{missing_from}'.",
            'source_table': present_in,
            'target_table': missing_from,
            'issue_column': key_column,
            'unique_identifier': f"{key_column}: {key}"
        }

    error_logs_m = [
        _missing_key_error(master_key, key, master_table, target_table)
        for key in master_only
    ]
    error_logs_m.extend(
        _missing_key_error(target_key, key, target_table, master_table)
        for key in target_only
    )

    return master_only_keys, target_only_keys, error_logs_m

def _first_row_position_by_key(df, key_column):
    """Map each stripped string key in df[key_column] to the position of its first row."""
    positions = {}
    for position, key in enumerate(df[key_column].astype(str).str.strip()):
        positions.setdefault(key, position)
    return positions


def _collect_null_records(df_checked, df_other, checked_key, other_key, checked_table, other_table, columns_to_check):
    """
    Collect null cells of df_checked together with the matching value from df_other.

    Args:
        df_checked (pd.DataFrame): Table whose cells are inspected for nulls.
        df_other (pd.DataFrame): Table the counterpart value is read from.
        checked_key (str): Key column of df_checked.
        other_key (str): Key column of df_other.
        checked_table (str): Name of the inspected table (used in the error log).
        other_table (str): Name of the other table (used as the record's value column).
        columns_to_check (list): Columns to inspect.

    Returns:
        tuple: (records, errors), one record and one error-log dict per null cell.
    """
    records = []
    errors = []
    # Built on the first null found, so the other table's key column is not
    # touched at all when there is nothing to report.
    other_positions = None

    rows_with_nulls = df_checked[df_checked[columns_to_check].isnull().any(axis=1)]
    for _, row in rows_with_nulls.iterrows():
        key_value = str(row[checked_key]).strip()
        for column in columns_to_check:
            if column == checked_key or column.startswith('_boltic_'):
                continue  # Skip key column and non-important columns
            if column not in row:
                continue  # Skip if column is not in the row
            if not pd.isnull(row[column]):
                continue

            if other_positions is None:
                other_positions = _first_row_position_by_key(df_other, other_key)

            position = other_positions.get(key_value)
            if position is None:
                other_value = f"'{other_key}' not present"
            else:
                # First row carrying this key, as a row Series
                other_row = df_other.iloc[position]
                other_value = other_row[column] if column in other_row else "Column not present"

            records.append({
                checked_key: key_value,
                'column': column,
                other_table: other_value
            })
            errors.append({
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'issue': 'null',
                'error_message': 'Null in columns',
                'source_table': checked_table,
                'target_table': '',
                'issue_column': column,
                'unique_identifier': f'{checked_key} : {key_value}'
            })
    return records, errors


def find_detailed_nulls(df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check):
    """
    Identify null values in both master and target tables for specified columns and fetch corresponding values or indicate missing keys.

    For every null cell in one table, the record carries the value of the same
    column from the first row with the same (stripped, stringified) key in the
    other table, "Column not present" if that row lacks the column, or a
    "'<key>' not present" marker if the key does not exist there. The key
    column itself and columns starting with '_boltic_' are never reported.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
        df_target (pd.DataFrame): Target DataFrame (prefixed table).
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.
        columns_to_check (list): List of columns to check for null values.
            Every listed column must exist in both DataFrames.

    Returns:
        tuple: (null_values_master, null_values_target, error_logs_m); the
        error log lists master findings first, then target findings.
    """
    null_values_master, master_errors = _collect_null_records(
        df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check
    )
    null_values_target, target_errors = _collect_null_records(
        df_target, df_master, target_key, master_key, target_table, master_table, columns_to_check
    )
    error_logs_m = master_errors + target_errors

    logging.info(f"Found {len(null_values_master)} null values in master table '{master_table}'.")
    logging.info(f"Found {len(null_values_target)} null values in target table '{target_table}'.")
    return null_values_master, null_values_target, error_logs_m

def _describe_pincode_issue(ref_matches, pincode, city, state):
    """
    Return the issue text for one pincode/city/state triple, or None when it is valid.

    Args:
        ref_matches (pd.DataFrame | None): Reference rows for this pincode, or
            None when the pincode is not in the reference table.
        pincode (str): Stripped pincode from the master row.
        city (str): Stripped, lower-cased city from the master row.
        state (str): Stripped, lower-cased state from the master row.
    """
    if ref_matches is None or ref_matches.empty:
        return f"Invalid pincode ({pincode})."

    state_hit = ref_matches['state'] == state
    city_hit = ref_matches['city'] == city
    if (state_hit & city_hit).any():
        return None  # At least one reference row agrees on both city and state

    state_matches = ref_matches[state_hit]
    city_found = bool(city_hit.any())

    if state_matches.empty and not city_found:
        # Both state and city do not match
        expected_entries = ref_matches[['state', 'city']].drop_duplicates()
        expected_states_str = ', '.join(expected_entries['state'].tolist())
        expected_cities_str = ', '.join(expected_entries['city'].tolist())
        return f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
    if state_matches.empty:
        # State does not match
        expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
        return f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
    if not city_found:
        # City does not match; suggest only cities listed under the matching state
        expected_cities_str = ', '.join(state_matches['city'].unique().tolist())
        return f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
    # State and city each appear for this pincode, but never on the same reference row
    return f"Pincode {pincode} has a mapping inconsistency."


def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
    """
    Validate pincode mapping by comparing with the all_india_po_list reference table.
    For every master row with a pincode issue, the pincode/state/city of the
    first target row with the same key is attached for context.

    The master table is checked for the 'pincode', 'city' and 'state' columns
    before the reference table is queried, so tables without them cost no
    BigQuery query. A failure to load the reference table is logged and
    yields an empty result.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        client (bigquery.Client): Initialized BigQuery client.
        master_table (str): Name of the master table.

    Returns:
        tuple: (pincode_mapping_issues_df, error_logs_m)
    """
    error_logs_m = []

    # Check if df_master has 'pincode', 'city', 'state' columns
    required_columns = {'pincode', 'city', 'state'}
    if not required_columns.issubset(df_master.columns):
        logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
        return pd.DataFrame(), error_logs_m

    # Read the reference table from Analytics dataset
    try:
        reference_table = "all_india_po_list"
        reference_dataset = "analytics_data"
        query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
        reference_df = client.query(query).to_dataframe()
        reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
        reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
        reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
        logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
    except Exception as e:
        logging.error(f"Failed to load reference pincode mapping: {e}")
        return pd.DataFrame(), error_logs_m

    details_column = f'{target_table}_details'
    issue_columns = [key_column, 'pincode', 'state', 'city', 'issue', details_column]

    # pincode -> row positions in reference_df, computed once instead of
    # scanning the whole reference table for every master row.
    reference_positions = reference_df.groupby('pincode', sort=False).indices

    # key -> position of the first target row with that key; built on the
    # first issue found, so it costs nothing when every row is valid.
    target_positions = None

    issue_records = []
    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()
        pincode = str(row['pincode']).strip()
        city = str(row['city']).strip().lower()
        state = str(row['state']).strip().lower()

        positions = reference_positions.get(pincode)
        ref_matches = reference_df.iloc[positions] if positions is not None else None
        issue = _describe_pincode_issue(ref_matches, pincode, city, state)
        if issue is None:
            continue  # No issue, mapping is correct

        if target_positions is None:
            target_positions = {}
            for position, key in enumerate(df_target[target_key].astype(str).str.strip()):
                target_positions.setdefault(key, position)

        # Fetch corresponding target row if exists
        target_position = target_positions.get(key_value)
        if target_position is not None:
            target_row = df_target.iloc[target_position]
            target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
            target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
            target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
            target_details = f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"
        else:
            target_details = f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."

        issue_records.append({
            key_column: key_value,
            'pincode': pincode,
            'state': state,
            'city': city,
            'issue': issue,
            details_column: target_details
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'pincode_mapping',
            'error_message': f"{issue}. {target_table} Details: {target_details}",
            'source_table': master_table,
            'target_table': target_table,
            'issue_column': 'pincode',
            'unique_identifier': f'{key_column}: {key_value}'
        })

    # Build the frame once; growing it with pd.concat per issue copies all previous rows each time.
    pincode_mapping_issues = pd.DataFrame(issue_records, columns=issue_columns)

    logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
    return pincode_mapping_issues, error_logs_m

def compare_tables(client, dataset_name, base_name, master_table, target_table, master_key, target_key, column_mapping=None):
    """
    Compare a master table with one target table and collect all findings.

    Both tables are loaded from BigQuery and standardized (key column and
    'is_active' excluded from standardization). If BASE_TABLES defines an
    'active_filter' for base_name, it is applied to the master table only.
    Duplicate, mismatch, null, data-type, format and pincode checks run only
    when the table's 'perform_checks' setting is true (the default); the
    non-matching-keys check always runs.

    Side effects: every error-log list returned by the individual checks is
    appended to the module-level ERROR_LOG_M.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        base_name (str): The base name of the table.
        master_table (str): Name of the master_hub_ table.
        target_table (str): Name of the target prefixed table.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        column_mapping (dict, optional): Maps a master column name to the
            differently named target column. Mapped columns present in both
            tables are stringified, stripped and lower-cased, and the mapping
            is forwarded to find_mismatches.

    Returns:
        dict | None: A dictionary containing all comparison results (findings,
        key column names, table names and the two DataFrames), or None when
        either table is empty after filtering or the tables share no columns.
    """
    logging.info(f"Starting comparison for base table '{base_name}': '{master_table}' vs '{target_table}'.")

    # Initialize comparison results; these defaults are what the report gets
    # for any check that is skipped because perform_checks is false.
    mismatches = []
    null_values_master = []
    null_values_target = []
    data_type_issues = pd.DataFrame()
    format_issues_master = pd.DataFrame()
    pincode_mapping_issues = pd.DataFrame()
    duplicates_master = pd.DataFrame()
    duplicates_target = pd.DataFrame()
    master_only_keys = []
    target_only_keys = []

    # Load data
    df_master = load_table_from_bigquery(client, dataset_name, master_table)
    df_target = load_table_from_bigquery(client, dataset_name, target_table)

    # Apply standardization (key columns and 'is_active' are left untouched)
    df_master = standardize_dataframe(df_master, exclude_columns=[master_key, 'is_active'])
    df_target = standardize_dataframe(df_target, exclude_columns=[target_key, 'is_active'])

    # Disabled debugging aid: log unique 'is_active' values after standardization
    # if 'is_active' in df_master.columns:
    #     logging.info(f"Master 'pan' unique values: {df_master['is_active'].unique()}")
    # if 'is_active' in df_target.columns:
    #     logging.info(f"Target 'pan' unique values: {df_target['is_active'].unique()}")

    
    # Apply active filter if defined
    base_table_info = BASE_TABLES.get(base_name, {})
    active_filter = base_table_info.get('active_filter')
    perform_checks = base_table_info.get('perform_checks', True)

    if active_filter:
        column = active_filter.get('column')
        value = active_filter.get('value')
        if column and column in df_master.columns:
            # Only the master table is filtered; the target is compared as loaded.
            initial_count = len(df_master)
            df_master = df_master[df_master[column] == value]
            filtered_count = len(df_master)
            logging.info(f"Filtered '{base_name}' master table: {initial_count - filtered_count} records excluded based on {column} = {value}.")
        else:
            logging.warning(f"Active filter specified but column '{column}' not found in master table '{master_table}'.")

    # Disabled alternative: string-based 'is_active' filtering
    # if active_filter:
    #     column = active_filter.get('column')
    #     value = active_filter.get('value')
    #     if column:
    #         if column == 'is_active':
    #             df_master = df_master[df_master[column].str.lower() == 'true']  # Updated to string comparison
    #             logging.info(f"Filtered '{base_name}' master table based on {column} = 'true'.")
    #         else:
    #             df_master = df_master[df_master[column] == value]
    #             logging.info(f"Filtered '{base_name}' master table based on {column} = {value}.")
    
    
    # Nothing to compare if either side has no rows (after the master filter)
    if df_master.empty or df_target.empty:
        logging.warning(f"One of the tables '{master_table}' or '{target_table}' is empty. Skipping comparison.")
        return None

    # Identify BigNumeric columns in master and target tables
    schema_master = get_table_schema(client, dataset_name, master_table)
    schema_target = get_table_schema(client, dataset_name, target_table)
    # NOTE(review): these two lists are only referenced by the disabled
    # formatting code below; the live code never reads them.
    bignumeric_columns_master = [col for col, dtype in schema_master.items() if dtype == 'BIGNUMERIC']
    bignumeric_columns_target = [col for col, dtype in schema_target.items() if dtype == 'BIGNUMERIC']
    
    # # Format BigNumeric columns in master table
    # for col in bignumeric_columns_master:
    #     if col in df_master.columns:
    #         df_master[col] = df_master[col].apply(lambda x: format(x, '.0f') if pd.notnull(x) else x)

    # # Format BigNumeric columns in target table
    # for col in bignumeric_columns_target:
    #     if col in df_target.columns:
    #         df_target[col] = df_target[col].apply(lambda x: format(x, '.0f') if pd.notnull(x) else x)

    # # Get imp_columns and non_imp_columns
    # imp_columns = Imp_columns.get(base_name, None)
    # non_imp_columns = Non_imp_columns.get(base_name, [])

    # Identify common columns
    common_columns, master_unique_cols, target_unique_cols = find_common_and_non_common_columns(df_master, df_target)

    if not common_columns:
        logging.warning(f"No common columns found between '{master_table}' and '{target_table}'. Skipping comparison.")
        return None

    # Apply column mapping if provided
    if column_mapping:
        # Normalize each mapped column pair in place; no column is actually renamed here
        for master_col, target_col in column_mapping.items():
            if master_col in df_master.columns and target_col in df_target.columns:
                # Ensure both columns are standardized
                df_master[master_col] = df_master[master_col].astype(str).str.strip().str.lower()
                df_target[target_col] = df_target[target_col].astype(str).str.strip().str.lower()
        # Adjust common_columns by replacing master_col with target_col
        adjusted_common_columns = set(common_columns)
        for master_col, target_col in column_mapping.items():
            # Only takes effect when BOTH names are already common columns
            if master_col in adjusted_common_columns and target_col in adjusted_common_columns:
                adjusted_common_columns.remove(master_col)
                adjusted_common_columns.remove(target_col)
                adjusted_common_columns.add(master_col)  # Use master_col as the unified name
        # Going through a set means the column order is no longer the original one
        common_columns = list(adjusted_common_columns)

    # Determine columns to check based on imp_columns
    # NOTE(review): imp_columns and non_imp_columns are not assigned anywhere in
    # this function (the assignments above are commented out), so they must be
    # module-level names defined elsewhere; otherwise this raises NameError.
    # TODO confirm.
    if imp_columns:
        columns_to_check = [col for col in imp_columns if col in common_columns]
        logging.info(f"Important columns defined for '{base_name}': {columns_to_check}")
    else:
        columns_to_check = [col for col in common_columns if col not in non_imp_columns]
        logging.info(f"No important columns defined for '{base_name}'. Applying checks to all columns except non_imp_columns: {columns_to_check}")

    if perform_checks:
        # Find duplicates in both tables
        duplicates_master, error_logs_m = find_duplicates(df_master, master_key, master_table)
        ERROR_LOG_M.extend(error_logs_m)
        duplicates_target, error_logs_m = find_duplicates(df_target, target_key, target_table)
        ERROR_LOG_M.extend(error_logs_m)

    if not duplicates_master.empty:
        logging.warning(f"Duplicate keys found in source table '{master_table}'. These will be reported but not used in mismatch comparison.")
    if not duplicates_target.empty:
        logging.warning(f"Duplicate keys found in target table '{target_table}'. These will be reported but not used in mismatch comparison.")

    # Perform mismatch comparison
    if perform_checks:
        mismatches, error_logs_m = find_mismatches(
            df_master,
            df_target,
            columns_to_check,
            master_key,
            target_key,
            master_table,
            target_table,
            duplicates_master,
            duplicates_target,
            non_imp_columns,
            column_mapping  # Pass column_mapping to handle differently named columns
        )
        ERROR_LOG_M.extend(error_logs_m)

    # Find detailed null values in both tables
    if perform_checks:
        null_values_master, null_values_target, error_logs_m = find_detailed_nulls(
            df_master,
            df_target,
            master_key,
            target_key,
            master_table,
            target_table,
            columns_to_check  # Pass columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

    # Validate data types between master and target schemas
    if perform_checks:
        data_type_issues, error_logs_m = validate_data_types(
            schema_master,
            schema_target,
            master_key,
            master_table,
            target_table,
            columns_to_check  # Pass columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

    # Validate formats in master table only
    if perform_checks:
        format_issues_master, error_logs_m = validate_formats(
            df_master,
            df_target,
            master_key,
            target_key,
            target_table,
            master_table,
            columns_to_check  # Pass columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Validate pincode mapping if applicable (master needs all three columns)
        pincode_mapping_issues = pd.DataFrame()
        if {'pincode', 'city', 'state'}.issubset(df_master.columns):
            pincode_mapping_issues, error_logs_m = validate_pincode_mapping(
                df_master,
                df_target,
                master_key,
                target_key,
                target_table,
                client,
                master_table
            )
            ERROR_LOG_M.extend(error_logs_m)

    # Find non-matching keys (runs regardless of perform_checks)
    master_only_keys, target_only_keys, error_logs_m = find_non_matching_keys(
        df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table
    )
    ERROR_LOG_M.extend(error_logs_m)

    # Compile results; the null findings are converted from lists to DataFrames here
    results = {
        'mismatches': mismatches,
        'null_values_master': pd.DataFrame(null_values_master),
        'null_values_target': pd.DataFrame(null_values_target),
        'duplicates_master': duplicates_master,
        'duplicates_target': duplicates_target,
        'data_type_issues': data_type_issues,
        'format_issues_master': format_issues_master,
        'pincode_mapping_issues': pincode_mapping_issues,
        'key_column_master': master_key,
        'key_column_target': target_key,
        'df_master_only_keys': master_only_keys,
        'df_target_only_keys': target_only_keys,
        'table1_name': master_table,
        'table2_name': target_table,
        'df_master': df_master,
        'df_target': df_target
    }

    logging.info(f"Completed comparison for '{master_table}' vs '{target_table}'.")
    return results

def generate_string_schema(df):
    """
    Build a BigQuery schema that declares every DataFrame column as a nullable STRING.

    Args:
        df (pd.DataFrame): DataFrame whose column labels become the field names.

    Returns:
        list: One SchemaField per column, in column order, each with type
            STRING and mode NULLABLE.
    """
    string_fields = []
    for column_name in df.columns:
        string_fields.append(SchemaField(column_name, "STRING", mode="NULLABLE"))
    return string_fields

def _upload_dataframe_to_bigquery(client, analytics_dataset, table_name, df):
    """
    Helper function to upload a DataFrame to BigQuery.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the Analytics dataset.
        table_name (str): The name of the table to upload.
        df (pd.DataFrame): The DataFrame to upload.

    Returns:
        None
    """
    if df.empty:
        logging.info(f"No data to upload for '{table_name}'. Skipping.")
        return

    # Convert all columns to string type
    df = df.astype(str)

    # Generate BigQuery schema with all fields as STRING
    schema = generate_string_schema(df)

    # Ensure table name doesn't exceed BigQuery's maximum length (1,024 characters)
    if len(table_name) > 1024:
        original_table_name = table_name
        table_name = table_name[:1021] + '...'
        logging.warning(f"Table name truncated from '{original_table_name}' to '{table_name}' due to length constraints.")

    # Define the full table ID
    table_id = f"{client.project}.{analytics_dataset}.{table_name}"

    # Upload the DataFrame to BigQuery
    try:
        job = client.load_table_from_dataframe(
            df,
            table_id,
            job_config=bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
                schema=schema  # Using the provided schema with all fields as STRING
            )
        )
        job.result()  # Wait for the job to complete
        logging.info(f"Successfully uploaded '{table_id}' with {len(df)} records.")
    except Exception as e:
        logging.error(f"Failed to upload '{table_id}' to BigQuery: {e}")

def upload_comparison_results_to_bigquery(client, analytics_dataset, ERROR_LOG_M):
    """
    Push the accumulated error log to the 'error_logs_master_hub' table in BigQuery.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the dataset that receives the table.
        ERROR_LOG_M (list | pd.DataFrame | None): Error log entries, either as a
            list of dictionaries or as an already-built DataFrame.

    Returns:
        None
    """
    # Nothing was collected at all.
    if ERROR_LOG_M is None:
        logging.info("No error logs present.")
        return

    # Normalise the supported input shapes to a DataFrame.
    if isinstance(ERROR_LOG_M, pd.DataFrame):
        log_frame = ERROR_LOG_M
    elif isinstance(ERROR_LOG_M, list):
        log_frame = pd.DataFrame(ERROR_LOG_M)
    else:
        logging.warning("Unsupported data type for ERROR_LOG. Skipping upload.")
        log_frame = None

    # Unsupported input and empty logs both end here without an upload.
    if log_frame is None or log_frame.empty:
        logging.info("No error logs to upload.")
        return

    _upload_dataframe_to_bigquery(client, analytics_dataset, "error_logs_master_hub", log_frame)

def main():
    """
    Main function to orchestrate the comparison of multiple base tables against their master_hub_ counterparts.

    For every base table configured in BASE_TABLES that has a master table and at
    least one target table in DATASET_ID, this runs compare_tables() per target
    prefix, sends one Slack summary per comparison, builds and uploads one
    aggregated report document per base table, and finally uploads the
    accumulated ERROR_LOG_M entries to BigQuery.

    Returns:
        None. Returns early (without error) when the BigQuery client cannot be
        created or when no comparable tables are found.

    Raises:
        SystemExit: with status 1 when any unexpected exception escapes the
            comparison loop; a Slack alert containing the traceback is sent first.
    """
    try:
        # Initialize BigQuery client
        try:
            client = get_bigquery_client(PROJECT_ID)
        except Exception:
            # get_bigquery_client already logged the underlying error.
            logging.error("Exiting due to BigQuery client initialization failure.")
            return

        # Find common tables with 'master_hub_' and other prefixes, passing BASE_TABLES
        common_tables = find_common_tables_with_master_hub(client, DATASET_ID, PREFIXES, BASE_TABLES)

        if not common_tables:
            logging.info("No common tables found with 'master_hub_' and the specified prefixes.")
            return

        # Iterate over each base table and perform comparisons
        for base_name, tables in common_tables.items():
            base_table_info = BASE_TABLES.get(base_name)
            if not base_table_info:
                logging.warning(f"No configuration found for base table '{base_name}'. Skipping.")
                continue

            master_key = base_table_info.get('master_key')
            target_tables = base_table_info.get('targets', {})
            column_mapping = base_table_info.get('column_mapping', {})
            

            # 'tables' maps 'master_hub_' and each found prefix to a concrete table name.
            master_table = tables.get('master_hub_')
            if not master_table:
                logging.warning(f"Master table 'master_hub_{base_name}' not found. Skipping.")
                continue

            # Collects the result dict of every successful comparison for this base table.
            all_results = []

            # Iterate through each prefix and its corresponding target_key
            for prefix, target_key in target_tables.items():
                target_table = tables.get(prefix)
                if not target_table:
                    logging.warning(f"Target table with prefix '{prefix}' for base table '{base_name}' not found. Skipping.")
                    continue

                comparison_result = compare_tables(
                    client, 
                    DATASET_ID, 
                    base_name, 
                    master_table, 
                    target_table, 
                    master_key, 
                    target_key,  # Pass the correct target_key per prefix
                    column_mapping  # Pass the column_mapping
                )
                # A falsy result means the comparison produced nothing to report.
                if comparison_result:
                    all_results.append(comparison_result)

                    # Prepare and send a separate Slack message for each comparison
                    total_mismatches = len(comparison_result['mismatches'])
                    total_nulls_master = len(comparison_result['null_values_master'])
                    total_nulls_target = len(comparison_result['null_values_target'])
                    total_dup_master = len(comparison_result['duplicates_master'])
                    total_dup_target = len(comparison_result['duplicates_target'])
                    total_data_type_issues = len(comparison_result['data_type_issues'])
                    total_format_issues_master = len(comparison_result['format_issues_master'])
                    total_pincode_issues = len(comparison_result['pincode_mapping_issues'])
                    total_non_matching_source = len(comparison_result.get('df_master_only_keys', []))
                    total_non_matching_target = len(comparison_result.get('df_target_only_keys', []))

                    # Slack mrkdwn summary: one bullet per issue category with its count.
                    message = (
                        f"✅ *Comparison Report Generated for `{base_name}`*\n"
                        f"*Tables Compared: `{comparison_result['table1_name']}` vs `{comparison_result['table2_name']}`*\n"
                        f"- *Total Mismatches between values of same column name of both tables : `{total_mismatches}`*\n"
                        f"- *Total Null Values in `{comparison_result['table1_name']}`: `{total_nulls_master}`*\n"
                        f"- *Total Null Values in `{comparison_result['table2_name']}`: `{total_nulls_target}`*\n"
                        f"- *Duplicate `{master_key}` in `{comparison_result['table1_name']}`: `{total_dup_master}`*\n"
                        f"- *Duplicate `{target_key}` in `{comparison_result['table2_name']}`: `{total_dup_target}`*\n"
                        f"- *Total Data Type Issues(mismatch between datatype in columns with same name of both tables): `{total_data_type_issues}`*\n"
                        f"- *Total Format/Value Issues(gstin, email, pincode) in `{comparison_result['table1_name']}`: `{total_format_issues_master}`*\n"
                        f"- *Total Pincode Mapping Issues in `{comparison_result['table1_name']}`: `{total_pincode_issues}`*\n"
                         "- *Non-Matching Keys*:\n"
                        f"--*`{master_key}` only in `{comparison_result['table1_name']}` and not in `{comparison_result['table2_name']}`:`{total_non_matching_source}`,*\n"
                        f"--*`{target_key}` only in `{comparison_result['table2_name']}` and not in `{comparison_result['table1_name']}`:`{total_non_matching_target}`*"
                    )

                    send_slack_alert(message)
            
            if all_results:
                # Generate aggregated report for the base name and get the filepath
                report_filepath = create_aggregated_document(all_results, base_name)
                
                # Upload the report to Slack using the updated function
                upload_file_to_slack(report_filepath, title=f"{base_name.capitalize()} Comparison Report")
                
                # Remove the local report file once the upload call has returned
                try:
                    os.remove(report_filepath)
                    logging.info(f"Removed local report file '{report_filepath}'.")
                except Exception as e:
                    logging.error(f"Failed to remove local report file '{report_filepath}': {e}")
                # Pause before the next base table.
                # NOTE(review): presumably to stay under Slack rate limits — confirm.
                time.sleep(30)
            else:
                logging.info(f"No comparison results to report for base name '{base_name}'.")
               

        # Upload error logs to BigQuery after all comparisons
        upload_comparison_results_to_bigquery(
            client, 
            'analytics_data',
            ERROR_LOG_M
            )

        logging.info("All comparisons completed.")
    except Exception as e:
        # Capture the full traceback
        tb = traceback.format_exc()
        logging.error("An unexpected error occurred in the main process.", exc_info=True)

        # Prepare a detailed error message for Slack
        error_message = (
            f"❌ *Comparison Process Failed*\n"
            f"*Error:* {str(e)}\n"
            f"*Traceback:*\n```{tb}```"
        )
        send_slack_alert(error_message)

        # Exit the script with a non-zero status so the failure is visible to the caller
        sys.exit(1)

# Run the comparison only when this code is executed as a script or notebook
# cell (where __name__ is "__main__"), not when the module is merely imported.
if __name__ == "__main__":
    main()


2024-11-28 09:59:34,040 - INFO - Slack client initialized successfully.
2024-11-28 09:59:34,599 - INFO - BigQuery client initialized successfully.
2024-11-28 09:59:35,816 - INFO - Found 174 tables in dataset 'Impetus_dev_prod'.
2024-11-28 09:59:35,818 - INFO - Identified 2 common base names with 'master_hub_' and other specified prefixes.
2024-11-28 09:59:35,818 - INFO - Starting comparison for base table 'supplier': 'master_hub_supplier' vs 'procuro_supplier'.
2024-11-28 09:59:37,401 - INFO - Loaded data from table 'master_hub_supplier' into DataFrame.


KeyboardInterrupt: 

In [6]:
# print(repr(df_master.loc[df_master['supplier_code'] == '32021006', 'pan'].iloc[0]))
# print(repr(df_target.loc[df_target['supplier_code'] == '32021006', 'pan'].iloc[0]))


NameError: name 'df_master' is not defined

In [1]:
#imp and non imp column added
# f
import time
import os
import re
import logging
from datetime import datetime
import pandas as pd
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from google.cloud.bigquery import SchemaField
import traceback
import sys

# Configure logging: timestamped INFO-level messages for the whole script
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Configuration (production environment)
PROJECT_ID = 'fynd-jio-impetus-prod'       # GCP project used to create the BigQuery client and to qualify table names
DATASET_ID = 'Impetus_dev_prod'                 # Dataset whose tables are listed and compared
PREFIXES = ['procuro_', 'costing_engine_', 'scan_pack_', 'pigeon_']  # Target-table prefixes handed to find_common_tables_with_master_hub
# Non-production alternative — swap with the three assignments above to run against SIT:
# PROJECT_ID = 'fynd-jio-impetus-non-prod'
# DATASET_ID = 'Impetus_dev_sit'
# PREFIXES = ['procuro_', 'costing_engine_', 'scan_pack_', 'pigeon_']



# Module-level error log: the comparison code extends this list with one dict
# per detected issue, and it is uploaded to BigQuery once all comparisons finish.
ERROR_LOG_M = []

# Mapping of base table names to their key columns in master and target tables.
#
# Keys read per entry by the code in this file:
#   'master_key'     - key column in the master table.
#   'targets'        - {prefix: key column in the target table}; the target table
#                      name is built as f"{prefix}{base_name}".
#   'master_table'   - optional explicit master table name; defaults to
#                      f"master_hub_{base_name}".
#   'column_mapping' - optional {master column: target column} for columns whose
#                      names differ between the two tables.
#   'active_filter' / 'perform_checks' - NOTE(review): presumably consumed by
#                      compare_tables (row filter / toggle for the format checks);
#                      confirm against that function.
#
# Commented-out entries are tables that are configured but currently disabled.
BASE_TABLES = {
    # 'brand': {
    #     'master_key': 'code',
    #     'targets': {
    #         'procuro_': 'code',
    #         'costing_engine_': 'code'
    #     },
    #     'active_filter': {
    #         'column': 'is_active',
    #         'value': True
    #     },
    #     'perform_checks': True  # Default behavior
    # },
    # 'brand_pm_mapping': {
    #     'master_key': 'pm_id',
    #     'targets': {
    #         'costing_engine_': 'pm_id'
    #     },
    #     'perform_checks': True
    # },
    # 'brick': {
    #     'master_key': 'brick_code',
    #     'targets': {
    #         'costing_engine_': 'code'
    #     },
    #     'perform_checks': True
    # },
    # 'coe_bom_element_type_mapping': {
    #     'master_key': 'coe_name',
    #     'targets': {
    #         'costing_engine_': 'coe_name'
    #     },
    #     'perform_checks': True
    # },
    # 'event_log': {
    #     'master_key': 'user_id',
    #     'targets': {
    #         'costing_engine_': 'user_id'
    #     },
    #     'perform_checks': True
    # },
    'supplier': {
        'master_key': 'supplier_code',
        'targets': {
            'procuro_': 'supplier_code',
            'costing_engine_': 'supplier_code'
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    'vendor_details': {  # Compared against the supplier master table (see 'master_table')
        'master_key': 'supplier_code',  # Using supplier_code as the key
        'master_table': 'master_hub_supplier',  # Specify the master table explicitly
        'targets': {
            'scan_pack_': 'vendor_code'
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    # 'hsn_tax_mapping': {  # Base table for HSN codes
    #     'master_key': 'hsn_code',  # Assuming 'hsn_code' is the key column
    #     'master_table': 'master_hub_hsn',
    #     'targets': {
    #         'procuro_': 'hsn_code',
    #     },
    #     'perform_checks': False  # Only perform key comparisons
    # },
    # 'config_buyer_brand_mapping': {
    #     'master_key': 'id',
    #     'master_table': 'master_hub_buyer_brand_mapping',
    #     'targets': {
    #         'costing_engine_': 'id'
    #     },
    #     'column_mapping': {  # Mapping of master columns to target columns
    #         'brand_id': 'brand_code'
    #     },
    #     'perform_checks': True
    # },
}

# Non-important columns, keyed by base table name.
# NOTE(review): presumably these lists are what reaches find_mismatches() as
# `non_imp_columns` (columns skipped during value comparison) — confirm in
# compare_tables.
Non_imp_columns = {
    'supplier': ['id', '_id', 'updated_at', 'created_at'],
    'vendor_details': ['id', '_id', 'updated_at', 'created_at']  # Same technical columns as 'supplier'
}

# Important columns, keyed by base table name.
# Each base table must appear exactly once: a repeated key in a dict literal
# silently replaces the earlier entry. The coe_* column list was previously
# filed under a second 'config_buyer_brand_mapping' key, which discarded the
# buyer/brand column list; it is keyed by 'coe_bom_element_type_mapping' here.
Imp_columns = {
    'brand': ['name', 'id', 'slug', 'code'],
    'brick': ['name', 'id', 'brick_code', 'description', 'class_code'],
    'config_buyer_brand_mapping': ['buyer_email', 'brand_code', 'id', 'buyer_id', 'is_active', 'buyer_name'],
    'brand_pm_mapping': ['pm_id', 'brand_code', 'pm_email', 'is_active', 'pm_name', 'id'],
    'coe_bom_element_type_mapping': ['is_active', 'coe_id', 'id', 'coe_name', 'coe_approver_email', 'element_type'],
    # Add more base tables and their important columns as needed
}

# Slack configuration
# SECURITY: the bot token is a credential and must not live in source control
# or notebook exports. It is read from the SLACK_TOKEN environment variable;
# any token that was previously committed here should be revoked and rotated.
SLACK_TOKEN = os.environ.get("SLACK_TOKEN")
# The channel ID is not a secret, so the existing channel stays as the default
# and can be overridden through the SLACK_CHANNEL environment variable.
SLACK_CHANNEL = os.environ.get("SLACK_CHANNEL", "C08310RS2PK")

# Initialize Slack client (None disables Slack notifications)
if SLACK_TOKEN and SLACK_CHANNEL:
    slack_client = WebClient(token=SLACK_TOKEN)
    logging.info("Slack client initialized successfully.")
else:
    slack_client = None
    logging.warning("Slack token or channel not found. Slack notifications will be disabled.")

def get_bigquery_client(project_id):
    """
    Create a BigQuery client bound to the given project.

    Args:
        project_id (str): GCP project ID.

    Returns:
        bigquery.Client: The newly created client.

    Raises:
        Exception: Any error raised while creating the client is logged and
            re-raised unchanged.
    """
    try:
        bq_client = bigquery.Client(project=project_id)
        logging.info("BigQuery client initialized successfully.")
    except Exception as init_error:
        logging.error(f"Failed to initialize BigQuery client: {init_error}")
        raise
    return bq_client

def find_common_tables_with_master_hub(client, dataset_name, prefixes, base_tables):
    """
    Resolve, for every configured base table, its master table and the target tables that exist.

    The master table name is the entry's 'master_table' value, or
    'master_hub_<base_name>' when none is given. Target table names are built as
    '<prefix><base_name>' from the prefixes listed under the entry's 'targets'.
    Base tables whose master is missing, or that have no existing target table,
    are left out of the result.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset to search within.
        prefixes (list): List of prefixes to compare with 'master_hub_'.
            Not read by this function; the prefixes come from ``base_tables``.
        base_tables (dict): The BASE_TABLES dictionary containing base table configurations.

    Returns:
        dict: ``{base_name: {'master_hub_': master_table, prefix: target_table, ...}}``.
            An empty dict is returned when listing the dataset fails.
    """
    try:
        # Names of every table currently present in the dataset
        listing = client.list_tables(client.dataset(dataset_name))
        existing_names = [item.table_id for item in listing]
        logging.info(f"Found {len(existing_names)} tables in dataset '{dataset_name}'.")

        matched = {}
        for base_name, config in base_tables.items():
            master_table = config.get('master_table', f'master_hub_{base_name}')
            if master_table not in existing_names:
                logging.warning(f"Master table '{master_table}' for base '{base_name}' not found in dataset.")
                continue

            # Start with the master and add every configured target that exists
            entry = {'master_hub_': master_table}
            for prefix, _target_key in config.get('targets', {}).items():
                candidate = f"{prefix}{base_name}"
                if candidate in existing_names:
                    entry[prefix] = candidate

            # A master without any target table gives nothing to compare
            if len(entry) > 1:
                matched[base_name] = entry

        logging.info(f"Identified {len(matched)} common base names with 'master_hub_' and other specified prefixes.")
        return matched

    except GoogleAPIError as api_error:
        logging.error(f"Google API Error: {api_error.message}")
        return {}
    except Exception as unexpected:
        logging.error(f"An unexpected error occurred: {unexpected}")
        return {}

def get_table_schema(client, dataset_name, table_name):
    """
    Fetch a BigQuery table's schema as a column-name to type mapping.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        dict: ``{column name: field type}`` for every field in the table schema,
            or an empty dict when the schema could not be retrieved.
    """
    try:
        bq_table = client.get_table(client.dataset(dataset_name).table(table_name))
        column_types = {}
        for field in bq_table.schema:
            column_types[field.name] = field.field_type
        logging.info(f"Retrieved schema for table '{table_name}'.")
        return column_types
    except GoogleAPIError as api_error:
        logging.error(f"Failed to retrieve schema for table '{table_name}': {api_error.message}")
        return {}
    except Exception as unexpected:
        logging.error(f"An unexpected error occurred while retrieving schema for table '{table_name}': {unexpected}")
        return {}

def load_table_from_bigquery(client, dataset_name, table_name):
    """
    Load a full table from BigQuery into a Pandas DataFrame.

    The table is addressed as ``<client.project>.<dataset_name>.<table_name>``.
    The project is taken from the client rather than from a module-level
    constant, so the function works with whichever client it is given and
    matches how the upload helper builds its table IDs.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        pd.DataFrame: DataFrame containing the table data, or an empty DataFrame
            when the query fails (the error is logged, not raised).
    """
    try:
        # Names come from internal configuration / dataset listing, not user input.
        query = f"SELECT * FROM `{client.project}.{dataset_name}.{table_name}`"
        df = client.query(query).to_dataframe()
        logging.info(f"Loaded data from table '{table_name}' into DataFrame.")
        return df
    except GoogleAPIError as e:
        logging.error(f"Failed to load table '{table_name}': {e.message}")
        return pd.DataFrame()
    except Exception as e:
        logging.error(f"An unexpected error occurred while loading table '{table_name}': {e}")
        return pd.DataFrame()

def standardize_dataframe(df, exclude_columns=None):
    """
    Standardize string columns in the DataFrame by stripping whitespace and converting to lowercase,
    excluding specified columns.

    The input DataFrame is not modified; a standardized copy is returned.

    Args:
        df (pd.DataFrame): The DataFrame to standardize.
        exclude_columns (list, optional): Columns to exclude from standardization.
            Defaults to None, meaning no column is excluded.

    Returns:
        pd.DataFrame: Standardized DataFrame.
    """
    # A None default avoids sharing one mutable list object across calls.
    excluded = [] if exclude_columns is None else exclude_columns

    df_copy = df.copy()
    for col in df_copy.columns:
        if col in excluded:
            continue  # Skip standardizing this column
        if pd.api.types.is_string_dtype(df_copy[col]):
            # NOTE(review): astype(str) also stringifies missing values
            # (None -> 'none', NaN -> 'nan'); confirm downstream null checks
            # do not rely on them staying null.
            df_copy[col] = df_copy[col].astype(str).str.strip().str.lower()
    logging.info("Standardized DataFrame for comparison.")
    return df_copy

def find_common_and_non_common_columns(df1, df2):
    """
    Identify common and unique columns between two DataFrames.

    The returned lists follow the column order of the source DataFrames
    (common and df1-only columns in df1 order, df2-only columns in df2 order),
    so the result is the same on every run instead of depending on set
    iteration order. Repeated column labels are reported once.

    Args:
        df1 (pd.DataFrame): First DataFrame.
        df2 (pd.DataFrame): Second DataFrame.

    Returns:
        tuple: (common_columns, df1_unique_columns, df2_unique_columns)
    """
    # Sets give O(1) membership tests; dict.fromkeys de-duplicates while
    # preserving the original column order.
    df1_names = set(df1.columns)
    df2_names = set(df2.columns)
    df1_ordered = list(dict.fromkeys(df1.columns))
    df2_ordered = list(dict.fromkeys(df2.columns))

    common_columns = [col for col in df1_ordered if col in df2_names]
    df1_unique_columns = [col for col in df1_ordered if col not in df2_names]
    df2_unique_columns = [col for col in df2_ordered if col not in df1_names]
    logging.info(f"Found {len(common_columns)} common columns, {len(df1_unique_columns)} unique to first table, {len(df2_unique_columns)} unique to second table.")
    return common_columns, df1_unique_columns, df2_unique_columns

def find_mismatches(df_master, df_target, columns_to_check, master_key, target_key, table1, table2, duplicates_master, duplicates_target, non_imp_columns, column_mapping=None):
    """
    Identify mismatches between two DataFrames based on specified columns and key columns.

    Rows are paired with an inner join on the key (first occurrence of each key
    only), then each checked column is compared value by value. Two missing
    values count as equal; a missing value against a present one is a mismatch.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
        df_target (pd.DataFrame): Target DataFrame (prefixed table).
        columns_to_check (list): List of columns to apply mismatch checks.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        table1 (str): Name of the source table.
        table2 (str): Name of the target table.
        duplicates_master (pd.DataFrame): Duplicate keys in master table (not read here).
        duplicates_target (pd.DataFrame): Duplicate keys in target table (not read here).
        non_imp_columns (list): List of non-important columns to exclude.
        column_mapping (dict, optional): Mapping of master columns to target columns. Defaults to None.

    Returns:
        tuple: (mismatches, error_logs_m) — a list of mismatch dicts and a list
            of matching error-log dicts. Both are empty when a key column is missing.
    """
    mismatches = []
    error_logs_m = []
    # Ensure key columns are present in both DataFrames
    if master_key not in df_master.columns or target_key not in df_target.columns:
        logging.error(f"Key columns '{master_key}' or '{target_key}' not found in the respective tables.")
        return mismatches, error_logs_m

    # Rename target key to match master key for easier comparison
    df_target_renamed = df_target.rename(columns={target_key: master_key})

    # Merge DataFrames on the master_key, excluding duplicates
    merged_df = pd.merge(
        df_master.drop_duplicates(subset=master_key),
        df_target_renamed.drop_duplicates(subset=master_key),
        on=master_key,
        suffixes=(f'_{table1}', f'_{table2}'),
        how='inner'
    )

    logging.info(f"Merged DataFrame has {len(merged_df)} records for mismatch comparison.")

    # Resolve once, outside the row loop, which merged columns hold the master
    # and target value of every checked column.
    merged_columns = set(merged_df.columns)
    mapping = column_mapping or {}
    comparisons = []
    for master_col in columns_to_check:
        if master_col.startswith('_boltic_') or master_col in non_imp_columns:
            continue  # Skip columns starting with '_boltic_' or non-important columns

        # Determine corresponding target column (default: same name)
        target_col = mapping.get(master_col, master_col)
        master_value_col = f"{master_col}_{table1}"
        target_value_col = f"{target_col}_{table2}"

        if target_col != master_col:
            # pandas only appends the suffixes to column names present in BOTH
            # frames. A mapped pair has different names on each side, so either
            # side can appear in the merge result without a suffix.
            if master_col not in df_master.columns or target_col not in df_target_renamed.columns:
                continue  # One side of the mapping does not exist
            if master_value_col not in merged_columns:
                master_value_col = master_col
            if target_value_col not in merged_columns:
                target_value_col = target_col

        if master_value_col not in merged_columns or target_value_col not in merged_columns:
            continue  # Skip if columns not present
        comparisons.append((master_col, master_value_col, target_value_col))

    for _, row in merged_df.iterrows():
        key = row[master_key]
        for master_col, master_value_col, target_value_col in comparisons:
            val_master = row[master_value_col]
            val_target = row[target_value_col]

            # Handle NaN values in comparison
            if pd.isna(val_master) and pd.isna(val_target):
                continue  # Both are NaN, treat as equal
            if pd.isna(val_master) or pd.isna(val_target) or val_master != val_target:
                mismatches.append({
                    master_key: key,
                    'column': master_col,  # Report master column name
                    f'{table1}_value': val_master,
                    f'{table2}_value': val_target
                })
                error_logs_m.append({
                    'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'issue': 'mismatch',
                    'error_message': '',
                    'source_table': table1,
                    'target_table': table2,
                    'issue_column': master_col,
                    'unique_identifier': f'{master_key}: {key}'
                })

    logging.info(f"Found {len(mismatches)} mismatches between '{table1}' and '{table2}'.")
    return mismatches, error_logs_m

def find_duplicates(df, key_column, table_name):
    """
    Detect duplicate key_column entries in the DataFrame and identify differences.

    For every key that occurs more than once, the rows sharing that key are
    compared on all columns except the key itself and columns whose name starts
    with '_boltic_'. The result says either that the rows are identical or
    which columns differ (a missing value against a present one counts as a
    difference).

    Args:
        df (pd.DataFrame): The DataFrame to check.
        key_column (str): The key column to check for duplicates.
        table_name (str): Name of the table being checked.

    Returns:
        tuple: (duplicate_records_df, error_logs_m) — one row / one log entry per
            duplicated key. An empty DataFrame and an empty list are returned
            when the key column is missing.
    """
    if key_column not in df.columns:
        logging.error(f"Key column '{key_column}' not found in DataFrame.")
        return pd.DataFrame(), []

    # Get all duplicate entries (keep=False to get all duplicates)
    duplicates_df = df[df.duplicated(subset=key_column, keep=False)]

    duplicate_records = []
    error_logs_m = []

    for key, group in duplicates_df.groupby(key_column):
        # Drop key_column and any columns starting with '_boltic_'
        group_non_key = group.drop(columns=[key_column] + [col for col in group.columns if col.startswith('_boltic_')])

        # A column differs within the group when it holds more than one
        # distinct value. Identical rows give a count of exactly 1 per column,
        # so "no difference" means no column exceeds 1 (not that the counts
        # sum to 0). dropna=False makes null-vs-value count as a difference.
        distinct_counts = group_non_key.nunique(dropna=False)
        cols_with_diff = distinct_counts[distinct_counts > 1].index.tolist()
        if cols_with_diff:
            difference = "Difference in value of columns: " + ', '.join(cols_with_diff)
        else:
            difference = "No difference exists"

        duplicate_records.append({
            key_column: key,
            'Difference in value': difference
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'duplicate',
            'error_message': f'{difference}',
            'source_table': f'{table_name}',
            'target_table': '',
            'issue_column': '',
            'unique_identifier': f'{key_column}: {key}'
        })

    logging.info(f"Found {len(duplicate_records)} duplicate entries based on '{key_column}'.")
    return pd.DataFrame(duplicate_records), error_logs_m

def validate_data_types(schema_master, schema_target, master_key, table1_name, table2_name, columns_to_check):
    """
    Compare data types of specified columns between master and target schemas.

    Only columns listed in ``columns_to_check`` that exist in both schemas are
    compared. Issues are reported in the order of ``columns_to_check`` (each
    column once), so the output is the same on every run.

    Args:
        schema_master (dict): Schema of the master table (column name -> type).
        schema_target (dict): Schema of the target table (column name -> type).
        master_key (str): The key column for reference (not read here).
        table1_name (str): Name of the first table.
        table2_name (str): Name of the second table.
        columns_to_check (list): List of columns to validate data types.

    Returns:
        tuple: (data_type_issues_df, error_logs_m)
    """
    data_type_issues = []
    error_logs_m = []

    # dict.fromkeys removes repeated column names while keeping their order,
    # instead of iterating an unordered set.
    for column in dict.fromkeys(columns_to_check):
        if column not in schema_master or column not in schema_target:
            continue  # Only columns present in both schemas can be compared

        type_master = schema_master[column]
        type_target = schema_target[column]
        if type_master == type_target:
            continue

        data_type_issues.append({
            'column_name': column,
            f'{table1_name}_data_type': type_master,
            f'{table2_name}_data_type': type_target
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'data_type_issues',
            'error_message': f'{table1_name}_data_type: {type_master} , {table2_name}_data_type: {type_target}',
            'source_table': table1_name,
            'target_table': table2_name,
            'issue_column': column,
            'unique_identifier': ''
        })

    logging.info(f"Found {len(data_type_issues)} data type issues.")
    return pd.DataFrame(data_type_issues), error_logs_m

# Patterns used by the format checks, compiled once instead of on every row.
_GSTIN_PATTERN = re.compile(r'^[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z]{1}[A-Z0-9]{3}$')
_EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
_PINCODE_PATTERN = re.compile(r'^\d{6}$')

# One entry per supported check, evaluated in this order for every master row:
# (column, is_invalid(stripped_text), issue text for the report,
#  message for the error log, report the stripped text instead of the raw cell?)
_FORMAT_RULES = (
    ('gstin', lambda text: not _GSTIN_PATTERN.match(text),
     'Invalid GSTIN format', 'Invalid GSTIN format', False),
    ('email', lambda text: not _EMAIL_PATTERN.match(text),
     'Invalid email format', 'Invalid email format', False),
    ('pincode', lambda text: not _PINCODE_PATTERN.match(text),
     'Pincode must be exactly 6 digits', 'Pincode must be exactly 6 digits', False),
    ('address', lambda text: len(text) > 100,
     'Address exceeds 100 characters after stripping', 'Address exceeds 100 characters', True),
)


def _first_row_position_by_key(df_target, target_key):
    """Map each stringified, stripped target key to the position of its first row."""
    positions = {}
    stripped_keys = df_target[target_key].astype(str).str.strip()
    for position, key in enumerate(stripped_keys):
        positions.setdefault(key, position)  # keep the first occurrence only
    return positions


def validate_formats(df_master, df_target, key_column, target_key, target_table, master_table, columns_to_check):
    """
    Validate specific column formats using regular expressions and include corresponding target table values.

    Supported checks (each runs only when the column is listed in
    ``columns_to_check`` and exists in ``df_master``):
        - 'gstin':   must match the GSTIN pattern.
        - 'email':   must match the email pattern.
        - 'pincode': must be exactly 6 digits.
        - 'address': must not exceed 100 characters after stripping.

    Every cell is converted with ``str(...).strip()`` before it is checked, so a
    missing value is checked as its string form (e.g. 'None') and is reported
    as a format issue by the pattern-based checks.

    For each issue the value of the same column in the first target row whose
    stripped key equals the master key is attached; when the target has no such
    column the text "Column not present" is used, and when no target row has
    that key the text "'<target_key>' not present" is used.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        master_table (str): The name of the master table.
        columns_to_check (list): List of columns to validate formats.

    Returns:
        tuple: (format_issues_df, error_logs_m)
            format_issues_df (pd.DataFrame): Columns are key_column, 'column',
                'value', 'issue' and '<target_table>_value'; one row per issue,
                ordered by master row and then by check (gstin, email, pincode, address).
            error_logs_m (list): One error-log dictionary per issue.

    Raises:
        KeyError: If an applicable check runs and ``key_column`` is missing from
            ``df_master``, or an issue is found and ``target_key`` is missing
            from ``df_target``.
    """
    target_value_column = f'{target_table}_value'
    report_columns = [key_column, 'column', 'value', 'issue', target_value_column]
    issue_rows = []
    error_logs_m = []

    # Which checks apply does not depend on the row, so decide it once.
    active_rules = [
        rule for rule in _FORMAT_RULES
        if rule[0] in columns_to_check and rule[0] in df_master.columns
    ]

    # Built on the first issue only: the target key column is stringified a
    # single time instead of twice for every issue found.
    first_target_position = None

    if active_rules:
        for _, row in df_master.iterrows():
            key_value = str(row[key_column]).strip()

            for column, is_invalid, report_issue, log_message, report_stripped in active_rules:
                raw_value = row[column]
                text_value = str(raw_value).strip()
                if not is_invalid(text_value):
                    continue

                # Fetch corresponding target value
                if first_target_position is None:
                    first_target_position = _first_row_position_by_key(df_target, target_key)
                position = first_target_position.get(key_value)
                if position is None:
                    target_value = f"'{target_key}' not present"
                else:
                    target_row = df_target.iloc[position]
                    target_value = target_row[column] if column in target_row else "Column not present"

                issue_rows.append({
                    key_column: key_value,
                    'column': column,
                    # The address check reports the stripped text; the others report the raw cell.
                    'value': text_value if report_stripped else raw_value,
                    'issue': report_issue,
                    target_value_column: target_value
                })
                error_logs_m.append({
                    'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'issue': 'format_issue',
                    'error_message': log_message,
                    'source_table': master_table,
                    'target_table': '',
                    'issue_column': column,
                    'unique_identifier': f'{key_column}: {key_value}'
                })

    # Build the report once (growing it with pd.concat per issue is quadratic).
    # dtype=object stores the collected values without type inference.
    format_issues = pd.DataFrame(issue_rows, columns=report_columns, dtype=object)

    logging.info(f"Found {len(format_issues)} format issues.")
    return format_issues, error_logs_m

# def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
#     """
#     Validate pincode mapping by comparing with the all_india_PO_list reference table.
#     If a pincode issue is found in the master table, then check the corresponding pincode in the target table.

#     Args:
#         df_master (pd.DataFrame): The master DataFrame to validate.
#         df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
#         key_column (str): The key column in the master DataFrame.
#         target_key (str): The key column in the target DataFrame.
#         target_table (str): The name of the target table.
#         client (bigquery.Client): Initialized BigQuery client.
#         master_table (str): Name of the master table.

#     Returns:
#         tuple: (pincode_mapping_issues_df, error_logs_m)
#     """

#     error_logs_m = []
#     # Read the reference table from Analytics dataset
#     try:
#         reference_table = "all_india_PO_list"
#         reference_dataset = "analytics_data"
#         query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
#         reference_df = client.query(query).to_dataframe()
#         reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
#         reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
#         reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
#         logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
#     except Exception as e:
#         logging.error(f"Failed to load reference pincode mapping: {e}")
#         return pd.DataFrame(), error_logs_m

#     # Check if df_master has 'pincode', 'city', 'state' columns
#     required_columns = {'pincode', 'city', 'state'}
#     if not required_columns.issubset(df_master.columns):
#         logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
#         return pd.DataFrame(), error_logs_m

#     # Initialize the issues DataFrame with a single target table details column
#     pincode_mapping_issues = pd.DataFrame(columns=[
#         key_column, 'pincode', 'state', 'city', 'issue',
#         f'{target_table}_details'
#     ])

#     # Iterate over each row in df_master to validate pincode mapping
#     for idx, row in df_master.iterrows():
#         key_value = str(row[key_column]).strip()
#         pincode = str(row['pincode']).strip()
#         city = str(row['city']).strip().lower()
#         state = str(row['state']).strip().lower()

#         # Fetch corresponding target row if exists
#         target_row = df_target[df_target[target_key].astype(str).str.strip() == key_value]
#         if not target_row.empty:
#             target_row = target_row.iloc[0]
#             target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
#             target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
#             target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
#             target_details = f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"
#         else:
#             target_details = f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."

#         # Check if pincode exists in reference
#         ref_matches = reference_df[reference_df['pincode'] == pincode]
#         if ref_matches.empty:
#             issue = f"Invalid pincode ({pincode})."
#             pincode_mapping_issues = pd.concat([pincode_mapping_issues, pd.DataFrame([{
#                 key_column: key_value,
#                 'pincode': pincode,
#                 'state': state,
#                 'city': city,
#                 'issue': issue,
#                 f'{target_table}_details': target_details
#             }])], ignore_index=True)
#             error_detail = {
#                 'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
#                 'issue': 'pincode_mapping',
#                 'error_message': f"{issue}. {target_table} Details: {target_details}",
#                 'source_table': master_table,
#                 'target_table': target_table,
#                 'issue_column': 'pincode',
#                 'unique_identifier': f'{key_column}: {key_value}'
#             }
#             error_logs_m.append(error_detail)
#             continue

#         # Check if any of the reference entries match both the city and state
#         exact_match = ref_matches[
#             (ref_matches['city'] == city) & (ref_matches['state'] == state)
#         ]
#         if not exact_match.empty:
#             continue  # No issue, mapping is correct

#         # Check for state mismatch
#         state_matches = ref_matches[ref_matches['state'] == state]

#         # Check for city mismatch
#         city_matches = ref_matches[ref_matches['city'] == city]

#         if state_matches.empty and city_matches.empty:
#             # Both state and city do not match
#             expected_entries = ref_matches[['state', 'city']].drop_duplicates()
#             expected_states = expected_entries['state'].tolist()
#             expected_cities = expected_entries['city'].tolist()
#             expected_states_str = ', '.join(expected_states)
#             expected_cities_str = ', '.join(expected_cities)
#             issue = f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
#         elif state_matches.empty:
#             # State does not match
#             expected_states = ref_matches['state'].unique().tolist()
#             expected_states_str = ', '.join(expected_states)
#             issue = f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
#         elif city_matches.empty:
#             # City does not match
#             expected_cities = state_matches['city'].unique().tolist()
#             expected_cities_str = ', '.join(expected_cities)
#             issue = f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
#         else:
#             # Other cases
#             issue = f"Pincode {pincode} has a mapping inconsistency."

#         pincode_mapping_issues = pd.concat([pincode_mapping_issues, pd.DataFrame([{
#             key_column: key_value,
#             'pincode': pincode,
#             'state': state,
#             'city': city,
#             'issue': issue,
#             f'{target_table}_details': target_details
#         }])], ignore_index=True)
#         error_detail = {
#             'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
#             'issue': 'pincode_mapping',
#             'error_message': f"{issue}. {target_table} Details: {target_details}",
#             'source_table': master_table,
#             'target_table': target_table,
#             'issue_column': 'pincode',
#             'unique_identifier': f'{key_column}: {key_value}'
#         }
#         error_logs_m.append(error_detail)

#     logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
#     return pincode_mapping_issues, error_logs_m

def create_table(doc, data, column_names):
    """
    Render a list of record dictionaries as a table appended to a docx document.

    The first table row holds the column names as headers; each record then
    becomes one additional row. A cell shows the stripped string form of the
    record's value for that column, or an empty string when the record has no
    such key. Nothing is added to the document when ``data`` is empty.

    Args:
        doc (Document): The Word document object.
        data (list of dict): Records used to populate the table rows.
        column_names (list): Column names, used both as header texts and as
            the keys looked up in every record.
    """
    if not data:
        return

    def write_row(cells, texts):
        # Fill the cells of one table row, left to right.
        for position, text in enumerate(texts):
            cells[position].text = text

    column_count = len(column_names)
    table = doc.add_table(rows=1, cols=column_count)
    table.style = 'Light List Accent 1'

    # Header row first, then one appended row per record.
    write_row(table.rows[0].cells, column_names)
    for record in data:
        write_row(
            table.add_row().cells,
            [str(record.get(name, '')).strip() for name in column_names],
        )

    logging.info("Added table to the Word document.")

# def add_non_matching_keys_section(doc, df1_only_keys, table1_name, df2_only_keys, table2_name, key_column_master, key_column_target):
#     """
#     Add a section in the Word document for non-matching keys between two tables.

#     Args:
#         doc (Document): The Word document object.
#         df1_only_keys (list): Keys present only in table1.
#         table1_name (str): Name of the first table.
#         df2_only_keys (list): Keys present only in table2.
#         table2_name (str): Name of the second table.
#         key_column_master (str): The key column in the master table.
#         key_column_target (str): The key column in the target table.
#     """
#     if df1_only_keys or df2_only_keys:
#         if df1_only_keys:
#             doc.add_heading(f"'{key_column_master}' present only in '{table1_name}' and not in '{table2_name}' ({len(df1_only_keys)})", level=2)
#             create_table(doc, [{key_column_master: key[key_column_master]} for key in df1_only_keys], [key_column_master])
#         if df2_only_keys:
#             doc.add_heading(f"'{key_column_target}' present only in '{table2_name}' and not in '{table1_name}' ({len(df2_only_keys)})", level=2)
#             create_table(doc, [{key_column_target: key[key_column_target]} for key in df2_only_keys], [key_column_target])
#     else:
#         doc.add_paragraph("No non-matching keys found.")

# def add_table_of_contents(doc):
#     """
#     Adds a Table of Contents to the Word document.

#     Args:
#         doc (Document): The Word document object.
#     """
#     paragraph = doc.add_paragraph()
#     run = paragraph.add_run()
#     fldChar_begin = OxmlElement('w:fldChar')  # creates a new element
#     fldChar_begin.set(qn('w:fldCharType'), 'begin')  # sets attribute on element
#     instrText = OxmlElement('w:instrText')
#     instrText.set(qn('xml:space'), 'preserve')  # sets attribute on element
#     instrText.text = 'TOC \\o "1-2" \\h \\z \\u'  # change to what you need
#     fldChar_separate = OxmlElement('w:fldChar')
#     fldChar_separate.set(qn('w:fldCharType'), 'separate')
#     fldChar_end = OxmlElement('w:fldChar')
#     fldChar_end.set(qn('w:fldCharType'), 'end')
#     run._r.append(fldChar_begin)
#     run._r.append(instrText)
#     run._r.append(fldChar_separate)
#     run._r.append(fldChar_end)
#     logging.info("Added Table of Contents to the Word document.")

# def create_aggregated_document(all_results, base_name):
#     """
#     Creates a single Word document that presents all comparison results for a base table.

#     Args:
#         all_results (list): List of comparison result dictionaries.
#         base_name (str): The base name of the table.

#     Returns:
#         str: The filepath of the saved report.
#     """
#     doc = Document()
#     doc.add_heading(f'{base_name.capitalize()} Tables Comparison Report', level=0)
#     doc.add_paragraph(f'Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')

#     # Add Instruction for TOC Update
#     doc.add_paragraph(
#         "📌 **Note:** To update the Table of Contents and make the links clickable, go to the ‘References’ tab and click ‘Update Table’ or press F9 in Windows and Fn+F9 in Mac after opening this document in Microsoft Word.",
#         style='Intense Quote'
#     )

#     # Add Table of Contents
#     doc.add_heading('Table of Contents', level=1)
#     add_table_of_contents(doc)
#     doc.add_page_break()

#     for result in all_results:
#         table1_name = result['table1_name']
#         table2_name = result['table2_name']
#         key_column_master = result['key_column_master']
#         key_column_target = result['key_column_target']
#         doc.add_heading(f'Comparison: {table1_name} vs {table2_name}', level=1)

#         # Mismatches
#         if result['mismatches']:
#             doc.add_heading(f'Mismatches ({len(result["mismatches"])})', level=2)
#             column_names = [key_column_master, 'column', f'{table1_name}_value', f'{table2_name}_value']
#             create_table(doc, result['mismatches'], column_names)
#         else:
#             doc.add_heading("No mismatches found.", level=2)

#         # Null values in master table
#         if not result['null_values_master'].empty:
#             count_null_master = len(result['null_values_master'])
#             doc.add_heading(f'Null values in {table1_name} ({count_null_master})', level=2)
#             column_names = [key_column_master, 'column', table2_name]
#             create_table(doc, result['null_values_master'].to_dict('records'), column_names)
#         else:
#             doc.add_heading(f"No null values found in {table1_name}.", level=2)

#         # Null values in target table
#         if not result['null_values_target'].empty:
#             count_null_target = len(result['null_values_target'])
#             doc.add_heading(f'Null values in {table2_name} ({count_null_target})', level=2)
#             column_names = [key_column_target, 'column', table1_name]
#             create_table(doc, result['null_values_target'].to_dict('records'), column_names)
#         else:
#             doc.add_heading(f"No null values found in {table2_name}.", level=2)

#         # Duplicate keys in master table
#         if not result['duplicates_master'].empty:
#             count_dup_master = len(result['duplicates_master'])
#             doc.add_heading(f'Duplicate Keys in {table1_name} ({count_dup_master})', level=2)
#             create_table(doc, result['duplicates_master'].to_dict('records'), [key_column_master, 'Difference in value'])
#         else:
#             doc.add_heading("No duplicate keys found in master table.", level=2)

#         # Duplicate keys in target table
#         if not result['duplicates_target'].empty:
#             count_dup_target = len(result['duplicates_target'])
#             doc.add_heading(f'Duplicate Keys in {table2_name} ({count_dup_target})', level=2)
#             create_table(doc, result['duplicates_target'].to_dict('records'), [key_column_target, 'Difference in value'])
#         else:
#             doc.add_heading(f"No duplicate keys found in {table2_name}.", level=2)

#         # Data type issues
#         if not result['data_type_issues'].empty:
#             count_data_type_issues = len(result['data_type_issues'])
#             doc.add_heading(f'Data Type Issues ({count_data_type_issues})', level=2)
#             column_names = ['column_name', f'{table1_name}_data_type', f'{table2_name}_data_type']
#             create_table(doc, result['data_type_issues'].to_dict('records'), column_names)
#         else:
#             doc.add_heading("No data type issues found.", level=2)

#         # Format issues in master table with target values
#         if not result['format_issues_master'].empty:
#             count_format_issues_master = len(result['format_issues_master'])
#             doc.add_heading(f'Format Issues in {table1_name} ({count_format_issues_master})', level=2)
#             column_names_master = [key_column_master, 'column', 'value', 'issue', f'{table2_name}_value']
#             create_table(doc, result['format_issues_master'].to_dict('records'), column_names_master)
#         else:
#             doc.add_heading(f"No format issues found in {table1_name}.", level=2)

#         # Pincode Mapping Issues with target details
#         if not result['pincode_mapping_issues'].empty:
#             count_pincode_issues = len(result['pincode_mapping_issues'])
#             doc.add_heading(f'Pincode Mapping Issues in {table1_name} ({count_pincode_issues})', level=2)
#             column_names = [
#                 key_column_master, 'pincode', 'state', 'city', 'issue',
#                 f'{table2_name}_details'
#             ]
#             create_table(doc, result['pincode_mapping_issues'].to_dict('records'), column_names)
#         else:
#             doc.add_heading("No pincode mapping issues found.", level=2)

#         # Non-matching keys in master DataFrame
#         if result['df_master_only_keys']:
#             count_master_only = len(result['df_master_only_keys'])
#             doc.add_heading(f'Keys only in {table1_name} ({count_master_only})', level=2)
#             column_names = [key_column_master]
#             create_table(doc, result['df_master_only_keys'], column_names)
#         else:
#             doc.add_heading(f"No keys found only in {table1_name}.", level=2)

#         # Non-matching keys in target DataFrame
#         if result['df_target_only_keys']:
#             count_target_only = len(result['df_target_only_keys'])
#             doc.add_heading(f'Keys only in {table2_name} ({count_target_only})', level=2)
#             column_names = [key_column_target]
#             create_table(doc, result['df_target_only_keys'], column_names)
#         else:
#             doc.add_heading(f"No keys found only in {table2_name}.", level=2)

#         doc.add_page_break()  # Optional: Add a page break between comparisons

#     # Save the aggregated document to the current directory
#     timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
#     report_filename = f"{base_name}_comparison_report_aggregated_{timestamp}.docx"
#     doc.save(report_filename)
#     logging.info(f"Saved aggregated comparison report as '{report_filename}'.")

#     return report_filename  # Return the filename for further processing

# def send_slack_alert(message):
#     """
#     Send a message to a specified Slack channel.

#     Args:
#         message (str): The message to send.
#     """
#     if not slack_client:
#         logging.warning("Slack client is not initialized. Skipping Slack notification.")
#         return

#     try:
#         response = slack_client.chat_postMessage(
#             channel=SLACK_CHANNEL,
#             text=message
#         )
#         logging.info(f"Message sent to {SLACK_CHANNEL}: {response['ts']}")
#     except SlackApiError as e:
#         logging.error(f"Error sending message to Slack: {e.response['error']}")

# def upload_file_to_slack(filepath, title=None):
#     """
#     Upload a file to the specified Slack channel using files_upload_v2.

#     Args:
#         filepath (str): The path to the file to upload.
#         title (str, optional): The title for the uploaded file. Defaults to the file's basename.
#     """
#     if not slack_client:
#         logging.warning("Slack client is not initialized. Skipping file upload.")
#         return

#     try:
#         with open(filepath, 'rb') as f:
#             response = slack_client.files_upload_v2(
#                 channel=SLACK_CHANNEL,
#                 file=f,
#                 filename=os.path.basename(filepath),  # Explicitly set the filename with extension
#                 title=title if title else os.path.basename(filepath),  # Set the title
#                 initial_comment=title if title else "File uploaded."  # Optional: Add an initial comment
#             )

#         # Verify if the upload was successful
#         if response.get('ok'):
#             file_permalink = response['file']['permalink']
#             logging.info(f"File uploaded to Slack channel '{SLACK_CHANNEL}': {file_permalink}")
#         else:
#             logging.error(f"Failed to upload file to Slack: {response}")
#     except SlackApiError as e:
#         logging.error(f"Slack API Error during file upload: {e.response['error']}")
#     except Exception as e:
#         logging.error(f"Unexpected error during file upload: {e}")

# def find_non_matching_keys(df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table):
#     """
#     Identify keys present in df_master but not in df_target and vice versa, including duplicates.

#     Args:
#         df_master (pd.DataFrame): Source DataFrame.
#         df_target (pd.DataFrame): Target DataFrame.
#         master_key (str): The key column in the master table.
#         target_key (str): The key column in the target table.
#         duplicates_master (pd.DataFrame): Duplicate keys in master table.
#         duplicates_target (pd.DataFrame): Duplicate keys in target table.
#         master_table (str): Name of the master table.
#         target_table (str): Name of the target table.

#     Returns:
#         tuple: (master_only_keys, target_only_keys, error_logs_m)
#     """
#     error_logs_m = []
#     # Include all keys, including duplicates
#     keys_master = set(df_master[master_key].astype(str).str.strip())
#     keys_target = set(df_target[target_key].astype(str).str.strip())

#     # Keys present only in master
#     master_only = keys_master - keys_target
#     # Keys present only in target
#     target_only = keys_target - keys_master

#     logging.info(f"Found {len(master_only)} keys in source not in target and {len(target_only)} keys in target not in source.")

#     # Convert to list of dictionaries for consistency
#     master_only_keys = [{master_key: key} for key in master_only]
#     target_only_keys = [{target_key: key} for key in target_only]

#     # Log errors for keys only in master
#     for key in master_only:
#         error_detail = {
#             'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
#             'issue': 'missing_key',
#             'error_message': f"Key '{master_key}' with value '{key}' is present only in '{master_table}' and missing in '{target_table}'.",
#             'source_table': master_table,
#             'target_table': target_table, 
#             'issue_column': master_key,
#             'unique_identifier': f"{master_key}: {key}"
#         }
#         error_logs_m.append(error_detail)

#     # Log errors for keys only in target
#     for key in target_only:
#         error_detail = {
#             'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
#             'issue': 'missing_key',
#             'error_message': f"Key '{target_key}' with value '{key}' is present only in '{target_table}' and missing in '{master_table}'.",
#             'source_table': target_table,
#             'target_table': master_table,
#             'issue_column': target_key,
#             'unique_identifier': f"{target_key}: {key}"
#         }
#         error_logs_m.append(error_detail)

#     return master_only_keys, target_only_keys, error_logs_m

# def find_detailed_nulls(df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check):
#     """
#     Identify null values in both master and target tables for specified columns and fetch corresponding values or indicate missing keys.

#     Args:
#         df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
#         df_target (pd.DataFrame): Target DataFrame (prefixed table).
#         master_key (str): The key column in the master table.
#         target_key (str): The key column in the target table.
#         master_table (str): Name of the master table.
#         target_table (str): Name of the target table.
#         columns_to_check (list): List of columns to check for null values.

#     Returns:
#         tuple: (null_values_master, null_values_target, error_logs_m)
#     """
#     null_values_master = []
#     null_values_target = []
#     error_logs_m = []

#     # Find nulls in master
#     null_master = df_master[df_master[columns_to_check].isnull().any(axis=1)]
#     for idx, row in null_master.iterrows():
#         key_value = str(row[master_key]).strip()
#         for column in columns_to_check:
#             if column == master_key or column.startswith('_boltic_'):
#                 continue  # Skip key column and non-important columns
#             if column not in row:
#                 continue  # Skip if column is not in the row
#             if pd.isnull(row[column]):
#                 if key_value in df_target[target_key].astype(str).str.strip().values:
#                     target_row = df_target[df_target[target_key].astype(str).str.strip() == key_value].iloc[0]
#                     target_value = target_row[column] if column in target_row else "Column not present"
#                 else:
#                     target_value = f"'{target_key}' not present"
#                 null_record = {
#                     master_key: key_value,
#                     'column': column,
#                     target_table: target_value
#                 }
#                 error_detail = {
#                     'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
#                     'issue': 'null',
#                     'error_message': 'Null in columns',
#                     'source_table': master_table,
#                     'target_table': '',
#                     'issue_column': column,
#                     'unique_identifier': f'{master_key} : {key_value}'
#                 }
#                 error_logs_m.append(error_detail)
#                 null_values_master.append(null_record)

#     # Find nulls in target
#     null_target = df_target[df_target[columns_to_check].isnull().any(axis=1)]
#     for idx, row in null_target.iterrows():
#         key_value = str(row[target_key]).strip()
#         for column in columns_to_check:
#             if column == target_key or column.startswith('_boltic_'):
#                 continue  # Skip key column and non-important columns
#             if column not in row:
#                 continue  # Skip if column is not in the row
#             if pd.isnull(row[column]):
#                 if key_value in df_master[master_key].astype(str).str.strip().values:
#                     master_row = df_master[df_master[master_key].astype(str).str.strip() == key_value].iloc[0]
#                     master_value = master_row[column] if column in master_row else "Column not present"
#                 else:
#                     master_value = f"'{master_key}' not present"
#                 null_record = {
#                     target_key: key_value,
#                     'column': column,
#                     master_table: master_value
#                 }
#                 error_detail = {
#                     'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
#                     'issue': 'null',
#                     'error_message': 'Null in columns',
#                     'source_table': target_table,
#                     'target_table': '',
#                     'issue_column': column,
#                     'unique_identifier': f'{target_key} : {key_value}'
#                 }
#                 error_logs_m.append(error_detail)
#                 null_values_target.append(null_record)

#     logging.info(f"Found {len(null_values_master)} null values in master table '{master_table}'.")
#     logging.info(f"Found {len(null_values_target)} null values in target table '{target_table}'.")
#     return null_values_master, null_values_target, error_logs_m

def _describe_pincode_mismatch(ref_matches, pincode, city, state):
    """
    Describe how a (pincode, city, state) triple disagrees with the reference rows for that pincode.

    Args:
        ref_matches (pd.DataFrame): Reference rows for the pincode (lower-cased 'city' / 'state').
        pincode (str): The pincode being validated.
        city (str): Lower-cased city from the master row.
        state (str): Lower-cased state from the master row.

    Returns:
        str or None: The issue message, or None when some reference row matches both city and state.
    """
    same_city = ref_matches['city'] == city
    same_state = ref_matches['state'] == state
    if (same_city & same_state).any():
        return None  # Mapping is correct

    state_matches = ref_matches[same_state]
    city_matches = ref_matches[same_city]

    if state_matches.empty and city_matches.empty:
        # unique() keeps first-seen order and avoids listing one state once per city
        expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
        expected_cities_str = ', '.join(ref_matches['city'].unique().tolist())
        return f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
    if state_matches.empty:
        expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
        return f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
    if city_matches.empty:
        # Only cities listed under the matching state are offered as candidates
        expected_cities_str = ', '.join(state_matches['city'].unique().tolist())
        return f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
    # City and state each appear for this pincode, but never on the same reference row
    return f"Pincode {pincode} has a mapping inconsistency."


def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
    """
    Validate pincode mapping by comparing with the all_india_po_list reference table.
    For every master row with a pincode issue, the pincode/state/city of the first
    target row with the same (stripped, stringified) key is reported alongside it.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        client (bigquery.Client): Initialized BigQuery client.
        master_table (str): Name of the master table.

    Returns:
        tuple: (pincode_mapping_issues_df, error_logs_m). An empty DataFrame and an
        empty list are returned when the master table lacks the pincode/city/state
        columns or when the reference table cannot be loaded.
    """

    error_logs_m = []

    # Cheap structural check first: without these columns there is nothing to
    # validate, so the reference table does not need to be queried at all.
    required_columns = {'pincode', 'city', 'state'}
    if not required_columns.issubset(df_master.columns):
        logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
        return pd.DataFrame(), error_logs_m

    # Read the reference table from Analytics dataset
    try:
        reference_table = "all_india_po_list"
        reference_dataset = "analytics_data"
        query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
        reference_df = client.query(query).to_dataframe()
        reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
        reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
        reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
        logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
    except Exception as e:
        logging.error(f"Failed to load reference pincode mapping: {e}")
        return pd.DataFrame(), error_logs_m

    details_column = f'{target_table}_details'
    issue_columns = [key_column, 'pincode', 'state', 'city', 'issue', details_column]
    issue_rows = []  # Collected as dicts; the DataFrame is built once at the end

    # Row positions of each reference pincode, computed once instead of
    # scanning the whole reference table for every master row.
    reference_positions = reference_df.groupby('pincode').indices

    # Normalise the target keys once and remember the first row position per key.
    first_target_position = {}
    for position, normalized_key in enumerate(df_target[target_key].astype(str).str.strip()):
        first_target_position.setdefault(normalized_key, position)

    # Iterate over each row in df_master to validate pincode mapping
    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()
        pincode = str(row['pincode']).strip()
        city = str(row['city']).strip().lower()
        state = str(row['state']).strip().lower()

        # Check the pincode against the reference data
        matched_positions = reference_positions.get(pincode)
        if matched_positions is None:
            issue = f"Invalid pincode ({pincode})."
        else:
            issue = _describe_pincode_mismatch(reference_df.iloc[matched_positions], pincode, city, state)
            if issue is None:
                continue  # No issue, mapping is correct

        # Fetch corresponding target row if exists
        target_position = first_target_position.get(key_value)
        if target_position is not None:
            target_row = df_target.iloc[target_position]
            target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
            target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
            target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
            target_details = f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"
        else:
            target_details = f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."

        issue_rows.append({
            key_column: key_value,
            'pincode': pincode,
            'state': state,
            'city': city,
            'issue': issue,
            details_column: target_details
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'pincode_mapping',
            'error_message': f"{issue}. {target_table} Details: {target_details}",
            'source_table': master_table,
            'target_table': target_table,
            'issue_column': 'pincode',
            'unique_identifier': f'{key_column}: {key_value}'
        })

    pincode_mapping_issues = pd.DataFrame(issue_rows, columns=issue_columns)
    logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
    return pincode_mapping_issues, error_logs_m

def create_table(doc, data, column_names):
    """
    Render a list of record dictionaries as a styled table in a docx document.

    Nothing is added when ``data`` is empty. Missing keys render as an empty
    cell; every value is stringified and stripped before being written.

    Args:
        doc (Document): The Word document object.
        data (list of dict): Records to populate the table, one per row.
        column_names (list): Column names used as headers and as record keys.
    """
    if not data:
        return

    word_table = doc.add_table(rows=1, cols=len(column_names))
    word_table.style = 'Light List Accent 1'

    # Header row: one cell per requested column
    for header_cell, heading in zip(word_table.rows[0].cells, column_names):
        header_cell.text = heading

    # Body rows: look each column up in the record
    for record in data:
        new_cells = word_table.add_row().cells
        for position, field in enumerate(column_names):
            new_cells[position].text = str(record.get(field, '')).strip()

    logging.info("Added table to the Word document.")

def add_non_matching_keys_section(doc, df1_only_keys, table1_name, df2_only_keys, table2_name, key_column_master, key_column_target):
    """
    Write the "keys present on one side only" section of the Word report.

    For each side that has unmatched keys, a level-2 heading with the count is
    added, followed by a single-column table of those keys. When neither side
    has unmatched keys a single explanatory paragraph is written instead.

    Args:
        doc (Document): The Word document object.
        df1_only_keys (list): Dicts holding the keys present only in table1.
        table1_name (str): Name of the first table.
        df2_only_keys (list): Dicts holding the keys present only in table2.
        table2_name (str): Name of the second table.
        key_column_master (str): The key column in the master table.
        key_column_target (str): The key column in the target table.
    """
    if not (df1_only_keys or df2_only_keys):
        doc.add_paragraph("No non-matching keys found.")
        return

    # (unmatched keys, key column, table that has them, table that lacks them)
    sides = (
        (df1_only_keys, key_column_master, table1_name, table2_name),
        (df2_only_keys, key_column_target, table2_name, table1_name),
    )
    for unmatched, key_column, present_in, absent_from in sides:
        if not unmatched:
            continue
        doc.add_heading(f"'{key_column}' present only in '{present_in}' and not in '{absent_from}' ({len(unmatched)})", level=2)
        # Keep only the key field of every entry
        key_rows = [{key_column: entry[key_column]} for entry in unmatched]
        create_table(doc, key_rows, [key_column])

def add_table_of_contents(doc):
    """
    Append a Table of Contents field to the Word document.

    A new paragraph with a single run is added, and the run receives the four
    raw XML elements of a field: begin marker, TOC instruction, separator, and
    end marker, in that order.

    Args:
        doc (Document): The Word document object.
    """
    toc_run = doc.add_paragraph().add_run()

    def _field_char(char_type):
        # Build one <w:fldChar> marker of the requested type
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), char_type)
        return marker

    # Field instruction: the TOC switches control which headings are listed
    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = 'TOC \\o "1-2" \\h \\z \\u'  # change to what you need

    field_parts = [
        _field_char('begin'),
        instruction,
        _field_char('separate'),
        _field_char('end'),
    ]
    for part in field_parts:
        toc_run._r.append(part)

    logging.info("Added Table of Contents to the Word document.")

def _report_records(value):
    """
    Normalize one result entry to a list of record dictionaries.

    Args:
        value (pd.DataFrame or list or None): A comparison result entry.

    Returns:
        list: ``[]`` for None or an empty DataFrame, ``to_dict('records')`` for a
        non-empty DataFrame, and a shallow copy for any other iterable.
    """
    if value is None:
        return []
    if isinstance(value, pd.DataFrame):
        return [] if value.empty else value.to_dict('records')
    return list(value)


def _add_report_section(doc, records, found_heading, empty_heading, column_names):
    """Add '<found_heading> (<count>)' plus a table when records exist, else the 'nothing found' heading."""
    if records:
        doc.add_heading(f'{found_heading} ({len(records)})', level=2)
        create_table(doc, records, column_names)
    else:
        doc.add_heading(empty_heading, level=2)


def create_aggregated_document(all_results, base_name):
    """
    Creates a single Word document that presents all comparison results for a base table.

    Every result contributes one level-1 "Comparison" heading followed by ten
    level-2 sections (mismatches, nulls, duplicates, data types, formats,
    pincode mapping, one-sided keys) and a page break. Each section entry may be
    a DataFrame or a list of dictionaries; both are rendered the same way.

    Args:
        all_results (list): List of comparison result dictionaries.
        base_name (str): The base name of the table.

    Returns:
        str: The filepath of the saved report (written to the current directory).

    Raises:
        KeyError: If a result dictionary lacks one of the expected entries.
    """
    doc = Document()
    doc.add_heading(f'{base_name.capitalize()} Tables Comparison Report', level=0)
    doc.add_paragraph(f'Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')

    # Add Instruction for TOC Update
    doc.add_paragraph(
        "📌 **Note:** To update the Table of Contents and make the links clickable, go to the ‘References’ tab and click ‘Update Table’ or press F9 in Windows and Fn+F9 in Mac after opening this document in Microsoft Word.",
        style='Intense Quote'
    )

    # Add Table of Contents
    doc.add_heading('Table of Contents', level=1)
    add_table_of_contents(doc)
    doc.add_page_break()

    for result in all_results:
        table1_name = result['table1_name']
        table2_name = result['table2_name']
        key_column_master = result['key_column_master']
        key_column_target = result['key_column_target']
        doc.add_heading(f'Comparison: {table1_name} vs {table2_name}', level=1)

        # (result key, heading when found, heading when empty, table columns),
        # listed in the order the sections appear in the report.
        sections = [
            ('mismatches', 'Mismatches', "No mismatches found.",
             [key_column_master, 'column', f'{table1_name}_value', f'{table2_name}_value']),
            ('null_values_master', f'Null values in {table1_name}', f"No null values found in {table1_name}.",
             [key_column_master, 'column', table2_name]),
            ('null_values_target', f'Null values in {table2_name}', f"No null values found in {table2_name}.",
             [key_column_target, 'column', table1_name]),
            ('duplicates_master', f'Duplicate Keys in {table1_name}', "No duplicate keys found in master table.",
             [key_column_master, 'Difference in value']),
            ('duplicates_target', f'Duplicate Keys in {table2_name}', f"No duplicate keys found in {table2_name}.",
             [key_column_target, 'Difference in value']),
            ('data_type_issues', 'Data Type Issues', "No data type issues found.",
             ['column_name', f'{table1_name}_data_type', f'{table2_name}_data_type']),
            ('format_issues_master', f'Format Issues in {table1_name}', f"No format issues found in {table1_name}.",
             [key_column_master, 'column', 'value', 'issue', f'{table2_name}_value']),
            ('pincode_mapping_issues', f'Pincode Mapping Issues in {table1_name}', "No pincode mapping issues found.",
             [key_column_master, 'pincode', 'state', 'city', 'issue', f'{table2_name}_details']),
            ('df_master_only_keys', f'Keys only in {table1_name}', f"No keys found only in {table1_name}.",
             [key_column_master]),
            ('df_target_only_keys', f'Keys only in {table2_name}', f"No keys found only in {table2_name}.",
             [key_column_target]),
        ]
        for result_key, found_heading, empty_heading, column_names in sections:
            # Lists and DataFrames are both accepted; a list no longer trips over `.empty`
            records = _report_records(result[result_key])
            _add_report_section(doc, records, found_heading, empty_heading, column_names)

        doc.add_page_break()  # Optional: Add a page break between comparisons

    # Save the aggregated document to the current directory
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_filename = f"{base_name}_comparison_report_aggregated_{timestamp}.docx"
    doc.save(report_filename)
    logging.info(f"Saved aggregated comparison report as '{report_filename}'.")

    return report_filename  # Return the filename for further processing

def send_slack_alert(message):
    """
    Post a text message to the configured Slack channel.

    When no Slack client is configured a warning is logged and nothing is
    sent. Slack API errors are logged and not re-raised.

    Args:
        message (str): The message to send.
    """
    if slack_client:
        try:
            post_result = slack_client.chat_postMessage(channel=SLACK_CHANNEL, text=message)
            logging.info(f"Message sent to {SLACK_CHANNEL}: {post_result['ts']}")
        except SlackApiError as api_error:
            logging.error(f"Error sending message to Slack: {api_error.response['error']}")
    else:
        logging.warning("Slack client is not initialized. Skipping Slack notification.")

def upload_file_to_slack(filepath, title=None):
    """
    Upload a file to the configured Slack channel using files_upload_v2.

    When no Slack client is configured a warning is logged and nothing is
    uploaded. Slack API errors and any other exception raised while opening or
    uploading the file are logged and not re-raised.

    Args:
        filepath (str): The path to the file to upload.
        title (str, optional): The title for the uploaded file. Defaults to the file's basename.
    """
    if not slack_client:
        logging.warning("Slack client is not initialized. Skipping file upload.")
        return

    try:
        with open(filepath, 'rb') as file_handle:
            upload_kwargs = {
                'channel': SLACK_CHANNEL,
                'file': file_handle,
                # Explicit filename so the extension is kept
                'filename': os.path.basename(filepath),
                # A falsy title falls back to the basename / a generic comment
                'title': title or os.path.basename(filepath),
                'initial_comment': title or "File uploaded.",
            }
            upload_result = slack_client.files_upload_v2(**upload_kwargs)

        # Verify if the upload was successful
        if not upload_result.get('ok'):
            logging.error(f"Failed to upload file to Slack: {upload_result}")
        else:
            permalink = upload_result['file']['permalink']
            logging.info(f"File uploaded to Slack channel '{SLACK_CHANNEL}': {permalink}")
    except SlackApiError as api_error:
        logging.error(f"Slack API Error during file upload: {api_error.response['error']}")
    except Exception as unexpected:
        logging.error(f"Unexpected error during file upload: {unexpected}")

def _missing_key_error(key_column, key, present_table, missing_table):
    """Build the error-log entry for a key found in ``present_table`` but not in ``missing_table``."""
    return {
        'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'issue': 'missing_key',
        'error_message': f"Key '{key_column}' with value '{key}' is present only in '{present_table}' and missing in '{missing_table}'.",
        'source_table': present_table,
        'target_table': missing_table,
        'issue_column': key_column,
        'unique_identifier': f"{key_column}: {key}"
    }


def find_non_matching_keys(df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table):
    """
    Identify keys present in df_master but not in df_target and vice versa.

    Keys are compared as stripped strings. Results are sorted so the report and
    the error log come out in the same order on every run (plain set iteration
    order is arbitrary).

    Args:
        df_master (pd.DataFrame): Source DataFrame.
        df_target (pd.DataFrame): Target DataFrame.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        duplicates_master (pd.DataFrame): Duplicate keys in master table (accepted for interface compatibility; not used).
        duplicates_target (pd.DataFrame): Duplicate keys in target table (accepted for interface compatibility; not used).
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.

    Returns:
        tuple: (master_only_keys, target_only_keys, error_logs_m) where the first
        two are lists of single-entry dicts ``{key_column: key}`` and the third
        lists one 'missing_key' entry per one-sided key (master side first).
    """
    # Sets collapse duplicate keys; only presence matters here
    keys_master = set(df_master[master_key].astype(str).str.strip())
    keys_target = set(df_target[target_key].astype(str).str.strip())

    # key=str keeps sorting well-defined even if a non-string value slips through
    master_only = sorted(keys_master - keys_target, key=str)
    target_only = sorted(keys_target - keys_master, key=str)

    logging.info(f"Found {len(master_only)} keys in source not in target and {len(target_only)} keys in target not in source.")

    # Convert to list of dictionaries for consistency
    master_only_keys = [{master_key: key} for key in master_only]
    target_only_keys = [{target_key: key} for key in target_only]

    # Log errors for keys only in master, then for keys only in target
    error_logs_m = [_missing_key_error(master_key, key, master_table, target_table) for key in master_only]
    error_logs_m.extend(_missing_key_error(target_key, key, target_table, master_table) for key in target_only)

    return master_only_keys, target_only_keys, error_logs_m

def _first_positions_by_key(df, key_column):
    """Map each stripped, stringified key of ``df`` to the position of its first row."""
    positions = {}
    for position, key in enumerate(df[key_column].astype(str).str.strip()):
        positions.setdefault(key, position)
    return positions


def _collect_null_records(df_source, df_other, source_key, other_key, source_table, other_table, columns_to_check):
    """
    Report every null cell of ``df_source`` in the checked columns.

    Each null is paired with the value of the same column in the first row of
    ``df_other`` that has the same key, or with a placeholder when the key or
    the column is absent there.

    Returns:
        tuple: (null_records, error_logs) for the source table.
    """
    null_records = []
    error_logs = []

    # Skip the key column, '_boltic_' columns, and columns this table does not
    # have; selecting a missing column would otherwise raise KeyError.
    reportable = [
        column for column in columns_to_check
        if column in df_source.columns
        and column != source_key
        and not column.startswith('_boltic_')
    ]
    if not reportable:
        return null_records, error_logs

    rows_with_nulls = df_source[df_source[reportable].isnull().any(axis=1)]
    if rows_with_nulls.empty:
        return null_records, error_logs

    # Built once, instead of re-stringifying the whole key column per null cell
    other_positions = _first_positions_by_key(df_other, other_key)

    for _, row in rows_with_nulls.iterrows():
        key_value = str(row[source_key]).strip()
        other_position = other_positions.get(key_value)
        other_row = None if other_position is None else df_other.iloc[other_position]

        for column in reportable:
            if not pd.isnull(row[column]):
                continue
            if other_row is None:
                other_value = f"'{other_key}' not present"
            elif column in other_row:
                other_value = other_row[column]
            else:
                other_value = "Column not present"

            null_records.append({
                source_key: key_value,
                'column': column,
                other_table: other_value
            })
            error_logs.append({
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'issue': 'null',
                'error_message': 'Null in columns',
                'source_table': source_table,
                'target_table': '',
                'issue_column': column,
                'unique_identifier': f'{source_key} : {key_value}'
            })

    return null_records, error_logs


def find_detailed_nulls(df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check):
    """
    Identify null values in both master and target tables for specified columns and
    fetch the corresponding value from the other table (or indicate a missing key/column).

    Columns listed in ``columns_to_check`` that a table does not have are skipped
    for that table, as are the key column and columns starting with '_boltic_'.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
        df_target (pd.DataFrame): Target DataFrame (prefixed table).
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.
        columns_to_check (list): List of columns to check for null values.

    Returns:
        tuple: (null_values_master, null_values_target, error_logs_m), where the
        first two are lists of dicts and the error log lists master entries first.
    """
    # Find nulls in master
    null_values_master, master_logs = _collect_null_records(
        df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check
    )
    # Find nulls in target
    null_values_target, target_logs = _collect_null_records(
        df_target, df_master, target_key, master_key, target_table, master_table, columns_to_check
    )
    error_logs_m = master_logs + target_logs

    logging.info(f"Found {len(null_values_master)} null values in master table '{master_table}'.")
    logging.info(f"Found {len(null_values_target)} null values in target table '{target_table}'.")
    return null_values_master, null_values_target, error_logs_m

def _describe_pincode_mismatch(ref_matches, pincode, city, state):
    """
    Describe how a (pincode, city, state) triple disagrees with the reference rows for that pincode.

    Args:
        ref_matches (pd.DataFrame): Reference rows for the pincode (lower-cased 'city' / 'state').
        pincode (str): The pincode being validated.
        city (str): Lower-cased city from the master row.
        state (str): Lower-cased state from the master row.

    Returns:
        str or None: The issue message, or None when some reference row matches both city and state.
    """
    same_city = ref_matches['city'] == city
    same_state = ref_matches['state'] == state
    if (same_city & same_state).any():
        return None  # Mapping is correct

    state_matches = ref_matches[same_state]
    city_matches = ref_matches[same_city]

    if state_matches.empty and city_matches.empty:
        # unique() keeps first-seen order and avoids listing one state once per city
        expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
        expected_cities_str = ', '.join(ref_matches['city'].unique().tolist())
        return f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
    if state_matches.empty:
        expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
        return f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
    if city_matches.empty:
        # Only cities listed under the matching state are offered as candidates
        expected_cities_str = ', '.join(state_matches['city'].unique().tolist())
        return f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
    # City and state each appear for this pincode, but never on the same reference row
    return f"Pincode {pincode} has a mapping inconsistency."


def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
    """
    Validate pincode mapping by comparing with the all_india_po_list reference table.
    For every master row with a pincode issue, the pincode/state/city of the first
    target row with the same (stripped, stringified) key is reported alongside it.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        client (bigquery.Client): Initialized BigQuery client.
        master_table (str): Name of the master table.

    Returns:
        tuple: (pincode_mapping_issues_df, error_logs_m). An empty DataFrame and an
        empty list are returned when the master table lacks the pincode/city/state
        columns or when the reference table cannot be loaded.
    """

    error_logs_m = []

    # Cheap structural check first: without these columns there is nothing to
    # validate, so the reference table does not need to be queried at all.
    required_columns = {'pincode', 'city', 'state'}
    if not required_columns.issubset(df_master.columns):
        logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
        return pd.DataFrame(), error_logs_m

    # Read the reference table from Analytics dataset
    try:
        reference_table = "all_india_po_list"
        reference_dataset = "analytics_data"
        query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
        reference_df = client.query(query).to_dataframe()
        reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
        reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
        reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
        logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
    except Exception as e:
        logging.error(f"Failed to load reference pincode mapping: {e}")
        return pd.DataFrame(), error_logs_m

    details_column = f'{target_table}_details'
    issue_columns = [key_column, 'pincode', 'state', 'city', 'issue', details_column]
    issue_rows = []  # Collected as dicts; the DataFrame is built once at the end

    # Row positions of each reference pincode, computed once instead of
    # scanning the whole reference table for every master row.
    reference_positions = reference_df.groupby('pincode').indices

    # Normalise the target keys once and remember the first row position per key.
    first_target_position = {}
    for position, normalized_key in enumerate(df_target[target_key].astype(str).str.strip()):
        first_target_position.setdefault(normalized_key, position)

    # Iterate over each row in df_master to validate pincode mapping
    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()
        pincode = str(row['pincode']).strip()
        city = str(row['city']).strip().lower()
        state = str(row['state']).strip().lower()

        # Check the pincode against the reference data
        matched_positions = reference_positions.get(pincode)
        if matched_positions is None:
            issue = f"Invalid pincode ({pincode})."
        else:
            issue = _describe_pincode_mismatch(reference_df.iloc[matched_positions], pincode, city, state)
            if issue is None:
                continue  # No issue, mapping is correct

        # Fetch corresponding target row if exists
        target_position = first_target_position.get(key_value)
        if target_position is not None:
            target_row = df_target.iloc[target_position]
            target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
            target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
            target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
            target_details = f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"
        else:
            target_details = f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."

        issue_rows.append({
            key_column: key_value,
            'pincode': pincode,
            'state': state,
            'city': city,
            'issue': issue,
            details_column: target_details
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'pincode_mapping',
            'error_message': f"{issue}. {target_table} Details: {target_details}",
            'source_table': master_table,
            'target_table': target_table,
            'issue_column': 'pincode',
            'unique_identifier': f'{key_column}: {key_value}'
        })

    pincode_mapping_issues = pd.DataFrame(issue_rows, columns=issue_columns)
    logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
    return pincode_mapping_issues, error_logs_m

def compare_tables(client, dataset_name, base_name, master_table, target_table, master_key, target_key, column_mapping=None):
    """
    Compare a master table against one target table and collect every check result.

    Both tables are loaded from BigQuery, standardized (key columns excluded),
    optionally filtered by the base table's 'active_filter', and then run through
    the duplicate / mismatch / null / data-type / format / pincode checks when
    'perform_checks' is enabled for the base table. The non-matching-key check
    always runs. Error details returned by each check are appended to the
    module-level ERROR_LOG_M list as a side effect.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        base_name (str): The base name of the table (key into BASE_TABLES,
            Imp_columns and Non_imp_columns).
        master_table (str): Name of the master_hub_ table.
        target_table (str): Name of the target prefixed table.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        column_mapping (dict, optional): Maps a master column name to the
            differently named target column it should be compared with.
            Defaults to None (no mapping).

    Returns:
        dict or None: A dictionary containing all comparison results, or None
        when either table is empty (after filtering) or the two tables share
        no column names.
    """
    logging.info(f"Starting comparison for base table '{base_name}': '{master_table}' vs '{target_table}'.")

    # Initialize comparison results with empty defaults so the result dict is
    # fully populated even when perform_checks is False and the checks are skipped.
    mismatches = []
    null_values_master = []
    null_values_target = []
    data_type_issues = pd.DataFrame()
    format_issues_master = pd.DataFrame()
    pincode_mapping_issues = pd.DataFrame()
    duplicates_master = pd.DataFrame()
    duplicates_target = pd.DataFrame()
    master_only_keys = []
    target_only_keys = []

    # Load data
    df_master = load_table_from_bigquery(client, dataset_name, master_table)
    df_target = load_table_from_bigquery(client, dataset_name, target_table)

    # Apply standardization; the key columns are excluded so key values are left as loaded
    df_master = standardize_dataframe(df_master, exclude_columns=[master_key])
    df_target = standardize_dataframe(df_target, exclude_columns=[target_key])

    # Apply active filter if defined
    base_table_info = BASE_TABLES.get(base_name, {})
    active_filter = base_table_info.get('active_filter')
    perform_checks = base_table_info.get('perform_checks', True)

    if active_filter:
        column = active_filter.get('column')
        value = active_filter.get('value')
        if column and column in df_master.columns:
            # Only the master table is filtered; the target table is left untouched.
            initial_count = len(df_master)
            df_master = df_master[df_master[column] == value]
            filtered_count = len(df_master)
            logging.info(f"Filtered '{base_name}' master table: {initial_count - filtered_count} records excluded based on {column} = {value}.")
        else:
            logging.warning(f"Active filter specified but column '{column}' not found in master table '{master_table}'.")

    # Nothing to compare if either side has no rows (this also covers a failed load,
    # assuming load_table_from_bigquery returns an empty DataFrame on error — TODO confirm).
    if df_master.empty or df_target.empty:
        logging.warning(f"One of the tables '{master_table}' or '{target_table}' is empty. Skipping comparison.")
        return None

    # Identify BigNumeric columns in master and target tables
    schema_master = get_table_schema(client, dataset_name, master_table)
    schema_target = get_table_schema(client, dataset_name, target_table)
    bignumeric_columns_master = [col for col, dtype in schema_master.items() if dtype == 'BIGNUMERIC']
    bignumeric_columns_target = [col for col, dtype in schema_target.items() if dtype == 'BIGNUMERIC']

    # Format BigNumeric columns in master table as fixed-point strings with no decimals
    for col in bignumeric_columns_master:
        if col in df_master.columns:
            df_master[col] = df_master[col].apply(lambda x: format(x, '.0f') if pd.notnull(x) else x)

    # Format BigNumeric columns in target table the same way so both sides compare as equal text
    for col in bignumeric_columns_target:
        if col in df_target.columns:
            df_target[col] = df_target[col].apply(lambda x: format(x, '.0f') if pd.notnull(x) else x)

    # Get imp_columns and non_imp_columns
    imp_columns = Imp_columns.get(base_name, None)
    non_imp_columns = Non_imp_columns.get(base_name, [])

    # Identify common columns (the two "unique" lists are not used further in this function)
    common_columns, master_unique_cols, target_unique_cols = find_common_and_non_common_columns(df_master, df_target)

    if not common_columns:
        logging.warning(f"No common columns found between '{master_table}' and '{target_table}'. Skipping comparison.")
        return None

    # Apply column mapping if provided
    if column_mapping:
        # Normalize each mapped master/target column pair to stripped, lower-cased strings
        for master_col, target_col in column_mapping.items():
            if master_col in df_master.columns and target_col in df_target.columns:
                # Ensure both columns are standardized
                df_master[master_col] = df_master[master_col].astype(str).str.strip().str.lower()
                df_target[target_col] = df_target[target_col].astype(str).str.strip().str.lower()
        # Adjust common_columns: when both names of a mapped pair are common columns,
        # keep only the master name so the pair is checked once.
        # NOTE(review): if a mapping entry has master_col == target_col, the second
        # remove() below raises KeyError — confirm such mappings cannot occur.
        adjusted_common_columns = set(common_columns)
        for master_col, target_col in column_mapping.items():
            if master_col in adjusted_common_columns and target_col in adjusted_common_columns:
                adjusted_common_columns.remove(master_col)
                adjusted_common_columns.remove(target_col)
                adjusted_common_columns.add(master_col)  # Use master_col as the unified name
        common_columns = list(adjusted_common_columns)

    # Determine columns to check based on imp_columns:
    # an explicit important-column list wins; otherwise check everything except non_imp_columns.
    if imp_columns:
        columns_to_check = [col for col in imp_columns if col in common_columns]
        logging.info(f"Important columns defined for '{base_name}': {columns_to_check}")
    else:
        columns_to_check = [col for col in common_columns if col not in non_imp_columns]
        logging.info(f"No important columns defined for '{base_name}'. Applying checks to all columns except non_imp_columns: {columns_to_check}")

    if perform_checks:
        # Find duplicates in both tables
        duplicates_master, error_logs_m = find_duplicates(df_master, master_key, master_table)
        ERROR_LOG_M.extend(error_logs_m)
        duplicates_target, error_logs_m = find_duplicates(df_target, target_key, target_table)
        ERROR_LOG_M.extend(error_logs_m)

    # When perform_checks is False these are still the empty DataFrames created above,
    # so neither warning fires.
    if not duplicates_master.empty:
        logging.warning(f"Duplicate keys found in source table '{master_table}'. These will be reported but not used in mismatch comparison.")
    if not duplicates_target.empty:
        logging.warning(f"Duplicate keys found in target table '{target_table}'. These will be reported but not used in mismatch comparison.")

    # Perform mismatch comparison
    if perform_checks:
        mismatches, error_logs_m = find_mismatches(
            df_master,
            df_target,
            columns_to_check,
            master_key,
            target_key,
            master_table,
            target_table,
            duplicates_master,
            duplicates_target,
            non_imp_columns,
            column_mapping  # Pass column_mapping to handle differently named columns
        )
        ERROR_LOG_M.extend(error_logs_m)

    # Find detailed null values in both tables
    if perform_checks:
        null_values_master, null_values_target, error_logs_m = find_detailed_nulls(
            df_master,
            df_target,
            master_key,
            target_key,
            master_table,
            target_table,
            columns_to_check  # Pass columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

    # Validate data types between master and target schemas
    if perform_checks:
        data_type_issues, error_logs_m = validate_data_types(
            schema_master,
            schema_target,
            master_key,
            master_table,
            target_table,
            columns_to_check  # Pass columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

    # Validate formats in master table only
    if perform_checks:
        format_issues_master, error_logs_m = validate_formats(
            df_master,
            df_target,
            master_key,
            target_key,
            target_table,
            master_table,
            columns_to_check  # Pass columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Validate pincode mapping if applicable (only when the master table
        # carries all three of pincode, city and state)
        pincode_mapping_issues = pd.DataFrame()
        if {'pincode', 'city', 'state'}.issubset(df_master.columns):
            pincode_mapping_issues, error_logs_m = validate_pincode_mapping(
                df_master,
                df_target,
                master_key,
                target_key,
                target_table,
                client,
                master_table
            )
            ERROR_LOG_M.extend(error_logs_m)

    # Find non-matching keys — this runs regardless of perform_checks
    master_only_keys, target_only_keys, error_logs_m = find_non_matching_keys(
        df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table
    )
    ERROR_LOG_M.extend(error_logs_m)

    # Compile results (null-value lists are wrapped in DataFrames; the loaded
    # tables themselves are returned too)
    results = {
        'mismatches': mismatches,
        'null_values_master': pd.DataFrame(null_values_master),
        'null_values_target': pd.DataFrame(null_values_target),
        'duplicates_master': duplicates_master,
        'duplicates_target': duplicates_target,
        'data_type_issues': data_type_issues,
        'format_issues_master': format_issues_master,
        'pincode_mapping_issues': pincode_mapping_issues,
        'key_column_master': master_key,
        'key_column_target': target_key,
        'df_master_only_keys': master_only_keys,
        'df_target_only_keys': target_only_keys,
        'table1_name': master_table,
        'table2_name': target_table,
        'df_master': df_master,
        'df_target': df_target
    }

    logging.info(f"Completed comparison for '{master_table}' vs '{target_table}'.")
    return results

def generate_string_schema(df):
    """
    Build a BigQuery schema in which every DataFrame column is a nullable STRING.

    Args:
        df (pd.DataFrame): The DataFrame whose columns define the schema.

    Returns:
        list: One SchemaField per column, in column order, each of type STRING
        with mode NULLABLE.
    """
    string_fields = []
    for column_name in df.columns:
        field = SchemaField(column_name, "STRING", mode="NULLABLE")
        string_fields.append(field)
    return string_fields

def _upload_dataframe_to_bigquery(client, analytics_dataset, table_name, df):
    """
    Helper function to upload a DataFrame to BigQuery, replacing the table contents.

    Every column is cast to string and loaded with an all-STRING schema, and the
    destination table is overwritten (WRITE_TRUNCATE). A failed upload is logged
    rather than raised, so the caller continues.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the Analytics dataset.
        table_name (str): The name of the table to upload.
        df (pd.DataFrame): The DataFrame to upload.

    Returns:
        None
    """
    if df.empty:
        logging.info(f"No data to upload for '{table_name}'. Skipping.")
        return

    # Convert all columns to string type so they match the all-STRING schema.
    # NOTE(review): astype(str) also turns missing values into literal text
    # (e.g. 'nan' / 'None') instead of NULLs — confirm this is acceptable downstream.
    df = df.astype(str)

    # Generate BigQuery schema with all fields as STRING
    schema = generate_string_schema(df)

    # Ensure table name doesn't exceed BigQuery's maximum length (1,024 characters).
    # Truncate plainly: appending '...' would put dots in the name, which are not
    # valid table-name characters and would be read as extra separators in the
    # 'project.dataset.table' ID built below.
    max_table_name_length = 1024
    if len(table_name) > max_table_name_length:
        original_table_name = table_name
        table_name = table_name[:max_table_name_length]
        logging.warning(f"Table name truncated from '{original_table_name}' to '{table_name}' due to length constraints.")

    # Define the full table ID
    table_id = f"{client.project}.{analytics_dataset}.{table_name}"

    # Upload the DataFrame to BigQuery
    try:
        job = client.load_table_from_dataframe(
            df,
            table_id,
            job_config=bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
                schema=schema  # Using the provided schema with all fields as STRING
            )
        )
        job.result()  # Wait for the job to complete
        logging.info(f"Successfully uploaded '{table_id}' with {len(df)} records.")
    except Exception as e:
        # Best-effort upload: report the failure but do not abort the caller.
        logging.error(f"Failed to upload '{table_id}' to BigQuery: {e}")

def upload_comparison_results_to_bigquery(client, analytics_dataset, ERROR_LOG_M):
    """
    Upload the collected error log to the 'error_logs_master_hub' table.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the Analytics dataset.
        ERROR_LOG_M (list | pd.DataFrame | None): The error log, either as a
            list of dictionaries or as an already-built DataFrame.

    Returns:
        None
    """
    # Nothing was passed at all
    if ERROR_LOG_M is None:
        logging.info("No error logs present.")
        return

    # Normalise the supported input shapes to a DataFrame
    if isinstance(ERROR_LOG_M, pd.DataFrame):
        log_frame = ERROR_LOG_M
    elif isinstance(ERROR_LOG_M, list):
        log_frame = pd.DataFrame(ERROR_LOG_M)
    else:
        logging.warning("Unsupported data type for ERROR_LOG. Skipping upload.")
        log_frame = None

    # Unsupported input or an empty log: report and stop
    if log_frame is None or log_frame.empty:
        logging.info("No error logs to upload.")
        return

    _upload_dataframe_to_bigquery(client, analytics_dataset, "error_logs_master_hub", log_frame)

def main():
    """
    Main function to orchestrate the comparison of multiple base tables against their master_hub_ counterparts.

    For every configured base table that has a master table and at least one
    target table in the dataset, each master/target pair is compared, a summary
    is posted to Slack per pair, and one aggregated report per base table is
    generated, uploaded to Slack and then deleted locally. After all base tables
    are processed, the accumulated ERROR_LOG_M is uploaded to BigQuery.

    Any unexpected exception is logged, reported to Slack with its traceback,
    and the process exits with status 1.
    """
    try:
        # Initialize BigQuery client
        try:
            client = get_bigquery_client(PROJECT_ID)
        except Exception:
            # Without a client nothing else can run; return quietly (no Slack alert, no exit code).
            logging.error("Exiting due to BigQuery client initialization failure.")
            return

        # Find common tables with 'master_hub_' and other prefixes, passing BASE_TABLES
        common_tables = find_common_tables_with_master_hub(client, DATASET_ID, PREFIXES, BASE_TABLES)

        if not common_tables:
            logging.info("No common tables found with 'master_hub_' and the specified prefixes.")
            return

        # Iterate over each base table and perform comparisons
        for base_name, tables in common_tables.items():
            base_table_info = BASE_TABLES.get(base_name)
            if not base_table_info:
                logging.warning(f"No configuration found for base table '{base_name}'. Skipping.")
                continue

            master_key = base_table_info.get('master_key')
            target_tables = base_table_info.get('targets', {})
            column_mapping = base_table_info.get('column_mapping', {})


            # 'tables' maps 'master_hub_' and each found prefix to the actual table name
            master_table = tables.get('master_hub_')
            if not master_table:
                logging.warning(f"Master table 'master_hub_{base_name}' not found. Skipping.")
                continue

            # Results of every master/target comparison for this base table
            all_results = []

            # Iterate through each prefix and its corresponding target_key
            for prefix, target_key in target_tables.items():
                target_table = tables.get(prefix)
                if not target_table:
                    logging.warning(f"Target table with prefix '{prefix}' for base table '{base_name}' not found. Skipping.")
                    continue

                comparison_result = compare_tables(
                    client, 
                    DATASET_ID, 
                    base_name, 
                    master_table, 
                    target_table, 
                    master_key, 
                    target_key,  # Pass the correct target_key per prefix
                    column_mapping  # Pass the column_mapping
                )
                # compare_tables returns None when the comparison was skipped
                if comparison_result:
                    all_results.append(comparison_result)

                    # Prepare and send a separate Slack message for each comparison
                    total_mismatches = len(comparison_result['mismatches'])
                    total_nulls_master = len(comparison_result['null_values_master'])
                    total_nulls_target = len(comparison_result['null_values_target'])
                    total_dup_master = len(comparison_result['duplicates_master'])
                    total_dup_target = len(comparison_result['duplicates_target'])
                    total_data_type_issues = len(comparison_result['data_type_issues'])
                    total_format_issues_master = len(comparison_result['format_issues_master'])
                    total_pincode_issues = len(comparison_result['pincode_mapping_issues'])
                    total_non_matching_source = len(comparison_result.get('df_master_only_keys', []))
                    total_non_matching_target = len(comparison_result.get('df_target_only_keys', []))

                    message = (
                        f"✅ *Comparison Report Generated for `{base_name}`*\n"
                        f"*Tables Compared: `{comparison_result['table1_name']}` vs `{comparison_result['table2_name']}`*\n"
                        f"- *Total Mismatches between values of same column name of both tables : `{total_mismatches}`*\n"
                        f"- *Total Null Values in `{comparison_result['table1_name']}`: `{total_nulls_master}`*\n"
                        f"- *Total Null Values in `{comparison_result['table2_name']}`: `{total_nulls_target}`*\n"
                        f"- *Duplicate `{master_key}` in `{comparison_result['table1_name']}`: `{total_dup_master}`*\n"
                        f"- *Duplicate `{target_key}` in `{comparison_result['table2_name']}`: `{total_dup_target}`*\n"
                        f"- *Total Data Type Issues(mismatch between datatype in columns with same name of both tables): `{total_data_type_issues}`*\n"
                        f"- *Total Format/Value Issues(gstin, email, pincode) in `{comparison_result['table1_name']}`: `{total_format_issues_master}`*\n"
                        f"- *Total Pincode Mapping Issues in `{comparison_result['table1_name']}`: `{total_pincode_issues}`*\n"
                         "- *Non-Matching Keys*:\n"
                        f"--*`{master_key}` only in `{comparison_result['table1_name']}` and not in `{comparison_result['table2_name']}`:`{total_non_matching_source}`,*\n"
                        f"--*`{target_key}` only in `{comparison_result['table2_name']}` and not in `{comparison_result['table1_name']}`:`{total_non_matching_target}`*"
                    )

                    send_slack_alert(message)

            if all_results:
                # Generate aggregated report for the base name and get the filepath
                report_filepath = create_aggregated_document(all_results, base_name)

                # Upload the report to Slack using the updated function
                upload_file_to_slack(report_filepath, title=f"{base_name.capitalize()} Comparison Report")

                # Remove the local report file after the upload call returns
                try:
                    os.remove(report_filepath)
                    logging.info(f"Removed local report file '{report_filepath}'.")
                except Exception as e:
                    logging.error(f"Failed to remove local report file '{report_filepath}': {e}")
                # Pause 30 seconds before the next base table
                # NOTE(review): presumably to stay under Slack rate limits — confirm.
                time.sleep(30)
            else:
                logging.info(f"No comparison results to report for base name '{base_name}'.")


        # Upload error logs to BigQuery after all comparisons
        upload_comparison_results_to_bigquery(
            client, 
            'analytics_data',
            ERROR_LOG_M
            )

        logging.info("All comparisons completed.")
    except Exception as e:
        # Capture the full traceback
        tb = traceback.format_exc()
        logging.error("An unexpected error occurred in the main process.", exc_info=True)

        # Prepare a detailed error message for Slack
        error_message = (
            f"❌ *Comparison Process Failed*\n"
            f"*Error:* {str(e)}\n"
            f"*Traceback:*\n```{tb}```"
        )
        send_slack_alert(error_message)

        # Exit the script with a non-zero status so the failure is visible to the caller
        sys.exit(1)

# Run the full comparison pipeline when this cell/script is executed.
main()


2024-11-27 17:23:42,532 - INFO - Slack client initialized successfully.
2024-11-27 17:23:42,991 - INFO - BigQuery client initialized successfully.
2024-11-27 17:23:44,005 - INFO - Found 173 tables in dataset 'Impetus_dev_prod'.
2024-11-27 17:23:44,006 - INFO - Identified 2 common base names with 'master_hub_' and other specified prefixes.
2024-11-27 17:23:44,007 - INFO - Starting comparison for base table 'supplier': 'master_hub_supplier' vs 'procuro_supplier'.
2024-11-27 17:23:45,246 - INFO - Loaded data from table 'master_hub_supplier' into DataFrame.
2024-11-27 17:23:46,086 - INFO - Loaded data from table 'procuro_supplier' into DataFrame.
2024-11-27 17:23:46,099 - INFO - Standardized DataFrame for comparison.
2024-11-27 17:23:46,108 - INFO - Standardized DataFrame for comparison.
2024-11-27 17:23:46,111 - INFO - Filtered 'supplier' master table: 0 records excluded based on is_active = True.
2024-11-27 17:23:46,267 - INFO - Retrieved schema for table 'master_hub_supplier'.
2024-11-2

In [1]:
# Important / non-important column configuration added

import time
import os
import re
import logging
from datetime import datetime
import pandas as pd
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from google.cloud.bigquery import SchemaField
import traceback
import sys

# Configure logging: INFO level, timestamped "time - LEVEL - message" lines
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Configuration
PROJECT_ID = 'fynd-jio-impetus-non-prod'       # GCP project ID used for the BigQuery client and queries
DATASET_ID = 'Impetus_dev_sit'                 # BigQuery dataset that holds the master and target tables
# Target-table prefixes. NOTE(review): find_common_tables_with_master_hub accepts this
# list but derives the prefixes it checks from each BASE_TABLES entry's 'targets'.
PREFIXES = ['procuro_', 'costing_engine_', 'scan_pack_', 'pigeon_']

# Module-level accumulator of error-detail dicts returned by the individual checks
ERROR_LOG_M = []

# Mapping of base table names to their key columns in master and target tables.
#
# Each entry is keyed by the base table name and may contain:
#   'master_key'     - key column in the master table.
#   'targets'        - dict of target-table prefix -> key column in that target
#                      table; the target table name is '<prefix><base_name>'.
#   'master_table'   - (optional) explicit master table name; when omitted the
#                      master table is 'master_hub_<base_name>'.
#   'active_filter'  - (optional) {'column': ..., 'value': ...}; only master rows
#                      whose column equals the value are kept for comparison.
#   'perform_checks' - (optional, treated as True when absent) when False only the
#                      key comparison is run and the other checks are skipped.
BASE_TABLES = {
    'brand': {
        'master_key': 'code',
        'targets': {
            'procuro_': 'code',
            'costing_engine_': 'code'
        },
        'active_filter': {
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True  # Default behavior
    },
    'brand_pm_mapping': {
        'master_key': 'pm_id',
        'targets': {
            'costing_engine_': 'pm_id'
        },
        'perform_checks': True
    },
    'brick': {
        'master_key': 'brick_code',
        'targets': {
            'costing_engine_': 'code'
        },
        'perform_checks': True
    },
    'coe_bom_element_type_mapping': {
        'master_key': 'coe_name',
        'targets': {
            'costing_engine_': 'coe_name'
        },
        'perform_checks': True
    },
    # 'event_log': {
    #     'master_key': 'user_id',
    #     'targets': {
    #         'costing_engine_': 'user_id'
    #     },
    #     'perform_checks': True
    # },
    'supplier': {
        'master_key': 'supplier_code',
        'targets': {
            'procuro_': 'supplier_code',
            'costing_engine_': 'supplier_code'
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    'vendor_details': {  # Compared against the supplier master table
        'master_key': 'supplier_code',  # Using supplier_code as the key
        'master_table': 'master_hub_supplier',  # Specify the master table explicitly
        'targets': {
            'scan_pack_': 'vendor_code'
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    'hsn_tax_mapping': {  # Base table for HSN codes
        'master_key': 'hsn_code',  # Assuming 'hsn_code' is the key column
        'master_table': 'master_hub_hsn',
        'targets': {
            'procuro_': 'hsn_code',
        },
        'perform_checks': False  # Only perform key comparisons
    },
    'config_buyer_brand_mapping': {  # Compared against the buyer/brand mapping master table
        'master_key': 'id', 
        'master_table': 'master_hub_buyer_brand_mapping',  # Specify the master table explicitly
        'targets': {
            'costing_engine_': 'id'
        },
        'perform_checks': True
    },    
}

# Define Non-Important Columns: per base table, columns to leave out of the
# column checks when no important-column list is defined for that base table.
Non_imp_columns = {
    'supplier': ['id', '_id', 'updated_at', 'created_at'],
    'vendor_details': ['id', '_id', 'updated_at', 'created_at']  # Add if applicable
}

# Define Important Columns: per base table, the only columns to check. When a base
# table has an entry here it takes precedence over Non_imp_columns.
Imp_columns = {
    'brand': ['name', 'id', 'slug', 'code'],
    # Add more base tables and their important columns as needed
}

# Slack configuration
# The bot token is a credential and must not live in source control, so it is
# read from the SLACK_TOKEN environment variable instead.
# NOTE(review): the token previously hard-coded here should be treated as leaked
# and rotated in the Slack admin console.
SLACK_TOKEN = os.environ.get("SLACK_TOKEN")
# The channel ID is not a secret; keep the existing channel as the default and
# allow an override through the SLACK_CHANNEL environment variable.
SLACK_CHANNEL = os.environ.get("SLACK_CHANNEL", "C07UN19ETK5")

# Initialize Slack client; notifications are disabled when no token/channel is configured
if SLACK_TOKEN and SLACK_CHANNEL:
    slack_client = WebClient(token=SLACK_TOKEN)
    logging.info("Slack client initialized successfully.")
else:
    slack_client = None
    logging.warning("Slack token or channel not found. Slack notifications will be disabled.")

def get_bigquery_client(project_id):
    """
    Create a BigQuery client bound to the given project.

    Args:
        project_id (str): GCP project ID.

    Returns:
        bigquery.Client: An initialized BigQuery client.

    Raises:
        Exception: Whatever the client constructor raised; it is logged and
        then re-raised unchanged.
    """
    try:
        bq_client = bigquery.Client(project=project_id)
    except Exception as init_error:
        logging.error(f"Failed to initialize BigQuery client: {init_error}")
        raise
    else:
        logging.info("BigQuery client initialized successfully.")
        return bq_client

def find_common_tables_with_master_hub(client, dataset_name, prefixes, base_tables):
    """
    Find, for each configured base table, its master table and the target tables that exist in the dataset.

    The master table is the entry's 'master_table' if given, otherwise
    'master_hub_<base_name>'. Target tables are looked up as '<prefix><base_name>'
    for every prefix listed in the entry's 'targets'. Base tables whose master
    table is missing, or that have no existing target table, are left out.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset to search within.
        prefixes (list): List of prefixes to compare with 'master_hub_'.
            Currently unused: the prefixes come from each entry's 'targets'.
            Kept for interface compatibility.
        base_tables (dict): The BASE_TABLES dictionary containing base table configurations.

    Returns:
        dict: Maps base name -> {'master_hub_': <master table>, <prefix>: <target table>, ...}.
        An empty dict is returned when listing the tables fails.
    """
    try:
        # Reference the dataset
        dataset_ref = client.dataset(dataset_name)

        # List all tables in the dataset
        table_names = [table.table_id for table in client.list_tables(dataset_ref)]
        logging.info(f"Found {len(table_names)} tables in dataset '{dataset_name}'.")

        # Set for O(1) membership tests in the loops below
        existing_tables = set(table_names)

        # Dictionary to hold base names and their corresponding tables
        common_tables = {}
        for base_name, config in base_tables.items():
            # Determine the master table
            master_table = config.get('master_table', f'master_hub_{base_name}')
            if master_table not in existing_tables:
                logging.warning(f"Master table '{master_table}' for base '{base_name}' not found in dataset.")
                continue

            found_tables = {'master_hub_': master_table}
            # Check for target tables with specified prefixes (only the prefix is needed here)
            for prefix in config.get('targets', {}):
                target_table = f"{prefix}{base_name}"
                if target_table in existing_tables:
                    found_tables[prefix] = target_table
            common_tables[base_name] = found_tables

        # Filter out base names that only have 'master_hub_' but no other matching prefixes
        common_tables_with_prefixes = {base_name: tables for base_name, tables in common_tables.items() if len(tables) > 1}

        logging.info(f"Identified {len(common_tables_with_prefixes)} common base names with 'master_hub_' and other specified prefixes.")
        return common_tables_with_prefixes

    except GoogleAPIError as e:
        # Not every GoogleAPIError subclass defines .message; fall back to the
        # exception itself so the handler cannot raise AttributeError.
        logging.error(f"Google API Error: {getattr(e, 'message', e)}")
        return {}
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return {}

def get_table_schema(client, dataset_name, table_name):
    """
    Retrieve the schema of a specified BigQuery table.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        dict: A dictionary mapping column names to their data types (the
        field_type of each schema field). An empty dict is returned when the
        table cannot be fetched.
    """
    try:
        table_ref = client.dataset(dataset_name).table(table_name)
        table = client.get_table(table_ref)
        schema = {field.name: field.field_type for field in table.schema}
        logging.info(f"Retrieved schema for table '{table_name}'.")
        return schema
    except GoogleAPIError as e:
        # Not every GoogleAPIError subclass defines .message; fall back to the
        # exception itself so the handler cannot raise AttributeError.
        logging.error(f"Failed to retrieve schema for table '{table_name}': {getattr(e, 'message', e)}")
        return {}
    except Exception as e:
        logging.error(f"An unexpected error occurred while retrieving schema for table '{table_name}': {e}")
        return {}

def load_table_from_bigquery(client, dataset_name, table_name):
    """
    Load a table from BigQuery into a Pandas DataFrame.

    The table is read with a `SELECT *` query against the client's own project.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        pd.DataFrame: DataFrame containing the table data, or an empty
        DataFrame when the query fails.
    """
    try:
        # Use the project the client is bound to rather than a module global, so
        # the rows come from the same project as the schema lookups and uploads
        # made through this client.
        query = f"SELECT * FROM `{client.project}.{dataset_name}.{table_name}`"
        df = client.query(query).to_dataframe()
        logging.info(f"Loaded data from table '{table_name}' into DataFrame.")
        return df
    except GoogleAPIError as e:
        # Not every GoogleAPIError subclass defines .message; fall back to the
        # exception itself so the handler cannot raise AttributeError.
        logging.error(f"Failed to load table '{table_name}': {getattr(e, 'message', e)}")
        return pd.DataFrame()
    except Exception as e:
        logging.error(f"An unexpected error occurred while loading table '{table_name}': {e}")
        return pd.DataFrame()

def standardize_dataframe(df, exclude_columns=None):
    """
    Standardize string columns in the DataFrame by stripping whitespace and converting to lowercase,
    excluding specified columns.

    The input DataFrame is not modified; a copy is standardized and returned.

    Args:
        df (pd.DataFrame): The DataFrame to standardize.
        exclude_columns (list, optional): Columns to exclude from standardization.
            Defaults to None, meaning no column is excluded.

    Returns:
        pd.DataFrame: Standardized DataFrame.
    """
    # None instead of a mutable [] default, which would be shared across calls
    excluded = [] if exclude_columns is None else exclude_columns

    df_copy = df.copy()
    for col in df_copy.columns:
        if col in excluded:
            continue  # Skip standardizing this column
        if pd.api.types.is_string_dtype(df_copy[col]):
            # NOTE(review): astype(str) turns missing values into literal text
            # (e.g. 'nan' / 'none') — confirm later null checks account for this.
            df_copy[col] = df_copy[col].astype(str).str.strip().str.lower()
    logging.info("Standardized DataFrame for comparison.")
    return df_copy

def find_common_and_non_common_columns(df1, df2):
    """
    Identify common and unique columns between two DataFrames.

    The returned lists are deterministic: common and df1-only columns follow
    df1's column order, df2-only columns follow df2's column order, and each
    column name appears at most once.

    Args:
        df1 (pd.DataFrame): First DataFrame.
        df2 (pd.DataFrame): Second DataFrame.

    Returns:
        tuple: (common_columns, df1_unique_columns, df2_unique_columns)
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; building the
    # results from raw sets made the order vary between runs.
    df1_columns = list(dict.fromkeys(df1.columns))
    df2_columns = list(dict.fromkeys(df2.columns))
    in_df1 = set(df1_columns)
    in_df2 = set(df2_columns)

    common_columns = [col for col in df1_columns if col in in_df2]
    df1_unique_columns = [col for col in df1_columns if col not in in_df2]
    df2_unique_columns = [col for col in df2_columns if col not in in_df1]
    logging.info(f"Found {len(common_columns)} common columns, {len(df1_unique_columns)} unique to first table, {len(df2_unique_columns)} unique to second table.")
    return common_columns, df1_unique_columns, df2_unique_columns

def _values_differ(val_master, val_target):
    """
    Decide whether two cell values should be reported as a mismatch.

    Two missing values count as equal, and a missing value never equals a
    present one. Non-scalar values (lists, arrays) are never treated as missing.
    """
    master_missing = pd.api.types.is_scalar(val_master) and pd.isna(val_master)
    target_missing = pd.api.types.is_scalar(val_target) and pd.isna(val_target)
    if master_missing or target_missing:
        return not (master_missing and target_missing)
    try:
        return bool(val_master != val_target)
    except (ValueError, TypeError):
        # Array-like values compare element-wise and have no single truth value;
        # fall back to comparing their printed form.
        return str(val_master) != str(val_target)


def find_mismatches(df_master, df_target, columns_to_check, master_key, target_key, table1, table2, duplicates_master, duplicates_target, non_imp_columns):
    """
    Identify mismatches between two DataFrames based on specified columns and key columns.

    Both tables are de-duplicated on the key (first occurrence kept), inner-joined on it,
    and each requested column is compared row by row. Columns starting with '_boltic_',
    columns listed in non_imp_columns, and columns that do not exist in both tables are skipped.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
        df_target (pd.DataFrame): Target DataFrame (prefixed table).
        columns_to_check (list): List of columns to apply mismatch checks.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        table1 (str): Name of the source table.
        table2 (str): Name of the target table.
        duplicates_master (pd.DataFrame): Duplicate keys in master table (not used here).
        duplicates_target (pd.DataFrame): Duplicate keys in target table (not used here).
        non_imp_columns (list): List of non-important columns to exclude.

    Returns:
        tuple: (mismatches, error_logs_m) - a list of mismatch dicts and the matching
        list of error-log dicts. Both are empty when a key column is missing.
    """
    mismatches = []
    error_logs_m = []
    # Ensure key columns are present in both DataFrames
    if master_key not in df_master.columns or target_key not in df_target.columns:
        logging.error(f"Key columns '{master_key}' or '{target_key}' not found in the respective tables.")
        return mismatches, error_logs_m

    # Rename target key to match master key for easier comparison
    df_target_renamed = df_target.rename(columns={target_key: master_key})

    # Merge DataFrames on the master_key, excluding duplicates
    merged_df = pd.merge(
        df_master.drop_duplicates(subset=master_key),
        df_target_renamed.drop_duplicates(subset=master_key),
        on=master_key,
        suffixes=(f'_{table1}', f'_{table2}'),
        how='inner'
    )

    logging.info(f"Merged DataFrame has {len(merged_df)} records for mismatch comparison.")

    # The merge renames every non-key column present in both tables to
    # '<column>_<table1>' / '<column>_<table2>', so a column is comparable only
    # when BOTH suffixed names exist. Testing for the bare column name (as was
    # done previously) skipped every comparable column.
    comparable_columns = []
    for column in columns_to_check:
        if column.startswith('_boltic_') or column in non_imp_columns:
            continue  # Skip columns starting with '_boltic_' or non-important columns
        master_col = f"{column}_{table1}"
        target_col = f"{column}_{table2}"
        if master_col in merged_df.columns and target_col in merged_df.columns:
            comparable_columns.append((column, master_col, target_col))

    for _, row in merged_df.iterrows():
        key = row[master_key]
        for column, master_col, target_col in comparable_columns:
            val_master = row[master_col]
            val_target = row[target_col]
            if not _values_differ(val_master, val_target):
                continue
            mismatches.append({
                master_key: key,
                'column': column,
                f'{table1}_value': val_master,
                f'{table2}_value': val_target
            })
            error_logs_m.append({
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'issue': 'mismatch',
                'error_message': '',
                'source_table': table1,
                'target_table': table2,
                'issue_column': column,
                'unique_identifier': f'{master_key}: {key}'
            })

    logging.info(f"Found {len(mismatches)} mismatches between '{table1}' and '{table2}'.")
    return mismatches, error_logs_m

def find_duplicates(df, key_column, table_name):
    """
    Detect duplicate key_column entries in the DataFrame and identify differences.

    For every key that occurs more than once, the rows sharing that key are compared
    on all columns except the key itself and columns starting with '_boltic_'.
    A missing value next to a real value counts as a difference.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        key_column (str): The key column to check for duplicates.
        table_name (str): Name of the table being checked.

    Returns:
        tuple: (duplicate_records_df, error_logs_m) - one row / one log entry per duplicated key.
        An empty DataFrame and empty list are returned when key_column is missing.
    """
    if key_column not in df.columns:
        logging.error(f"Key column '{key_column}' not found in DataFrame.")
        return pd.DataFrame(), []

    # Get all duplicate entries (keep=False to get all duplicates)
    duplicates_df = df[df.duplicated(subset=key_column, keep=False)]

    # Columns that take part in the row comparison (same for every group).
    compare_columns = [
        col for col in duplicates_df.columns
        if col != key_column and not col.startswith('_boltic_')
    ]

    duplicate_records = []
    error_logs_m = []

    for key, group in duplicates_df.groupby(key_column):
        # A column differs within the group when it holds more than one distinct
        # value. Identical rows give exactly 1 per column (never 0), so the test
        # must be "> 1" rather than "sum == 0".
        distinct_counts = group[compare_columns].nunique(dropna=False)
        cols_with_diff = distinct_counts[distinct_counts > 1].index.tolist()

        if cols_with_diff:
            difference = "Difference in value of columns: " + ', '.join(cols_with_diff)
        else:
            difference = "No difference exists"

        duplicate_records.append({
            key_column: key,
            'Difference in value': difference
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'duplicate',
            'error_message': f'{difference}',
            'source_table': f'{table_name}',
            'target_table': '',
            'issue_column': '',
            'unique_identifier': f'{key_column}: {key}'
        })

    logging.info(f"Found {len(duplicate_records)} duplicate entries based on '{key_column}'.")
    return pd.DataFrame(duplicate_records), error_logs_m

def validate_data_types(schema_master, schema_target, master_key, table1_name, table2_name, columns_to_check):
    """
    Compare data types of specified columns between master and target schemas.

    Only columns named in columns_to_check that exist in both schemas are compared.
    Issues are reported in the order the columns appear in columns_to_check, so the
    output is stable from run to run.

    Args:
        schema_master (dict): Schema of the master table (column name -> data type).
        schema_target (dict): Schema of the target table (column name -> data type).
        master_key (str): The key column for reference (not used in the comparison).
        table1_name (str): Name of the first table.
        table2_name (str): Name of the second table.
        columns_to_check (list): List of columns to validate data types.

    Returns:
        tuple: (data_type_issues_df, error_logs_m)
    """
    data_type_issues = []
    error_logs_m = []

    # dict.fromkeys removes repeated names while preserving the caller's order.
    for column in dict.fromkeys(columns_to_check):
        if column not in schema_master or column not in schema_target:
            continue  # Only columns present in both schemas can be compared
        type_master = schema_master[column]
        type_target = schema_target[column]
        if type_master == type_target:
            continue
        data_type_issues.append({
            'column_name': column,
            f'{table1_name}_data_type': type_master,
            f'{table2_name}_data_type': type_target
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'data_type_issues',
            'error_message': f'{table1_name}_data_type: {type_master} , {table2_name}_data_type: {type_target}',
            'source_table': table1_name,
            'target_table': table2_name,
            'issue_column': column,
            'unique_identifier': ''
        })

    logging.info(f"Found {len(data_type_issues)} data type issues.")
    return pd.DataFrame(data_type_issues), error_logs_m

def validate_formats(df_master, df_target, key_column, target_key, target_table, master_table, columns_to_check):
    """
    Validate specific column formats using regular expressions and include corresponding target table values.

    Three columns are validated when they are both requested in columns_to_check and present
    in df_master: 'gstin', 'email' and 'pincode'. For every failing value, the value of the
    same column in the first target row with the same (stringified, stripped) key is attached.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        master_table (str): The name of the master table.
        columns_to_check (list): List of columns to validate formats.

    Returns:
        tuple: (format_issues_df, error_logs_m)
    """
    result_columns = [key_column, 'column', 'value', 'issue', f'{target_table}_value']
    error_logs_m = []
    issue_rows = []

    # (column, compiled pattern, issue text), applied to each row in this order.
    format_rules = [
        ('gstin', re.compile(r'^[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z]{1}[A-Z0-9]{3}$'), 'Invalid GSTIN format'),
        ('email', re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'), 'Invalid email format'),
        ('pincode', re.compile(r'^\d{6}$'), 'Pincode must be exactly 6 digits'),
    ]
    active_rules = [
        rule for rule in format_rules
        if rule[0] in columns_to_check and rule[0] in df_master.columns
    ]

    if active_rules:
        # Map each normalised target key to the position of its first row, once,
        # instead of re-normalising and scanning the whole target per failing value.
        # A target without the key column simply yields no matches.
        first_target_pos = {}
        if target_key in df_target.columns:
            for pos, normalised_key in enumerate(df_target[target_key].astype(str).str.strip()):
                first_target_pos.setdefault(normalised_key, pos)

        for _, row in df_master.iterrows():
            key_value = str(row[key_column]).strip()

            for column, pattern, issue_text in active_rules:
                raw_value = row[column]
                if pattern.match(str(raw_value).strip()):
                    continue

                # Fetch corresponding target value
                pos = first_target_pos.get(key_value)
                if pos is None:
                    target_value = f"'{target_key}' not present"
                elif column in df_target.columns:
                    target_value = df_target.iloc[pos][column]
                else:
                    target_value = "Column not present"

                issue_rows.append({
                    key_column: key_value,
                    'column': column,
                    'value': raw_value,
                    'issue': issue_text,
                    f'{target_table}_value': target_value
                })
                error_logs_m.append({
                    'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'issue': 'format_issue',
                    'error_message': issue_text,
                    'source_table': master_table,
                    'target_table': '',
                    'issue_column': column,
                    'unique_identifier': f'{key_column}: {key_value}'
                })

    # Build the frame once; growing it with pd.concat per issue is quadratic.
    format_issues = pd.DataFrame(issue_rows, columns=result_columns)
    logging.info(f"Found {len(format_issues)} format issues.")
    return format_issues, error_logs_m

def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
    """
    Validate pincode mapping by comparing with the all_india_po_list reference table.
    For every master row whose pincode/city/state combination is not found in the reference,
    an issue is recorded together with the pincode, state and city of the corresponding target row.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        client (bigquery.Client): Initialized BigQuery client.
        master_table (str): Name of the master table.

    Returns:
        tuple: (pincode_mapping_issues_df, error_logs_m). An empty DataFrame and empty list are
        returned when df_master lacks 'pincode', 'city' or 'state', or when the reference table
        cannot be loaded.
    """
    error_logs_m = []

    # Check the cheap precondition first so that no reference query is issued
    # for tables that cannot be validated anyway.
    required_columns = {'pincode', 'city', 'state'}
    if not required_columns.issubset(df_master.columns):
        logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
        return pd.DataFrame(), error_logs_m

    # Read the reference table from Analytics dataset
    try:
        reference_table = "all_india_po_list"
        reference_dataset = "analytics_data"
        query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
        reference_df = client.query(query).to_dataframe()
        reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
        reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
        reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
        logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
    except Exception as e:
        logging.error(f"Failed to load reference pincode mapping: {e}")
        return pd.DataFrame(), error_logs_m

    result_columns = [
        key_column, 'pincode', 'state', 'city', 'issue',
        f'{target_table}_details'
    ]
    issue_rows = []

    # Map each normalised target key to the position of its first row, once,
    # instead of scanning the whole target for every master row.
    # A target without the key column simply yields no matches.
    first_target_pos = {}
    if target_key in df_target.columns:
        for pos, normalised_key in enumerate(df_target[target_key].astype(str).str.strip()):
            first_target_pos.setdefault(normalised_key, pos)

    def describe_target(key_value):
        """Return the target-side pincode/state/city summary for a master key."""
        pos = first_target_pos.get(key_value)
        if pos is None:
            return f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."
        target_row = df_target.iloc[pos]
        target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
        target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
        target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
        return f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"

    def record_issue(key_value, pincode, state, city, issue):
        """Append one issue row and its error-log entry."""
        # Target details are only looked up for rows that actually have an issue.
        target_details = describe_target(key_value)
        issue_rows.append({
            key_column: key_value,
            'pincode': pincode,
            'state': state,
            'city': city,
            'issue': issue,
            f'{target_table}_details': target_details
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'pincode_mapping',
            'error_message': f"{issue}. {target_table} Details: {target_details}",
            'source_table': master_table,
            'target_table': target_table,
            'issue_column': 'pincode',
            'unique_identifier': f'{key_column}: {key_value}'
        })

    # Iterate over each row in df_master to validate pincode mapping
    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()
        pincode = str(row['pincode']).strip()
        city = str(row['city']).strip().lower()
        state = str(row['state']).strip().lower()

        # Check if pincode exists in reference
        ref_matches = reference_df[reference_df['pincode'] == pincode]
        if ref_matches.empty:
            record_issue(key_value, pincode, state, city, f"Invalid pincode ({pincode}).")
            continue

        # Check if any of the reference entries match both the city and state
        exact_match = ref_matches[
            (ref_matches['city'] == city) & (ref_matches['state'] == state)
        ]
        if not exact_match.empty:
            continue  # No issue, mapping is correct

        state_matches = ref_matches[ref_matches['state'] == state]
        city_matches = ref_matches[ref_matches['city'] == city]

        if state_matches.empty and city_matches.empty:
            # Both state and city do not match
            expected_entries = ref_matches[['state', 'city']].drop_duplicates()
            expected_states_str = ', '.join(expected_entries['state'].tolist())
            expected_cities_str = ', '.join(expected_entries['city'].tolist())
            issue = f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
        elif state_matches.empty:
            # State does not match
            expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
            issue = f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
        elif city_matches.empty:
            # City does not match; expected cities are limited to the matching state
            expected_cities_str = ', '.join(state_matches['city'].unique().tolist())
            issue = f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
        else:
            # State and city each match some reference row, but never the same one
            issue = f"Pincode {pincode} has a mapping inconsistency."

        record_issue(key_value, pincode, state, city, issue)

    # Build the frame once; growing it with pd.concat per issue is quadratic.
    pincode_mapping_issues = pd.DataFrame(issue_rows, columns=result_columns)
    logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
    return pincode_mapping_issues, error_logs_m

def create_table(doc, data, column_names):
    """
    Render a list of dictionaries as a styled table appended to a docx document.

    Nothing is added when data is empty. Each cell holds the stripped string form
    of the dictionary value for that column; missing keys produce an empty cell.

    Args:
        doc (Document): The Word document object.
        data (list or list of dict): Data to populate the table.
        column_names (list): List of column names for the table headers.
    """
    if not data:
        return

    table = doc.add_table(rows=1, cols=len(column_names))
    table.style = 'Light List Accent 1'

    # Header row: one cell per column name, in order.
    for header_cell, heading in zip(table.rows[0].cells, column_names):
        header_cell.text = heading

    # One body row per record.
    for record in data:
        body_cells = table.add_row().cells
        for position, heading in enumerate(column_names):
            body_cells[position].text = str(record.get(heading, '')).strip()

    logging.info("Added table to the Word document.")

def add_non_matching_keys_section(doc, df1_only_keys, table1_name, df2_only_keys, table2_name, key_column_master, key_column_target):
    """
    Write the non-matching-keys part of the report for one pair of tables.

    A heading plus a one-column table is added for each side that has keys the
    other side lacks; when neither side has any, a single explanatory paragraph
    is added instead.

    Args:
        doc (Document): The Word document object.
        df1_only_keys (list): Keys present only in table1.
        table1_name (str): Name of the first table.
        df2_only_keys (list): Keys present only in table2.
        table2_name (str): Name of the second table.
        key_column_master (str): The key column in the master table.
        key_column_target (str): The key column in the target table.
    """
    if not df1_only_keys and not df2_only_keys:
        doc.add_paragraph("No non-matching keys found.")
        return

    if df1_only_keys:
        master_heading = f"'{key_column_master}' present only in '{table1_name}' and not in '{table2_name}' ({len(df1_only_keys)})"
        doc.add_heading(master_heading, level=2)
        master_rows = [{key_column_master: entry[key_column_master]} for entry in df1_only_keys]
        create_table(doc, master_rows, [key_column_master])

    if df2_only_keys:
        target_heading = f"'{key_column_target}' present only in '{table2_name}' and not in '{table1_name}' ({len(df2_only_keys)})"
        doc.add_heading(target_heading, level=2)
        target_rows = [{key_column_target: entry[key_column_target]} for entry in df2_only_keys]
        create_table(doc, target_rows, [key_column_target])

def add_table_of_contents(doc):
    """
    Append a Table of Contents field to the Word document.

    A new paragraph with a single run is added, and the run receives the raw
    field elements in order: field begin, the TOC instruction text, field
    separate, field end.

    Args:
        doc (Document): The Word document object.
    """
    def _field_char(char_type):
        # Build one <w:fldChar> element carrying the given fldCharType.
        element = OxmlElement('w:fldChar')
        element.set(qn('w:fldCharType'), char_type)
        return element

    toc_run = doc.add_paragraph().add_run()

    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = 'TOC \\o "1-2" \\h \\z \\u'  # change to what you need

    field_parts = (
        _field_char('begin'),
        instruction,
        _field_char('separate'),
        _field_char('end'),
    )
    for part in field_parts:
        toc_run._r.append(part)

    logging.info("Added Table of Contents to the Word document.")

def create_aggregated_document(all_results, base_name):
    """
    Build one Word report covering every comparison result of a base table and save it.

    The document starts with a title, generation time, a note on refreshing the
    Table of Contents and the TOC itself. Each comparison then gets its own level-1
    heading followed by ten level-2 sections (mismatches, nulls, duplicates, data
    types, formats, pincode mapping, keys only in either table). A section shows a
    counted heading plus a table when it has entries, or a "nothing found" heading.

    Args:
        all_results (list): List of comparison result dictionaries.
        base_name (str): The base name of the table.

    Returns:
        str: The filepath of the saved report (written to the current directory).
    """
    doc = Document()
    doc.add_heading(f'{base_name.capitalize()} Tables Comparison Report', level=0)
    doc.add_paragraph(f'Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')

    # Add Instruction for TOC Update
    doc.add_paragraph(
        "📌 **Note:** To update the Table of Contents and make the links clickable, go to the ‘References’ tab and click ‘Update Table’ or press F9 in Windows and Fn+F9 in Mac after opening this document in Microsoft Word.",
        style='Intense Quote'
    )

    # Add Table of Contents
    doc.add_heading('Table of Contents', level=1)
    add_table_of_contents(doc)
    doc.add_page_break()

    def _list_section(entries, title, columns, empty_heading):
        # Section backed by a plain list of dicts.
        if not entries:
            doc.add_heading(empty_heading, level=2)
            return
        doc.add_heading(f'{title} ({len(entries)})', level=2)
        create_table(doc, entries, columns)

    def _frame_section(frame, title, columns, empty_heading):
        # Section backed by a DataFrame; rows are converted to dicts for rendering.
        if frame.empty:
            doc.add_heading(empty_heading, level=2)
            return
        doc.add_heading(f'{title} ({len(frame)})', level=2)
        create_table(doc, frame.to_dict('records'), columns)

    for result in all_results:
        table1_name = result['table1_name']
        table2_name = result['table2_name']
        key_column_master = result['key_column_master']
        key_column_target = result['key_column_target']
        doc.add_heading(f'Comparison: {table1_name} vs {table2_name}', level=1)

        # Mismatches
        _list_section(
            result['mismatches'],
            'Mismatches',
            [key_column_master, 'column', f'{table1_name}_value', f'{table2_name}_value'],
            "No mismatches found.",
        )

        # Null values in master table
        _frame_section(
            result['null_values_master'],
            f'Null values in {table1_name}',
            [key_column_master, 'column', table2_name],
            f"No null values found in {table1_name}.",
        )

        # Null values in target table
        _frame_section(
            result['null_values_target'],
            f'Null values in {table2_name}',
            [key_column_target, 'column', table1_name],
            f"No null values found in {table2_name}.",
        )

        # Duplicate keys in master table
        _frame_section(
            result['duplicates_master'],
            f'Duplicate Keys in {table1_name}',
            [key_column_master, 'Difference in value'],
            "No duplicate keys found in master table.",
        )

        # Duplicate keys in target table
        _frame_section(
            result['duplicates_target'],
            f'Duplicate Keys in {table2_name}',
            [key_column_target, 'Difference in value'],
            f"No duplicate keys found in {table2_name}.",
        )

        # Data type issues
        _frame_section(
            result['data_type_issues'],
            'Data Type Issues',
            ['column_name', f'{table1_name}_data_type', f'{table2_name}_data_type'],
            "No data type issues found.",
        )

        # Format issues in master table with target values
        _frame_section(
            result['format_issues_master'],
            f'Format Issues in {table1_name}',
            [key_column_master, 'column', 'value', 'issue', f'{table2_name}_value'],
            f"No format issues found in {table1_name}.",
        )

        # Pincode Mapping Issues with target details
        _frame_section(
            result['pincode_mapping_issues'],
            f'Pincode Mapping Issues in {table1_name}',
            [key_column_master, 'pincode', 'state', 'city', 'issue', f'{table2_name}_details'],
            "No pincode mapping issues found.",
        )

        # Non-matching keys in master DataFrame
        _list_section(
            result['df_master_only_keys'],
            f'Keys only in {table1_name}',
            [key_column_master],
            f"No keys found only in {table1_name}.",
        )

        # Non-matching keys in target DataFrame
        _list_section(
            result['df_target_only_keys'],
            f'Keys only in {table2_name}',
            [key_column_target],
            f"No keys found only in {table2_name}.",
        )

        doc.add_page_break()  # Keep each comparison on its own page(s)

    # Save the aggregated document to the current directory
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_filename = f"{base_name}_comparison_report_aggregated_{timestamp}.docx"
    doc.save(report_filename)
    logging.info(f"Saved aggregated comparison report as '{report_filename}'.")

    return report_filename  # Return the filename for further processing

def send_slack_alert(message):
    """
    Post a text message to the configured Slack channel.

    When no Slack client is available the call is skipped with a warning.
    Slack API errors are logged and not re-raised.

    Args:
        message (str): The message to send.
    """
    if slack_client:
        try:
            post_result = slack_client.chat_postMessage(
                channel=SLACK_CHANNEL,
                text=message
            )
            logging.info(f"Message sent to {SLACK_CHANNEL}: {post_result['ts']}")
        except SlackApiError as e:
            logging.error(f"Error sending message to Slack: {e.response['error']}")
    else:
        logging.warning("Slack client is not initialized. Skipping Slack notification.")

def upload_file_to_slack(filepath, title=None):
    """
    Upload a file to the configured Slack channel using files_upload_v2.

    When no Slack client is available the call is skipped with a warning.
    Slack API errors and any other exception raised during the upload are
    logged and not re-raised.

    Args:
        filepath (str): The path to the file to upload.
        title (str, optional): The title for the uploaded file. Defaults to the file's basename.
    """
    if not slack_client:
        logging.warning("Slack client is not initialized. Skipping file upload.")
        return

    try:
        with open(filepath, 'rb') as file_handle:
            upload_kwargs = {
                'channel': SLACK_CHANNEL,
                'file': file_handle,
                # Explicitly set the filename with extension
                'filename': os.path.basename(filepath),
                # Fall back to the basename when no title is given
                'title': title if title else os.path.basename(filepath),
                # Comment posted alongside the file
                'initial_comment': title if title else "File uploaded.",
            }
            response = slack_client.files_upload_v2(**upload_kwargs)

        # Verify if the upload was successful
        if not response.get('ok'):
            logging.error(f"Failed to upload file to Slack: {response}")
        else:
            file_permalink = response['file']['permalink']
            logging.info(f"File uploaded to Slack channel '{SLACK_CHANNEL}': {file_permalink}")
    except SlackApiError as e:
        logging.error(f"Slack API Error during file upload: {e.response['error']}")
    except Exception as e:
        logging.error(f"Unexpected error during file upload: {e}")

def find_non_matching_keys(df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table):
    """
    Identify keys present in df_master but not in df_target and vice versa.

    Keys are compared as stripped strings. Results are sorted by their string
    form so that the report and the error log come out in the same order on
    every run (plain set iteration order is not stable between runs).

    Args:
        df_master (pd.DataFrame): Source DataFrame.
        df_target (pd.DataFrame): Target DataFrame.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        duplicates_master (pd.DataFrame): Unused here; kept for interface compatibility.
        duplicates_target (pd.DataFrame): Unused here; kept for interface compatibility.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.

    Returns:
        tuple: (master_only_keys, target_only_keys, error_logs_m) where the
        first two are lists of single-entry dicts ({key_column: key}) and the
        third is a list of error-log dicts (master-only entries first).
    """
    keys_master = set(df_master[master_key].astype(str).str.strip())
    keys_target = set(df_target[target_key].astype(str).str.strip())

    # key=str keeps the sort safe even if a missing value survives astype(str)
    master_only = sorted(keys_master - keys_target, key=str)
    target_only = sorted(keys_target - keys_master, key=str)

    logging.info(f"Found {len(master_only)} keys in source not in target and {len(target_only)} keys in target not in source.")

    # Convert to list of dictionaries for consistency
    master_only_keys = [{master_key: key} for key in master_only]
    target_only_keys = [{target_key: key} for key in target_only]

    def missing_key_entry(key_column, key, present_in, missing_in):
        # One error-log record for a key found only in `present_in`
        return {
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'missing_key',
            'error_message': f"Key '{key_column}' with value '{key}' is present only in '{present_in}' and missing in '{missing_in}'.",
            'source_table': present_in,
            'target_table': missing_in,
            'issue_column': key_column,
            'unique_identifier': f"{key_column}: {key}"
        }

    error_logs_m = [missing_key_entry(master_key, key, master_table, target_table) for key in master_only]
    error_logs_m.extend(missing_key_entry(target_key, key, target_table, master_table) for key in target_only)

    return master_only_keys, target_only_keys, error_logs_m

def _collect_null_records(df_scan, df_other, scan_key, other_key, scan_table, other_table, columns_to_check):
    """Collect null cells of df_scan, pairing each with the counterpart value from df_other."""
    records = []
    errors = []

    # Only columns that exist in the scanned frame can be inspected; the key
    # column and '_boltic_' columns are never reported.
    inspect_columns = [
        column for column in columns_to_check
        if column in df_scan.columns
        and column != scan_key
        and not column.startswith('_boltic_')
    ]
    if not inspect_columns:
        return records, errors

    null_rows = df_scan[df_scan[inspect_columns].isnull().any(axis=1)]
    if null_rows.empty:
        return records, errors

    # Position of the first row for every stripped key of the other table,
    # built once instead of rescanning the frame for every null cell.
    first_position = {}
    for position, key in enumerate(df_other[other_key].astype(str).str.strip()):
        first_position.setdefault(key, position)

    for _, row in null_rows.iterrows():
        key_value = str(row[scan_key]).strip()
        for column in inspect_columns:
            if not pd.isnull(row[column]):
                continue
            position = first_position.get(key_value)
            if position is None:
                other_value = f"'{other_key}' not present"
            elif column in df_other.columns:
                other_value = df_other.iloc[position][column]
            else:
                other_value = "Column not present"
            records.append({
                scan_key: key_value,
                'column': column,
                other_table: other_value
            })
            errors.append({
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'issue': 'null',
                'error_message': 'Null in columns',
                'source_table': scan_table,
                'target_table': '',
                'issue_column': column,
                'unique_identifier': f'{scan_key} : {key_value}'
            })
    return records, errors


def find_detailed_nulls(df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check):
    """
    Identify null values in both master and target tables for the specified
    columns and fetch the corresponding value from the other table, or a
    marker when the key or the column is missing there.

    Columns listed in columns_to_check that do not exist in a given table are
    skipped for that table instead of raising a KeyError. The key column and
    columns starting with '_boltic_' are never reported.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master table).
        df_target (pd.DataFrame): Target DataFrame.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.
        columns_to_check (list): List of columns to check for null values.

    Returns:
        tuple: (null_values_master, null_values_target, error_logs_m). The
        first two are lists of dicts with the key, the null column and the
        counterpart value (stored under the other table's name); the third is
        the list of error-log dicts, master entries first.
    """
    null_values_master, master_errors = _collect_null_records(
        df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check
    )
    null_values_target, target_errors = _collect_null_records(
        df_target, df_master, target_key, master_key, target_table, master_table, columns_to_check
    )
    error_logs_m = master_errors + target_errors

    logging.info(f"Found {len(null_values_master)} null values in master table '{master_table}'.")
    logging.info(f"Found {len(null_values_target)} null values in target table '{target_table}'.")
    return null_values_master, null_values_target, error_logs_m

def _describe_pincode_mismatch(ref_matches, pincode, city, state):
    """Return the issue text for a known pincode whose city/state disagree with the reference rows, or None when they agree."""
    same_state = ref_matches[ref_matches['state'] == state]
    same_city = ref_matches[ref_matches['city'] == city]

    exact_match = ref_matches[(ref_matches['city'] == city) & (ref_matches['state'] == state)]
    if not exact_match.empty:
        return None  # mapping is correct

    if same_state.empty and same_city.empty:
        # Both state and city do not match
        expected_entries = ref_matches[['state', 'city']].drop_duplicates()
        expected_states_str = ', '.join(expected_entries['state'].tolist())
        expected_cities_str = ', '.join(expected_entries['city'].tolist())
        return f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
    if same_state.empty:
        expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
        return f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
    if same_city.empty:
        # Expected cities are limited to the reference rows of the given state
        expected_cities_str = ', '.join(same_state['city'].unique().tolist())
        return f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
    # State and city each appear for this pincode, but never on the same row
    return f"Pincode {pincode} has a mapping inconsistency."


def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
    """
    Validate the pincode/city/state of every master row against the
    all_india_PO_list reference table and attach the matching target row's
    pincode details to each issue found.

    The reference table is read from the `Analytics` dataset of the
    module-level `PROJECT_ID`. The required-column check runs before the
    query so that no query is issued for tables that cannot be validated.
    Issues are collected in a list and turned into a DataFrame once, rather
    than concatenating a DataFrame per row.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        client (bigquery.Client): Initialized BigQuery client.
        master_table (str): Name of the master table.

    Returns:
        tuple: (pincode_mapping_issues_df, error_logs_m). The DataFrame is
        empty (no columns) when validation could not run, i.e. the master
        table lacks pincode/city/state or the reference table failed to load.
    """
    error_logs_m = []

    # Check if df_master has 'pincode', 'city', 'state' columns
    required_columns = {'pincode', 'city', 'state'}
    if not required_columns.issubset(df_master.columns):
        logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
        return pd.DataFrame(), error_logs_m

    # Read the reference table from Analytics dataset
    try:
        reference_table = "all_india_PO_list"
        reference_dataset = "Analytics"
        query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
        reference_df = client.query(query).to_dataframe()
        reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
        reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
        reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
        logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
    except Exception as e:
        logging.error(f"Failed to load reference pincode mapping: {e}")
        return pd.DataFrame(), error_logs_m

    details_column = f'{target_table}_details'
    # dict.fromkeys drops a repeated name (e.g. key_column == 'pincode') while keeping order
    issue_columns = list(dict.fromkeys(
        [key_column, 'pincode', 'state', 'city', 'issue', details_column]
    ))

    # Index both lookups once instead of filtering whole frames per master row
    reference_by_pincode = {
        pin: rows for pin, rows in reference_df.groupby('pincode', sort=False)
    }
    first_target_position = {}
    if not df_master.empty:
        for position, key in enumerate(df_target[target_key].astype(str).str.strip()):
            first_target_position.setdefault(key, position)

    issue_rows = []
    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()
        pincode = str(row['pincode']).strip()
        city = str(row['city']).strip().lower()
        state = str(row['state']).strip().lower()

        # Describe the corresponding target row, if the key exists there
        position = first_target_position.get(key_value)
        if position is not None:
            target_row = df_target.iloc[position]
            target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
            target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
            target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
            target_details = f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"
        else:
            target_details = f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."

        ref_matches = reference_by_pincode.get(pincode)
        if ref_matches is None:
            issue = f"Invalid pincode ({pincode})."
        else:
            issue = _describe_pincode_mismatch(ref_matches, pincode, city, state)
            if issue is None:
                continue  # No issue, mapping is correct

        issue_rows.append({
            key_column: key_value,
            'pincode': pincode,
            'state': state,
            'city': city,
            'issue': issue,
            details_column: target_details
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'pincode_mapping',
            'error_message': f"{issue}. {target_table} Details: {target_details}",
            'source_table': master_table,
            'target_table': target_table,
            'issue_column': 'pincode',
            'unique_identifier': f'{key_column}: {key_value}'
        })

    pincode_mapping_issues = pd.DataFrame(issue_rows, columns=issue_columns)
    logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
    return pincode_mapping_issues, error_logs_m

def create_table(doc, data, column_names):
    """
    Append a table to a docx document, one row per record plus a header row.

    Nothing is added when `data` is empty/falsy.

    Args:
        doc (Document): The Word document object.
        data (list of dict): Records to render; each value is looked up by
            column name, missing names render as an empty cell.
        column_names (list): Column names used as headers and lookup keys.
    """
    if not data:
        return

    table = doc.add_table(rows=1, cols=len(column_names))
    table.style = 'Light List Accent 1'

    # Header row
    for header_cell, heading in zip(table.rows[0].cells, column_names):
        header_cell.text = heading

    # One table row per record, values rendered as stripped text
    for record in data:
        new_cells = table.add_row().cells
        for cell, heading in zip(new_cells, column_names):
            cell.text = str(record.get(heading, '')).strip()

    logging.info("Added table to the Word document.")

def add_non_matching_keys_section(doc, df1_only_keys, table1_name, df2_only_keys, table2_name, key_column_master, key_column_target):
    """
    Write the non-matching-keys section for a pair of tables into the document.

    For each side that has unmatched keys, a level-2 heading with the count
    is added, followed by a one-column table of the keys. When neither side
    has any, a single paragraph saying so is added instead.

    Args:
        doc (Document): The Word document object.
        df1_only_keys (list): Keys present only in table1 (dicts keyed by key_column_master).
        table1_name (str): Name of the first table.
        df2_only_keys (list): Keys present only in table2 (dicts keyed by key_column_target).
        table2_name (str): Name of the second table.
        key_column_master (str): The key column in the master table.
        key_column_target (str): The key column in the target table.
    """
    if not (df1_only_keys or df2_only_keys):
        doc.add_paragraph("No non-matching keys found.")
        return

    if df1_only_keys:
        heading = f"'{key_column_master}' present only in '{table1_name}' and not in '{table2_name}' ({len(df1_only_keys)})"
        doc.add_heading(heading, level=2)
        master_rows = [{key_column_master: entry[key_column_master]} for entry in df1_only_keys]
        create_table(doc, master_rows, [key_column_master])

    if df2_only_keys:
        heading = f"'{key_column_target}' present only in '{table2_name}' and not in '{table1_name}' ({len(df2_only_keys)})"
        doc.add_heading(heading, level=2)
        target_rows = [{key_column_target: entry[key_column_target]} for entry in df2_only_keys]
        create_table(doc, target_rows, [key_column_target])

def add_table_of_contents(doc):
    """
    Append a Table of Contents field to the Word document.

    The field is written as raw field elements (begin, instruction,
    separate, end) inside a single run of a new paragraph.

    Args:
        doc (Document): The Word document object.
    """
    def field_char(char_type):
        # Build a <w:fldChar> marker of the given type
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), char_type)
        return marker

    toc_run = doc.add_paragraph().add_run()

    # Field instruction: headings of levels 1-2, with the \h, \z and \u switches
    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = 'TOC \\o "1-2" \\h \\z \\u'

    field_parts = (
        field_char('begin'),
        instruction,
        field_char('separate'),
        field_char('end'),
    )
    for part in field_parts:
        toc_run._r.append(part)

    logging.info("Added Table of Contents to the Word document.")

def create_aggregated_document(all_results, base_name):
    """
    Build one Word report containing every comparison result of a base table
    and save it in the current directory.

    Each result gets a level-1 heading followed by a fixed sequence of
    level-2 sections (mismatches, nulls, duplicates, data-type issues, format
    issues, pincode issues, keys only on one side). A section shows a count
    and a table when it has data, otherwise a "nothing found" heading.

    Args:
        all_results (list): List of comparison result dictionaries. The
            'mismatches', 'df_master_only_keys' and 'df_target_only_keys'
            entries are tested for truthiness; the other sections are read
            through `.empty` / `.to_dict('records')`.
        base_name (str): The base name of the table.

    Returns:
        str: The filename of the saved report.
    """
    doc = Document()

    def list_section(rows, title, columns, empty_text):
        # Section backed by a plain list of record dicts
        if rows:
            doc.add_heading(f'{title} ({len(rows)})', level=2)
            create_table(doc, rows, columns)
        else:
            doc.add_heading(empty_text, level=2)

    def frame_section(frame, title, columns, empty_text):
        # Section backed by a DataFrame-like object
        if frame.empty:
            doc.add_heading(empty_text, level=2)
            return
        doc.add_heading(f'{title} ({len(frame)})', level=2)
        create_table(doc, frame.to_dict('records'), columns)

    # Front matter: title, generation time, TOC usage note and the TOC itself
    doc.add_heading(f'{base_name.capitalize()} Tables Comparison Report', level=0)
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    doc.add_paragraph(f'Report generated on {generated_at}\n')
    doc.add_paragraph(
        "📌 **Note:** To update the Table of Contents and make the links clickable, go to the ‘References’ tab and click ‘Update Table’ or press F9 in Windows and Fn+F9 in Mac after opening this document in Microsoft Word.",
        style='Intense Quote'
    )
    doc.add_heading('Table of Contents', level=1)
    add_table_of_contents(doc)
    doc.add_page_break()

    for result in all_results:
        table1_name = result['table1_name']
        table2_name = result['table2_name']
        key_column_master = result['key_column_master']
        key_column_target = result['key_column_target']
        doc.add_heading(f'Comparison: {table1_name} vs {table2_name}', level=1)

        list_section(
            result['mismatches'],
            'Mismatches',
            [key_column_master, 'column', f'{table1_name}_value', f'{table2_name}_value'],
            "No mismatches found.",
        )
        frame_section(
            result['null_values_master'],
            f'Null values in {table1_name}',
            [key_column_master, 'column', table2_name],
            f"No null values found in {table1_name}.",
        )
        frame_section(
            result['null_values_target'],
            f'Null values in {table2_name}',
            [key_column_target, 'column', table1_name],
            f"No null values found in {table2_name}.",
        )
        frame_section(
            result['duplicates_master'],
            f'Duplicate Keys in {table1_name}',
            [key_column_master, 'Difference in value'],
            "No duplicate keys found in master table.",
        )
        frame_section(
            result['duplicates_target'],
            f'Duplicate Keys in {table2_name}',
            [key_column_target, 'Difference in value'],
            f"No duplicate keys found in {table2_name}.",
        )
        frame_section(
            result['data_type_issues'],
            'Data Type Issues',
            ['column_name', f'{table1_name}_data_type', f'{table2_name}_data_type'],
            "No data type issues found.",
        )
        frame_section(
            result['format_issues_master'],
            f'Format Issues in {table1_name}',
            [key_column_master, 'column', 'value', 'issue', f'{table2_name}_value'],
            f"No format issues found in {table1_name}.",
        )
        frame_section(
            result['pincode_mapping_issues'],
            f'Pincode Mapping Issues in {table1_name}',
            [key_column_master, 'pincode', 'state', 'city', 'issue', f'{table2_name}_details'],
            "No pincode mapping issues found.",
        )
        list_section(
            result['df_master_only_keys'],
            f'Keys only in {table1_name}',
            [key_column_master],
            f"No keys found only in {table1_name}.",
        )
        list_section(
            result['df_target_only_keys'],
            f'Keys only in {table2_name}',
            [key_column_target],
            f"No keys found only in {table2_name}.",
        )

        doc.add_page_break()  # keep each comparison on its own pages

    # Save under a timestamped name in the current directory
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_filename = f"{base_name}_comparison_report_aggregated_{timestamp}.docx"
    doc.save(report_filename)
    logging.info(f"Saved aggregated comparison report as '{report_filename}'.")

    return report_filename

def send_slack_alert(message):
    """
    Send a message to the configured Slack channel on a best-effort basis.

    Reads the module-level `slack_client` and `SLACK_CHANNEL`. Failures are
    logged instead of raised, in the same way as `upload_file_to_slack`, so a
    failed notification does not abort the caller.

    Args:
        message (str): The message to send.
    """
    if not slack_client:
        logging.warning("Slack client is not initialized. Skipping Slack notification.")
        return

    try:
        response = slack_client.chat_postMessage(
            channel=SLACK_CHANNEL,
            text=message
        )
        logging.info(f"Message sent to {SLACK_CHANNEL}: {response['ts']}")
    except SlackApiError as e:
        logging.error(f"Error sending message to Slack: {e.response['error']}")
    except Exception as e:
        # Anything that is not a Slack API error (e.g. a transport failure)
        # is logged as well, matching upload_file_to_slack.
        logging.error(f"Unexpected error sending message to Slack: {e}")

def upload_file_to_slack(filepath, title=None):
    """
    Upload a file to the configured Slack channel via files_upload_v2.

    Reads the module-level `slack_client` and `SLACK_CHANNEL`. Every failure
    is logged; nothing is raised to the caller.

    Args:
        filepath (str): The path to the file to upload.
        title (str, optional): Title (and initial comment) for the upload.
            When falsy, the file's basename is used as the title and
            "File uploaded." as the comment.
    """
    if not slack_client:
        logging.warning("Slack client is not initialized. Skipping file upload.")
        return

    try:
        with open(filepath, 'rb') as handle:
            upload_name = os.path.basename(filepath)
            response = slack_client.files_upload_v2(
                channel=SLACK_CHANNEL,
                file=handle,
                filename=upload_name,  # keep the extension visible in Slack
                title=title or upload_name,
                initial_comment=title or "File uploaded."
            )

        # Slack reports failure through the 'ok' flag of the response
        if not response.get('ok'):
            logging.error(f"Failed to upload file to Slack: {response}")
            return

        file_permalink = response['file']['permalink']
        logging.info(f"File uploaded to Slack channel '{SLACK_CHANNEL}': {file_permalink}")
    except SlackApiError as e:
        logging.error(f"Slack API Error during file upload: {e.response['error']}")
    except Exception as e:
        logging.error(f"Unexpected error during file upload: {e}")

def find_non_matching_keys(df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table):
    """
    Identify keys present in df_master but not in df_target and vice versa.

    Keys are compared as stripped strings. Results are sorted by their string
    form so that the report and the error log come out in the same order on
    every run (plain set iteration order is not stable between runs).

    Args:
        df_master (pd.DataFrame): Source DataFrame.
        df_target (pd.DataFrame): Target DataFrame.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        duplicates_master (pd.DataFrame): Unused here; kept for interface compatibility.
        duplicates_target (pd.DataFrame): Unused here; kept for interface compatibility.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.

    Returns:
        tuple: (master_only_keys, target_only_keys, error_logs_m) where the
        first two are lists of single-entry dicts ({key_column: key}) and the
        third is a list of error-log dicts (master-only entries first).
    """
    keys_master = set(df_master[master_key].astype(str).str.strip())
    keys_target = set(df_target[target_key].astype(str).str.strip())

    # key=str keeps the sort safe even if a missing value survives astype(str)
    master_only = sorted(keys_master - keys_target, key=str)
    target_only = sorted(keys_target - keys_master, key=str)

    logging.info(f"Found {len(master_only)} keys in source not in target and {len(target_only)} keys in target not in source.")

    # Convert to list of dictionaries for consistency
    master_only_keys = [{master_key: key} for key in master_only]
    target_only_keys = [{target_key: key} for key in target_only]

    def missing_key_entry(key_column, key, present_in, missing_in):
        # One error-log record for a key found only in `present_in`
        return {
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'missing_key',
            'error_message': f"Key '{key_column}' with value '{key}' is present only in '{present_in}' and missing in '{missing_in}'.",
            'source_table': present_in,
            'target_table': missing_in,
            'issue_column': key_column,
            'unique_identifier': f"{key_column}: {key}"
        }

    error_logs_m = [missing_key_entry(master_key, key, master_table, target_table) for key in master_only]
    error_logs_m.extend(missing_key_entry(target_key, key, target_table, master_table) for key in target_only)

    return master_only_keys, target_only_keys, error_logs_m

def _collect_null_records(df_scan, df_other, scan_key, other_key, scan_table, other_table, columns_to_check):
    """Collect null cells of df_scan, pairing each with the counterpart value from df_other."""
    records = []
    errors = []

    # Only columns that exist in the scanned frame can be inspected; the key
    # column and '_boltic_' columns are never reported.
    inspect_columns = [
        column for column in columns_to_check
        if column in df_scan.columns
        and column != scan_key
        and not column.startswith('_boltic_')
    ]
    if not inspect_columns:
        return records, errors

    null_rows = df_scan[df_scan[inspect_columns].isnull().any(axis=1)]
    if null_rows.empty:
        return records, errors

    # Position of the first row for every stripped key of the other table,
    # built once instead of rescanning the frame for every null cell.
    first_position = {}
    for position, key in enumerate(df_other[other_key].astype(str).str.strip()):
        first_position.setdefault(key, position)

    for _, row in null_rows.iterrows():
        key_value = str(row[scan_key]).strip()
        for column in inspect_columns:
            if not pd.isnull(row[column]):
                continue
            position = first_position.get(key_value)
            if position is None:
                other_value = f"'{other_key}' not present"
            elif column in df_other.columns:
                other_value = df_other.iloc[position][column]
            else:
                other_value = "Column not present"
            records.append({
                scan_key: key_value,
                'column': column,
                other_table: other_value
            })
            errors.append({
                'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'issue': 'null',
                'error_message': 'Null in columns',
                'source_table': scan_table,
                'target_table': '',
                'issue_column': column,
                'unique_identifier': f'{scan_key} : {key_value}'
            })
    return records, errors


def find_detailed_nulls(df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check):
    """
    Identify null values in both master and target tables for the specified
    columns and fetch the corresponding value from the other table, or a
    marker when the key or the column is missing there.

    Columns listed in columns_to_check that do not exist in a given table are
    skipped for that table instead of raising a KeyError. The key column and
    columns starting with '_boltic_' are never reported.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master table).
        df_target (pd.DataFrame): Target DataFrame.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.
        columns_to_check (list): List of columns to check for null values.

    Returns:
        tuple: (null_values_master, null_values_target, error_logs_m). The
        first two are lists of dicts with the key, the null column and the
        counterpart value (stored under the other table's name); the third is
        the list of error-log dicts, master entries first.
    """
    null_values_master, master_errors = _collect_null_records(
        df_master, df_target, master_key, target_key, master_table, target_table, columns_to_check
    )
    null_values_target, target_errors = _collect_null_records(
        df_target, df_master, target_key, master_key, target_table, master_table, columns_to_check
    )
    error_logs_m = master_errors + target_errors

    logging.info(f"Found {len(null_values_master)} null values in master table '{master_table}'.")
    logging.info(f"Found {len(null_values_target)} null values in target table '{target_table}'.")
    return null_values_master, null_values_target, error_logs_m

def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
    """
    Validate pincode/city/state combinations of the master table against the
    `all_india_PO_list` reference table in the `Analytics` dataset.

    A master row is reported when its pincode is absent from the reference
    table, or when none of the reference entries for that pincode carries the
    same city and state (compared after stripping and lower-casing). Each
    reported row also carries the pincode details of the first target row
    sharing the same key, or a note that the key is missing in the target.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        client (bigquery.Client): Initialized BigQuery client.
        master_table (str): Name of the master table.

    Returns:
        tuple: (pincode_mapping_issues_df, error_logs_m). The DataFrame is empty
        (no columns) and the list is empty when the reference table cannot be
        loaded or `df_master` lacks a 'pincode', 'city' or 'state' column.
    """

    error_logs_m = []
    # Read the reference table from Analytics dataset
    try:
        reference_table = "all_india_PO_list"
        reference_dataset = "Analytics"
        query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
        reference_df = client.query(query).to_dataframe()
        reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
        reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
        reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
        logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
    except Exception as e:
        # Best effort: without the reference data nothing can be validated.
        logging.error(f"Failed to load reference pincode mapping: {e}")
        return pd.DataFrame(), error_logs_m

    # Check if df_master has 'pincode', 'city', 'state' columns
    required_columns = {'pincode', 'city', 'state'}
    if not required_columns.issubset(df_master.columns):
        logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
        return pd.DataFrame(), error_logs_m

    details_column = f'{target_table}_details'
    issue_columns = [key_column, 'pincode', 'state', 'city', 'issue', details_column]

    # Normalise the target keys once instead of once per master row.
    # Skipped for an empty master so df_target is never touched in that case.
    target_keys = None if df_master.empty else df_target[target_key].astype(str).str.strip()

    # Issue rows are collected in a list and turned into a DataFrame once at the
    # end; concatenating inside the loop would copy the frame on every issue.
    issue_rows = []

    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()
        pincode = str(row['pincode']).strip()
        city = str(row['city']).strip().lower()
        state = str(row['state']).strip().lower()

        # Describe the corresponding target row (first match) if one exists
        matching_targets = df_target[target_keys == key_value]
        if not matching_targets.empty:
            target_row = matching_targets.iloc[0]
            target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
            target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
            target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
            target_details = f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"
        else:
            target_details = f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."

        # All reference entries registered for this pincode
        ref_matches = reference_df[reference_df['pincode'] == pincode]
        if ref_matches.empty:
            issue = f"Invalid pincode ({pincode})."
        else:
            same_city = ref_matches['city'] == city
            same_state = ref_matches['state'] == state
            if (same_city & same_state).any():
                continue  # No issue, mapping is correct

            state_matches = ref_matches[same_state]
            city_matches = ref_matches[same_city]

            if state_matches.empty and city_matches.empty:
                # Both state and city do not match
                expected_entries = ref_matches[['state', 'city']].drop_duplicates()
                expected_states_str = ', '.join(expected_entries['state'].tolist())
                expected_cities_str = ', '.join(expected_entries['city'].tolist())
                issue = f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
            elif state_matches.empty:
                # State does not match
                expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
                issue = f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
            elif city_matches.empty:
                # City does not match; only cities within the matching state are suggested
                expected_cities_str = ', '.join(state_matches['city'].unique().tolist())
                issue = f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
            else:
                # State and city each match some entry, but never the same one
                issue = f"Pincode {pincode} has a mapping inconsistency."

        issue_rows.append({
            key_column: key_value,
            'pincode': pincode,
            'state': state,
            'city': city,
            'issue': issue,
            details_column: target_details
        })
        error_logs_m.append({
            'time_stamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'pincode_mapping',
            'error_message': f"{issue}. {target_table} Details: {target_details}",
            'source_table': master_table,
            'target_table': target_table,
            'issue_column': 'pincode',
            'unique_identifier': f'{key_column}: {key_value}'
        })

    pincode_mapping_issues = pd.DataFrame(issue_rows, columns=issue_columns)

    logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
    return pincode_mapping_issues, error_logs_m

def compare_tables(client, dataset_name, base_name, master_table, target_table, master_key, target_key):
    """
    Compare a master table with one target table and collect every finding.

    The tables are loaded, standardized and (optionally) restricted to active
    master rows. Unless `perform_checks` is disabled for the base table in
    BASE_TABLES, duplicates, value mismatches, nulls, data-type differences,
    format issues and pincode mapping issues are collected. Non-matching keys
    are always collected. Error-log entries produced by the individual checks
    are appended to the module-level ERROR_LOG_M list.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        base_name (str): The base name of the table.
        master_table (str): Name of the master_hub_ table.
        target_table (str): Name of the target prefixed table.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.

    Returns:
        dict | None: A dictionary containing all comparison results, or None
        when either table is empty (after filtering) or the two tables share
        no column.
    """
    logging.info(f"Starting comparison for base table '{base_name}': '{master_table}' vs '{target_table}'.")

    # Defaults reported for every check that is skipped
    mismatches = []
    null_values_master = []
    null_values_target = []
    data_type_issues = pd.DataFrame()
    format_issues_master = pd.DataFrame()
    pincode_mapping_issues = pd.DataFrame()
    duplicates_master = pd.DataFrame()
    duplicates_target = pd.DataFrame()
    master_only_keys = []
    target_only_keys = []

    # Load data
    df_master = load_table_from_bigquery(client, dataset_name, master_table)
    df_target = load_table_from_bigquery(client, dataset_name, target_table)

    # Apply standardization
    df_master = standardize_dataframe(df_master, exclude_columns=[master_key])
    df_target = standardize_dataframe(df_target, exclude_columns=[target_key])

    # Apply active filter if defined
    base_table_info = BASE_TABLES.get(base_name, {})
    active_filter = base_table_info.get('active_filter')
    perform_checks = base_table_info.get('perform_checks', True)

    if active_filter:
        column = active_filter.get('column')
        value = active_filter.get('value')
        if column and column in df_master.columns:
            initial_count = len(df_master)
            # .copy(): columns of this frame are reassigned further down (BigNumeric
            # formatting); writing into a bare boolean-mask slice triggers pandas'
            # SettingWithCopyWarning.
            df_master = df_master[df_master[column] == value].copy()
            filtered_count = len(df_master)
            logging.info(f"Filtered '{base_name}' master table: {initial_count - filtered_count} records excluded based on {column} = {value}.")
        else:
            logging.warning(f"Active filter specified but column '{column}' not found in master table '{master_table}'.")

    if df_master.empty or df_target.empty:
        logging.warning(f"One of the tables '{master_table}' or '{target_table}' is empty. Skipping comparison.")
        return None

    # Identify BigNumeric columns in master and target tables
    schema_master = get_table_schema(client, dataset_name, master_table)
    schema_target = get_table_schema(client, dataset_name, target_table)
    bignumeric_columns_master = [col for col, dtype in schema_master.items() if dtype == 'BIGNUMERIC']
    bignumeric_columns_target = [col for col, dtype in schema_target.items() if dtype == 'BIGNUMERIC']

    # Render BigNumeric values as plain digit strings ('.0f': no exponent, no decimals)
    for col in bignumeric_columns_master:
        if col in df_master.columns:
            df_master[col] = df_master[col].apply(lambda x: format(x, '.0f') if pd.notnull(x) else x)

    for col in bignumeric_columns_target:
        if col in df_target.columns:
            df_target[col] = df_target[col].apply(lambda x: format(x, '.0f') if pd.notnull(x) else x)

    # Get imp_columns and non_imp_columns
    imp_columns = Imp_columns.get(base_name, None)
    non_imp_columns = Non_imp_columns.get(base_name, [])

    # Identify common columns (the per-table unique columns are not needed here)
    common_columns, _, _ = find_common_and_non_common_columns(df_master, df_target)

    if not common_columns:
        logging.warning(f"No common columns found between '{master_table}' and '{target_table}'. Skipping comparison.")
        return None

    # Determine columns to check based on imp_columns
    if imp_columns:
        columns_to_check = [col for col in imp_columns if col in common_columns]
        logging.info(f"Important columns defined for '{base_name}': {columns_to_check}")
    else:
        columns_to_check = [col for col in common_columns if col not in non_imp_columns]
        logging.info(f"No important columns defined for '{base_name}'. Applying checks to all columns except non_imp_columns: {columns_to_check}")

    if perform_checks:
        # Find duplicates in both tables
        duplicates_master, error_logs_m = find_duplicates(df_master, master_key, master_table)
        ERROR_LOG_M.extend(error_logs_m)
        duplicates_target, error_logs_m = find_duplicates(df_target, target_key, target_table)
        ERROR_LOG_M.extend(error_logs_m)

        if not duplicates_master.empty:
            logging.warning(f"Duplicate keys found in source table '{master_table}'. These will be reported but not used in mismatch comparison.")
        if not duplicates_target.empty:
            logging.warning(f"Duplicate keys found in target table '{target_table}'. These will be reported but not used in mismatch comparison.")

        # Perform mismatch comparison
        mismatches, error_logs_m = find_mismatches(
            df_master,
            df_target,
            columns_to_check,
            master_key,
            target_key,
            master_table,
            target_table,
            duplicates_master,
            duplicates_target,
            non_imp_columns
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Find detailed null values in both tables
        null_values_master, null_values_target, error_logs_m = find_detailed_nulls(
            df_master,
            df_target,
            master_key,
            target_key,
            master_table,
            target_table,
            columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Validate data types between master and target schemas
        data_type_issues, error_logs_m = validate_data_types(
            schema_master,
            schema_target,
            master_key,
            master_table,
            target_table,
            columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Validate formats in master table only
        format_issues_master, error_logs_m = validate_formats(
            df_master,
            df_target,
            master_key,
            target_key,
            target_table,
            master_table,
            columns_to_check
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Validate pincode mapping if applicable
        if {'pincode', 'city', 'state'}.issubset(df_master.columns):
            pincode_mapping_issues, error_logs_m = validate_pincode_mapping(
                df_master,
                df_target,
                master_key,
                target_key,
                target_table,
                client,
                master_table
            )
            ERROR_LOG_M.extend(error_logs_m)

    # Find non-matching keys (runs even when perform_checks is False)
    master_only_keys, target_only_keys, error_logs_m = find_non_matching_keys(
        df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table
    )
    ERROR_LOG_M.extend(error_logs_m)

    # Compile results
    results = {
        'mismatches': mismatches,
        'null_values_master': pd.DataFrame(null_values_master),
        'null_values_target': pd.DataFrame(null_values_target),
        'duplicates_master': duplicates_master,
        'duplicates_target': duplicates_target,
        'data_type_issues': data_type_issues,
        'format_issues_master': format_issues_master,
        'pincode_mapping_issues': pincode_mapping_issues,
        'key_column_master': master_key,
        'key_column_target': target_key,
        'df_master_only_keys': master_only_keys,
        'df_target_only_keys': target_only_keys,
        'table1_name': master_table,
        'table2_name': target_table,
        'df_master': df_master,
        'df_target': df_target
    }

    logging.info(f"Completed comparison for '{master_table}' vs '{target_table}'.")
    return results

def generate_string_schema(df):
    """
    Build a BigQuery schema that declares every DataFrame column as a nullable STRING.

    Args:
        df (pd.DataFrame): The DataFrame whose columns define the schema.

    Returns:
        list: One SchemaField (type STRING, mode NULLABLE) per column, in column order.
    """
    string_fields = []
    for column_name in df.columns:
        string_fields.append(SchemaField(column_name, "STRING", mode="NULLABLE"))
    return string_fields

def _upload_dataframe_to_bigquery(client, analytics_dataset, table_name, df):
    """
    Helper function to upload a DataFrame to BigQuery, replacing the table contents.

    All values are converted to strings and loaded with an all-STRING schema.
    Upload failures are logged and not re-raised.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the Analytics dataset.
        table_name (str): The name of the table to upload.
        df (pd.DataFrame): The DataFrame to upload.

    Returns:
        None
    """
    if df.empty:
        logging.info(f"No data to upload for '{table_name}'. Skipping.")
        return

    # Convert all columns to string type
    df = df.astype(str)

    # Generate BigQuery schema with all fields as STRING
    schema = generate_string_schema(df)

    # Ensure table name doesn't exceed BigQuery's maximum length (1,024 characters).
    # The name is cut without an ellipsis: '.' is not allowed in a table name and is
    # the separator of the fully-qualified table ID, so a '...' suffix would make the
    # ID invalid.
    max_table_name_length = 1024
    if len(table_name) > max_table_name_length:
        original_table_name = table_name
        table_name = table_name[:max_table_name_length]
        logging.warning(f"Table name truncated from '{original_table_name}' to '{table_name}' due to length constraints.")

    # Define the full table ID
    table_id = f"{client.project}.{analytics_dataset}.{table_name}"

    # Upload the DataFrame to BigQuery
    try:
        job = client.load_table_from_dataframe(
            df,
            table_id,
            job_config=bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
                schema=schema  # Using the provided schema with all fields as STRING
            )
        )
        job.result()  # Wait for the job to complete
        logging.info(f"Successfully uploaded '{table_id}' with {len(df)} records.")
    except Exception as e:
        # Best effort: a failed upload must not abort the comparison run.
        logging.error(f"Failed to upload '{table_id}' to BigQuery: {e}")

def upload_comparison_results_to_bigquery(client, analytics_dataset, ERROR_LOG_M):
    """
    Upload the collected error log to the "error_logs" table of the Analytics dataset.

    Nothing is uploaded when the log is None, empty, or neither a list nor a
    DataFrame; each of these cases is only logged.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the Analytics dataset.
        ERROR_LOG_M (list | pd.DataFrame | None): The error log, as a list of
            dictionaries or an already-built DataFrame.

    Returns:
        None
    """
    if ERROR_LOG_M is None:
        logging.info("No error logs present.")
        return

    # Normalise the supported input types to a DataFrame
    if isinstance(ERROR_LOG_M, pd.DataFrame):
        log_frame = ERROR_LOG_M
    elif isinstance(ERROR_LOG_M, list):
        log_frame = pd.DataFrame(ERROR_LOG_M)
    else:
        logging.warning("Unsupported data type for ERROR_LOG. Skipping upload.")
        log_frame = None

    if log_frame is None or log_frame.empty:
        logging.info("No error logs to upload.")
        return

    _upload_dataframe_to_bigquery(client, analytics_dataset, "error_logs", log_frame)

def _format_comparison_summary(base_name, comparison_result, master_key, target_key):
    """Build the Slack summary text (issue counts per category) for one master-vs-target comparison."""
    master_name = comparison_result['table1_name']
    target_name = comparison_result['table2_name']

    total_mismatches = len(comparison_result['mismatches'])
    total_nulls_master = len(comparison_result['null_values_master'])
    total_nulls_target = len(comparison_result['null_values_target'])
    total_dup_master = len(comparison_result['duplicates_master'])
    total_dup_target = len(comparison_result['duplicates_target'])
    total_data_type_issues = len(comparison_result['data_type_issues'])
    total_format_issues_master = len(comparison_result['format_issues_master'])
    total_pincode_issues = len(comparison_result['pincode_mapping_issues'])
    total_non_matching_source = len(comparison_result.get('df_master_only_keys', []))
    total_non_matching_target = len(comparison_result.get('df_target_only_keys', []))

    return (
        f"✅ *Comparison Report Generated for `{base_name}`*\n"
        f"*Tables Compared: `{master_name}` vs `{target_name}`*\n"
        f"- *Total Mismatches between values of same column name of both tables : `{total_mismatches}`*\n"
        f"- *Total Null Values in `{master_name}`: `{total_nulls_master}`*\n"
        f"- *Total Null Values in `{target_name}`: `{total_nulls_target}`*\n"
        f"- *Duplicate `{master_key}` in `{master_name}`: `{total_dup_master}`*\n"
        f"- *Duplicate `{target_key}` in `{target_name}`: `{total_dup_target}`*\n"
        f"- *Total Data Type Issues(mismatch between datatype in columns with same name of both tables): `{total_data_type_issues}`*\n"
        f"- *Total Format/Value Issues(gstin, email, pincode) in `{master_name}`: `{total_format_issues_master}`*\n"
        f"- *Total Pincode Mapping Issues in `{master_name}`: `{total_pincode_issues}`*\n"
        "- *Non-Matching Keys*:\n"
        f"--*`{master_key}` only in `{master_name}` and not in `{target_name}`:`{total_non_matching_source}`,*\n"
        f"--*`{target_key}` only in `{target_name}` and not in `{master_name}`:`{total_non_matching_target}`*"
    )


def main():
    """
    Main function to orchestrate the comparison of multiple base tables against their master_hub_ counterparts.

    For every configured base table and each of its target prefixes the two
    tables are compared and a per-comparison summary is sent to Slack. After
    all comparisons the accumulated ERROR_LOG_M is uploaded to the 'Analytics'
    dataset. Any unexpected exception is logged, reported to Slack with its
    traceback, and ends the process with exit status 1.
    """
    try:
        # Initialize BigQuery client
        try:
            client = get_bigquery_client(PROJECT_ID)
        except Exception:
            logging.error("Exiting due to BigQuery client initialization failure.")
            return

        # Find common tables with 'master_hub_' and other prefixes, passing BASE_TABLES
        common_tables = find_common_tables_with_master_hub(client, DATASET_ID, PREFIXES, BASE_TABLES)

        if not common_tables:
            logging.info("No common tables found with 'master_hub_' and the specified prefixes.")
            return

        # Iterate over each base table and perform comparisons
        for base_name, tables in common_tables.items():
            base_table_info = BASE_TABLES.get(base_name)
            if not base_table_info:
                logging.warning(f"No configuration found for base table '{base_name}'. Skipping.")
                continue

            master_key = base_table_info.get('master_key')
            target_tables = base_table_info.get('targets', {})

            master_table = tables.get('master_hub_')
            if not master_table:
                logging.warning(f"Master table 'master_hub_{base_name}' not found. Skipping.")
                continue

            # Iterate through each prefix and its corresponding target_key
            for prefix, target_key in target_tables.items():
                target_table = tables.get(prefix)
                if not target_table:
                    logging.warning(f"Target table with prefix '{prefix}' for base table '{base_name}' not found. Skipping.")
                    continue

                comparison_result = compare_tables(
                    client,
                    DATASET_ID,
                    base_name,
                    master_table,
                    target_table,
                    master_key,
                    target_key  # Pass the correct target_key per prefix
                )
                # compare_tables returns None when the comparison was skipped
                if comparison_result:
                    # Send a separate Slack message for each comparison
                    send_slack_alert(
                        _format_comparison_summary(base_name, comparison_result, master_key, target_key)
                    )

        # Upload error logs to BigQuery after all comparisons
        upload_comparison_results_to_bigquery(
            client,
            'Analytics',
            ERROR_LOG_M
        )

        logging.info("All comparisons completed.")
    except Exception as e:
        # Capture the full traceback
        tb = traceback.format_exc()
        logging.error("An unexpected error occurred in the main process.", exc_info=True)

        # Prepare a detailed error message for Slack
        error_message = (
            f"❌ *Comparison Process Failed*\n"
            f"*Error:* {str(e)}\n"
            f"*Traceback:*\n```{tb}```"
        )
        send_slack_alert(error_message)

        # Exit with a non-zero status so a scheduler can detect the failure
        sys.exit(1)

# Run the full comparison pipeline as soon as this cell/script is executed.
main()


2024-11-27 09:24:13,345 - INFO - Slack client initialized successfully.
2024-11-27 09:24:13,819 - INFO - BigQuery client initialized successfully.
2024-11-27 09:24:14,925 - INFO - Found 211 tables in dataset 'Impetus_dev_sit'.
2024-11-27 09:24:14,926 - INFO - Identified 8 common base names with 'master_hub_' and other specified prefixes.
2024-11-27 09:24:14,927 - INFO - Starting comparison for base table 'brand': 'master_hub_brand' vs 'procuro_brand'.
2024-11-27 09:24:15,788 - INFO - Loaded data from table 'master_hub_brand' into DataFrame.
2024-11-27 09:24:16,538 - INFO - Loaded data from table 'procuro_brand' into DataFrame.
2024-11-27 09:24:16,542 - INFO - Standardized DataFrame for comparison.
2024-11-27 09:24:16,545 - INFO - Standardized DataFrame for comparison.
2024-11-27 09:24:16,546 - INFO - Filtered 'brand' master table: 22 records excluded based on is_active = True.
2024-11-27 09:24:16,647 - INFO - Retrieved schema for table 'master_hub_brand'.
2024-11-27 09:24:16,731 - INFO

In [None]:
#log fixed
#mismatch fixed
#main.py
#count in each heading
#Duplicate_table_fixed
#Big_numeric_readability
#paragraph to heading
#ToC added
#format issue enhanced
#pincode issue added and enhanced
#enhanced scalability in case target_key changes according to table
# working error log


import time
import os
import re
import logging
from datetime import datetime
import pandas as pd
from google.cloud import bigquery
from google.api_core.exceptions import GoogleAPIError
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from google.cloud.bigquery import SchemaField
import traceback
import sys
from datetime import datetime

# Configure logging: INFO level and above, formatted as "timestamp - level - message"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Configuration
PROJECT_ID = 'fynd-jio-impetus-non-prod'       # Replace with your project ID
DATASET_ID = 'Impetus_dev_sit'                 # Replace with your dataset ID
PREFIXES = ['procuro_', 'costing_engine_', 'scan_pack_', 'pigeon_']  # Define your prefixes
# Module-level accumulator: one dict per issue found, extended by the comparison checks.
# Keys written by the checks visible in this file:
# time_stamp, issue, error_message, source_table, target_table, issue_column, unique_identifier
ERROR_LOG_M = []
# Legacy layout (one target_key per base table), kept for reference only.
# It is superseded by the BASE_TABLES mapping defined further down.
# BASE_TABLES = {
#     'brand': {'master_key': 'code', 'target_key': 'code'},
#     'brand_pm_mapping': {'master_key': 'pm_id', 'target_key': 'pm_id'},
#     'brick': {'master_key': 'brick_code', 'target_key': 'code'},  # Different key columns
#     'coe_bom_element_type_mapping': {'master_key': 'coe_name', 'target_key': 'coe_name'},
#     'event_log': {'master_key': 'user_id', 'target_key': 'user_id'},
#     'supplier': {'master_key': 'supplier_code', 'target_key': 'supplier_code'}
# }
# Mapping of base table names to their master key and target keys per prefix

# Get the current datetime (captured once, when this cell/module is executed)
now = datetime.now()

# Per-base-table columns to leave out of the column-level checks.
# NOTE(review): in the compare_tables shown earlier in this file this list is
# only applied when the table has no Imp_columns entry - confirm for this cell.
Non_imp_columns = {
    'supplier': ['id', '_id', 'updated_at', 'created_at'],
    'vendor_details': ['id', '_id', 'updated_at', 'created_at']  # Add if applicable
}


# Define Important Columns: per base table, the columns the checks are limited to
# (tables without an entry fall back to "all common columns minus Non_imp_columns")
Imp_columns = {
    'brand': ['name', 'id', 'slug', 'code'],
}


# Comparison configuration, keyed by base table name. Keys of each entry:
#   master_key     - key column in the master table
#   master_table   - optional explicit master table name; when absent the master
#                    table is looked up as 'master_hub_<base name>'
#   targets        - maps a target-table prefix to the key column of the target
#                    table, which is looked up as '<prefix><base name>'
#   active_filter  - optional {'column', 'value'}; only master rows whose column
#                    equals value take part in the comparison
#   perform_checks - when False only the key comparison runs (defaults to True)
BASE_TABLES = {
    'brand': {
        'master_key': 'code',
        'targets': {
            'procuro_': 'code',
            'costing_engine_': 'code'
        },
        'active_filter': {
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True  # Default behavior
    },
    'brand_pm_mapping': {
        'master_key': 'pm_id',
        'targets': {
            'costing_engine_': 'pm_id'
        },
        'perform_checks': True
    },
    'brick': {
        'master_key': 'brick_code',
        'targets': {
            'costing_engine_': 'code'
        },
        'perform_checks': True
    },
    'coe_bom_element_type_mapping': {
        'master_key': 'coe_name',
        'targets': {
            'costing_engine_': 'coe_name'
        },
        'perform_checks': True
    },
    'event_log': {
        'master_key': 'user_id',
        'targets': {
            'costing_engine_': 'user_id'
        },
        'perform_checks': True
    },
    'supplier': {
        'master_key': 'supplier_code',
        'targets': {
            'procuro_': 'supplier_code',
            'costing_engine_': 'supplier_code'
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    'vendor_details': {  # Newly added entry
        'master_key': 'supplier_code',  # Using supplier_code as the key
        'master_table': 'master_hub_supplier',  # Specify the master table explicitly
        'targets': {
            'scan_pack_': 'vendor_code'
        },
        'active_filter': {  # Apply active filter
            'column': 'is_active',
            'value': True
        },
        'perform_checks': True
    },
    'hsn_tax_mapping': {  # Newly added base table for HSN Codes
        'master_key': 'hsn_code',  # Assuming 'hsn_code' is the key column
        'master_table': 'master_hub_hsn',
        'targets': {
            'procuro_': 'hsn_code',
        },
        'perform_checks': False  # Only perform key comparisons
    }
}



# Slack configuration
# The bot token is a credential and must not live in source control, so it is
# read from the SLACK_TOKEN environment variable. When it is not set, Slack
# notifications are disabled (see the warning below).
# NOTE: the token that used to be hard-coded here has been exposed in this
# file's history and should be revoked and re-issued.
SLACK_TOKEN = os.environ.get("SLACK_TOKEN")
# The channel ID is not a secret; it can be overridden via SLACK_CHANNEL.
SLACK_CHANNEL = os.environ.get("SLACK_CHANNEL", "C07UN19ETK5")

# Initialize Slack client
if SLACK_TOKEN and SLACK_CHANNEL:
    slack_client = WebClient(token=SLACK_TOKEN)
    logging.info("Slack client initialized successfully.")
else:
    slack_client = None
    logging.warning("Slack token or channel not found. Slack notifications will be disabled.")


def get_bigquery_client(project_id):
    """
    Create a BigQuery client bound to the given GCP project.

    Args:
        project_id (str): GCP project ID.

    Returns:
        bigquery.Client: An initialized BigQuery client.

    Raises:
        Exception: Any error raised while constructing the client is logged
            and then re-raised unchanged.
    """
    try:
        bq_client = bigquery.Client(project=project_id)
    except Exception as init_error:
        logging.error(f"Failed to initialize BigQuery client: {init_error}")
        raise
    logging.info("BigQuery client initialized successfully.")
    return bq_client

def find_common_tables_with_master_hub(client, dataset_name, prefixes, base_tables):
    """
    Resolve, for every configured base table, the master table and the target
    tables that actually exist in the dataset.

    The master table is `config['master_table']` when given, otherwise
    'master_hub_<base name>'. Target tables are looked up as
    '<prefix><base name>' for every prefix in `config['targets']`. Base tables
    whose master table is missing, or that have no existing target table, are
    left out of the result.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset to search within.
        prefixes (list): Currently unused; kept so existing callers keep
            working. The prefixes that are checked come from `base_tables`.
        base_tables (dict): The BASE_TABLES dictionary containing base table configurations.

    Returns:
        dict: Maps each base name to a dict of the form
        {'master_hub_': master_table, prefix: target_table, ...}.
        An empty dict is returned when listing the tables fails.
    """
    try:
        # Reference the dataset
        dataset_ref = client.dataset(dataset_name)

        # Collect the table names in a set: they are only used for membership tests
        table_names = {table.table_id for table in client.list_tables(dataset_ref)}
        logging.info(f"Found {len(table_names)} tables in dataset '{dataset_name}'.")

        common_tables = {}
        for base_name, config in base_tables.items():
            # Determine the master table
            master_table = config.get('master_table', f'master_hub_{base_name}')
            if master_table not in table_names:
                logging.warning(f"Master table '{master_table}' for base '{base_name}' not found in dataset.")
                continue

            matched = {'master_hub_': master_table}
            # Only the prefixes matter here; the target key columns are read later by the caller
            for prefix in config.get('targets', {}):
                target_table = f"{prefix}{base_name}"
                if target_table in table_names:
                    matched[prefix] = target_table

            # Keep only base names that have at least one target next to the master table
            if len(matched) > 1:
                common_tables[base_name] = matched

        logging.info(f"Identified {len(common_tables)} common base names with 'master_hub_' and other specified prefixes.")
        return common_tables

    except GoogleAPIError as e:
        # str(e) instead of e.message: the GoogleAPIError base class does not
        # define a `message` attribute, so e.message could raise inside the handler.
        logging.error(f"Google API Error: {e}")
        return {}
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return {}

def get_table_schema(client, dataset_name, table_name):
    """
    Retrieve the schema of a specified BigQuery table.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        dict: A dictionary mapping column names to their data types. An empty
        dict is returned (and the error is logged) when the lookup fails.
    """
    try:
        table_ref = client.dataset(dataset_name).table(table_name)
        table = client.get_table(table_ref)
        schema = {field.name: field.field_type for field in table.schema}
        logging.info(f"Retrieved schema for table '{table_name}'.")
        return schema
    except GoogleAPIError as e:
        # Format the exception itself rather than reading ``e.message``: the
        # base GoogleAPIError class does not guarantee that attribute, and an
        # AttributeError raised here would escape this handler entirely.
        logging.error(f"Failed to retrieve schema for table '{table_name}': {e}")
        return {}
    except Exception as e:
        logging.error(f"An unexpected error occurred while retrieving schema for table '{table_name}': {e}")
        return {}

def load_table_from_bigquery(client, dataset_name, table_name):
    """
    Load a table from BigQuery into a Pandas DataFrame.

    The query always targets the module-level ``PROJECT_ID`` and selects every
    column of the table.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        table_name (str): The name of the table.

    Returns:
        pd.DataFrame: DataFrame containing the table data. An empty DataFrame
        is returned (and the error is logged) when the load fails.
    """
    try:
        query = f"SELECT * FROM `{PROJECT_ID}.{dataset_name}.{table_name}`"
        df = client.query(query).to_dataframe()
        logging.info(f"Loaded data from table '{table_name}' into DataFrame.")
        return df
    except GoogleAPIError as e:
        # Format the exception itself rather than reading ``e.message``: the
        # base GoogleAPIError class does not guarantee that attribute, and an
        # AttributeError raised here would escape this handler entirely.
        logging.error(f"Failed to load table '{table_name}': {e}")
        return pd.DataFrame()
    except Exception as e:
        logging.error(f"An unexpected error occurred while loading table '{table_name}': {e}")
        return pd.DataFrame()

# def standardize_dataframe(df):
#     """
#     Standardize string columns in the DataFrame by stripping whitespace and converting to lowercase.

#     Args:
#         df (pd.DataFrame): The DataFrame to standardize.

#     Returns:
#         pd.DataFrame: Standardized DataFrame.
#     """
#     df_copy = df.copy()
#     for col in df_copy.columns:
#         if pd.api.types.is_string_dtype(df_copy[col]):
#             df_copy[col] = df_copy[col].astype(str).str.strip().str.lower()
#     logging.info("Standardized DataFrame for comparison.")
#     return df_copy

def standardize_dataframe(df, exclude_columns=None):
    """
    Standardize string columns in the DataFrame by stripping whitespace and converting to lowercase,
    excluding specified columns.

    The input DataFrame is not modified; a standardized copy is returned.

    Args:
        df (pd.DataFrame): The DataFrame to standardize.
        exclude_columns (list, optional): Columns to exclude from standardization.
            Defaults to None, meaning no column is excluded.

    Returns:
        pd.DataFrame: Standardized DataFrame.
    """
    # A None default avoids sharing one mutable list object across calls.
    excluded = [] if exclude_columns is None else exclude_columns

    df_copy = df.copy()
    for col in df_copy.columns:
        if col in excluded:
            continue  # Skip standardizing this column
        if pd.api.types.is_string_dtype(df_copy[col]):
            # Values are cast to str first, so non-string entries in a column
            # detected as string-typed are stringified as well.
            df_copy[col] = df_copy[col].astype(str).str.strip().str.lower()
    logging.info("Standardized DataFrame for comparison.")
    return df_copy


def find_common_and_non_common_columns(df1, df2):
    """
    Identify common and unique columns between two DataFrames.

    Results follow the column order of the originating DataFrame (``df1`` for
    the common and df1-only lists, ``df2`` for the df2-only list), so the
    output is reproducible from run to run. Repeated column names are reported
    once.

    Args:
        df1 (pd.DataFrame): First DataFrame.
        df2 (pd.DataFrame): Second DataFrame.

    Returns:
        tuple: (common_columns, df1_unique_columns, df2_unique_columns)
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    columns_1 = list(dict.fromkeys(df1.columns))
    columns_2 = list(dict.fromkeys(df2.columns))
    lookup_1 = set(columns_1)
    lookup_2 = set(columns_2)

    common_columns = [col for col in columns_1 if col in lookup_2]
    df1_unique_columns = [col for col in columns_1 if col not in lookup_2]
    df2_unique_columns = [col for col in columns_2 if col not in lookup_1]
    logging.info(f"Found {len(common_columns)} common columns, {len(df1_unique_columns)} unique to first table, {len(df2_unique_columns)} unique to second table.")
    return common_columns, df1_unique_columns, df2_unique_columns

def find_mismatches(df_master, df_target, common_columns, master_key, target_key, table1, table2, duplicates_master, duplicates_target, non_imp_columns):
    """
    Identify mismatches between two DataFrames based on common columns and key columns.

    Both frames are reduced to the first row per key, inner-joined on the key,
    and every compared column is checked value by value. Two nulls are treated
    as equal; a null against a value is a mismatch.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
        df_target (pd.DataFrame): Target DataFrame (prefixed table).
        common_columns (list): List of common columns to compare.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        table1 (str): Name of the source table.
        table2 (str): Name of the target table.
        duplicates_master (pd.DataFrame): Duplicate keys in master table (not used by this function).
        duplicates_target (pd.DataFrame): Duplicate keys in target table (not used by this function).
        non_imp_columns (list): Columns to leave out of the comparison. None is
            treated as "no columns excluded".

    Returns:
        tuple: (mismatches, error_logs_m)
            - mismatches (list of dict): One entry per differing value, with the
              key, the column name and both values.
            - error_logs_m (list of dict): One error-log entry per mismatch.
    """
    mismatches = []
    error_logs_m = []
    # Ensure key columns are present in both DataFrames
    if master_key not in df_master.columns or target_key not in df_target.columns:
        logging.error(f"Key columns '{master_key}' or '{target_key}' not found in the respective tables.")
        # Same 2-tuple shape as the normal exit so callers can always unpack.
        return mismatches, error_logs_m

    # Rename target key to match master key for easier comparison
    df_target_renamed = df_target.rename(columns={target_key: master_key})

    # Merge DataFrames on the master_key
    merged_df = pd.merge(
        df_master.drop_duplicates(subset=master_key),
        df_target_renamed.drop_duplicates(subset=master_key),
        on=master_key,
        suffixes=(f'_{table1}', f'_{table2}'),
        how='inner'
    )

    logging.info(f"Merged DataFrame has {len(merged_df)} records for mismatch comparison.")

    # The set of compared columns does not depend on the row, so resolve it
    # (and the suffixed column names produced by the merge) once up front.
    skipped_columns = non_imp_columns or ()
    compared = [
        (column, f"{column}_{table1}", f"{column}_{table2}")
        for column in common_columns
        if not (column.startswith('_boltic_') or column in skipped_columns)
    ]

    for _, row in merged_df.iterrows():
        key = row[master_key]
        for column, master_col, target_col in compared:
            # A column that was not suffixed by the merge (e.g. the join key)
            # is absent here; .get() then yields None on both sides and the
            # pair is treated as equal.
            val_master = row.get(master_col)
            val_target = row.get(target_col)
            # Handle NaN values in comparison
            if pd.isna(val_master) and pd.isna(val_target):
                continue  # Both are NaN, treat as equal
            if pd.isna(val_master) or pd.isna(val_target) or val_master != val_target:
                mismatches.append({
                    master_key: key,
                    'column': column,
                    f'{table1}_value': val_master,
                    f'{table2}_value': val_target
                })
                error_logs_m.append({
                    # ``now`` is expected to be defined at module level elsewhere in the file.
                    'time_stamp': now.strftime('%Y-%m-%d %H:%M:%S'),
                    'issue': 'mismatch',
                    'error_message': '',
                    'source_table': f'{table1}',
                    'target_table': f'{table2}',
                    'issue_column': column,
                    'unique_identifier': f'{master_key}: {key}'
                })

    logging.info(f"Found {len(mismatches)} mismatches between '{table1}' and '{table2}'.")
    return mismatches, error_logs_m

def find_duplicates(df, key_column, table_name):
    """
    Detect duplicate key_column entries in the DataFrame and identify differences.

    For every key that occurs more than once, the remaining columns (excluding
    those whose name starts with '_boltic_') are inspected and the ones whose
    values differ between the duplicated rows are listed. A null next to a
    non-null value counts as a difference.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        key_column (str): The key column to check for duplicates.
        table_name (str): Name of the table, recorded in the error-log entries.

    Returns:
        tuple: (duplicates, error_logs_m)
            - duplicates (pd.DataFrame): One row per duplicated key with a
              'Difference in value' description. Empty when there are none or
              when key_column is missing.
            - error_logs_m (list of dict): One error-log entry per duplicated key.
    """
    if key_column not in df.columns:
        logging.error(f"Key column '{key_column}' not found in DataFrame.")
        # Same 2-tuple shape as the normal exit so callers can always unpack.
        return pd.DataFrame(), []

    # Get all duplicate entries (keep=False to get all duplicates)
    duplicates_df = df[df.duplicated(subset=key_column, keep=False)]

    # Columns taken into account when describing how duplicated rows differ.
    compare_columns = [
        col for col in df.columns
        if col != key_column and not str(col).startswith('_boltic_')
    ]

    duplicate_records = []
    error_logs_m = []

    for key, group in duplicates_df.groupby(key_column):
        # A column differs when it holds more than one distinct value within
        # the group; dropna=False makes a null count as a value of its own.
        distinct_counts = group[compare_columns].nunique(dropna=False)
        cols_with_diff = distinct_counts[distinct_counts > 1].index.tolist()

        if cols_with_diff:
            difference = "Difference in value of columns: " + ', '.join(cols_with_diff)
        else:
            difference = "No difference exists"

        duplicate_records.append({
            key_column: key,
            'Difference in value': difference
        })
        error_logs_m.append({
            # ``now`` is expected to be defined at module level elsewhere in the file.
            'time_stamp': now.strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'duplicate',
            'error_message': f'{difference}',
            'source_table': f'{table_name}',
            'target_table': '',
            'issue_column': '',
            'unique_identifier': f'{key_column}: {key}'
        })

    logging.info(f"Found {len(duplicate_records)} duplicate entries based on '{key_column}'.")
    return pd.DataFrame(duplicate_records), error_logs_m

def validate_data_types(schema_master, schema_target, master_key, table1_name, table2_name):
    """
    Compare data types of common columns between master and target schemas.

    Columns are visited in the order they appear in ``schema_master``, so the
    reported issues come out in a reproducible order.

    Args:
        schema_master (dict): Schema of the master table (column name -> type).
        schema_target (dict): Schema of the target table (column name -> type).
        master_key (str): The key column for reference (not used by this function).
        table1_name (str): Name of the first table.
        table2_name (str): Name of the second table.

    Returns:
        tuple: (data_type_issues, error_logs_m)
            - data_type_issues (pd.DataFrame): Data type discrepancies with
              table names in the column headers.
            - error_logs_m (list of dict): One error-log entry per discrepancy.
    """
    data_type_issues = []
    error_logs_m = []

    for column, type_master in schema_master.items():
        if column not in schema_target:
            continue  # Only columns present in both schemas are compared
        type_target = schema_target[column]
        if type_master == type_target:
            continue

        data_type_issues.append({
            'column_name': column,
            f'{table1_name}_data_type': type_master,
            f'{table2_name}_data_type': type_target
        })
        error_logs_m.append({
            # ``now`` is expected to be defined at module level elsewhere in the file.
            'time_stamp': now.strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'data_type_issues',
            'error_message': f'{table1_name}_data_type: {type_master} , {table2_name}_data_type: {type_target}',
            'source_table': f'{table1_name}',
            'target_table': f'{table2_name}',
            'issue_column': column,
            'unique_identifier': ''
        })

    logging.info(f"Found {len(data_type_issues)} data type issues.")
    return pd.DataFrame(data_type_issues), error_logs_m

def validate_formats(df_master, df_target, key_column, target_key, target_table, master_table):
    """
    Validate specific column formats using regular expressions and include corresponding target table values.

    The 'gstin', 'email' and 'pincode' columns of the master DataFrame are
    checked when present. Each value is converted to a string and stripped
    before matching. For every value that fails, the value of the same column
    in the first target row with the same (stringified, stripped) key is
    looked up.

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        master_table (str): The name of the master table, recorded in the error-log entries.

    Returns:
        tuple: (format_issues, error_logs_m)
            - format_issues (pd.DataFrame): Format issues with corresponding
              target table values.
            - error_logs_m (list of dict): One error-log entry per format issue.
    """
    output_columns = [key_column, 'column', 'value', 'issue', f'{target_table}_value']

    # (column, pattern, issue text) for every supported format check.
    format_rules = (
        ('gstin', re.compile(r'^[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z]{1}[A-Z0-9]{3}$'), 'Invalid GSTIN format'),
        ('email', re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'), 'Invalid email format'),
        ('pincode', re.compile(r'^\d{6}$'), 'Pincode must be exactly 6 digits'),
    )
    active_rules = [rule for rule in format_rules if rule[0] in df_master.columns]

    issue_records = []
    error_logs_m = []

    if not active_rules:
        # None of the validated columns exist, so there is nothing to scan.
        logging.info(f"Found {len(issue_records)} format issues.")
        return pd.DataFrame(issue_records, columns=output_columns), error_logs_m

    # Maps a normalised target key to the position of its first row. Built
    # lazily, on the first failing value, and then reused for all later ones.
    target_positions = None

    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()

        for column, pattern, issue in active_rules:
            if pattern.match(str(row[column]).strip()):
                continue

            if target_positions is None:
                target_positions = {}
                normalised_keys = df_target[target_key].astype(str).str.strip()
                for position, normalised in enumerate(normalised_keys):
                    target_positions.setdefault(normalised, position)

            # Fetch corresponding target value
            position = target_positions.get(key_value)
            if position is None:
                target_value = f"'{target_key}' not present"
            else:
                target_row = df_target.iloc[position]
                target_value = target_row[column] if column in target_row else "Column not present"

            issue_records.append({
                key_column: key_value,
                'column': column,
                'value': row[column],
                'issue': issue,
                f'{target_table}_value': target_value
            })
            error_logs_m.append({
                # ``now`` is expected to be defined at module level elsewhere in the file.
                'time_stamp': now.strftime('%Y-%m-%d %H:%M:%S'),
                'issue': 'format_issue',
                'error_message': issue,
                'source_table': f'{master_table}',
                'target_table': '',
                'issue_column': column,
                'unique_identifier': f'{key_column}: {key_value}'
            })

    # Build the frame once instead of concatenating inside the loop.
    format_issues = pd.DataFrame(issue_records, columns=output_columns)
    logging.info(f"Found {len(format_issues)} format issues.")
    return format_issues, error_logs_m


def create_table(doc, data, column_names):
    """
    Render a list of dictionaries as a table in a docx document.

    Nothing is added when ``data`` is empty. Otherwise a header row carrying
    ``column_names`` is followed by one row per record; missing fields are
    rendered as empty cells and every value is stringified and stripped.

    Args:
        doc (Document): The Word document object.
        data (list of dict): Records to populate the table.
        column_names (list): Column names for the table headers, also used as
            the lookup keys into each record.
    """
    if not data:
        return

    table = doc.add_table(rows=1, cols=len(column_names))
    table.style = 'Light List Accent 1'

    # Header row
    for header_cell, heading in zip(table.rows[0].cells, column_names):
        header_cell.text = heading

    # One body row per record
    for record in data:
        body_cells = table.add_row().cells
        for body_cell, heading in zip(body_cells, column_names):
            body_cell.text = str(record.get(heading, '')).strip()

    logging.info("Added table to the Word document.")

def add_non_matching_keys_section(doc, df1_only_keys, table1_name, df2_only_keys, table2_name, key_column_master, key_column_target):
    """
    Write the non-matching-keys part of the report for a pair of tables.

    A heading and a one-column table are emitted for each side that has keys
    missing from the other side; when neither side has any, a single
    explanatory paragraph is written instead.

    Args:
        doc (Document): The Word document object.
        df1_only_keys (list): Keys present only in table1.
        table1_name (str): Name of the first table.
        df2_only_keys (list): Keys present only in table2.
        table2_name (str): Name of the second table.
        key_column_master (str): The key column in the master table.
        key_column_target (str): The key column in the target table.
    """
    if not df1_only_keys and not df2_only_keys:
        doc.add_paragraph("No non-matching keys found.")
        return

    # (keys, key column, table holding the keys, table lacking them)
    sides = (
        (df1_only_keys, key_column_master, table1_name, table2_name),
        (df2_only_keys, key_column_target, table2_name, table1_name),
    )
    for keys, key_column, present_in, absent_from in sides:
        if not keys:
            continue
        doc.add_heading(f"'{key_column}' present only in '{present_in}' and not in '{absent_from}' ({len(keys)})", level=2)
        rows = [{key_column: key.strip()} for key in keys]
        create_table(doc, rows, [key_column])

def add_table_of_contents(doc):
    """
    Append a Table of Contents field to the Word document.

    The field is assembled from raw XML elements (begin marker, TOC
    instruction, separator, end marker) appended to a run in a new paragraph.

    Args:
        doc (Document): The Word document object.
    """
    def _field_char(char_type):
        # Build a <w:fldChar> element of the requested type.
        element = OxmlElement('w:fldChar')
        element.set(qn('w:fldCharType'), char_type)
        return element

    toc_run = doc.add_paragraph().add_run()

    # Field instruction: heading levels 1-2, with the \h, \z and \u switches.
    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = 'TOC \\o "1-2" \\h \\z \\u'

    field_parts = (
        _field_char('begin'),
        instruction,
        _field_char('separate'),
        _field_char('end'),
    )
    for part in field_parts:
        toc_run._r.append(part)

    logging.info("Added Table of Contents to the Word document.")

def create_aggregated_document(all_results, base_name):
    """
    Creates a single Word document that presents all comparison results for a base table.

    The document starts with a title, a generation timestamp, a note on how to
    refresh the Table of Contents and the Table of Contents itself. Each
    comparison result then gets a level-1 heading followed by one level-2
    section per check (mismatches, nulls, duplicates, data types, formats,
    pincode mapping, keys found on one side only). A section with findings
    shows their count in the heading and a table; a section without findings
    shows only a "nothing found" heading. The file is saved in the current
    working directory.

    Args:
        all_results (list): List of comparison result dictionaries.
        base_name (str): The base name of the table.

    Returns:
        str: The filepath of the saved report.
    """
    # One timestamp for both the report header and the file name.
    generated_at = datetime.now()

    doc = Document()
    doc.add_heading(f'{base_name.capitalize()} Tables Comparison Report', level=0)
    doc.add_paragraph(f'Report generated on {generated_at.strftime("%Y-%m-%d %H:%M:%S")}\n')

    # Add Instruction for TOC Update
    doc.add_paragraph(
        "📌 **Note:** To update the Table of Contents and make the links clickable, go to ‘Reference’ tab and click ‘Update_Table’ or press F9 in Windows and Fn+F9 in mac, after opening this document in Microsoft Word.",
        style='Intense Quote'
    )

    # Add Table of Contents
    doc.add_heading('Table of Contents', level=1)
    add_table_of_contents(doc)
    doc.add_page_break()

    def add_section(records, heading, empty_heading, column_names):
        # Emit "<heading> (<count>)" plus a table, or just the empty heading.
        if records:
            doc.add_heading(f'{heading} ({len(records)})', level=2)
            create_table(doc, records, column_names)
        else:
            doc.add_heading(empty_heading, level=2)

    def frame_records(frame):
        # DataFrame findings become a list of row dicts ([] when empty).
        return [] if frame.empty else frame.to_dict('records')

    for result in all_results:
        table1_name = result['table1_name']
        table2_name = result['table2_name']
        key_column_master = result['key_column_master']
        key_column_target = result['key_column_target']
        doc.add_heading(f'Comparison: {table1_name} vs {table2_name}', level=1)

        # Mismatches
        add_section(
            result['mismatches'],
            'Mismatches',
            "No mismatches found.",
            [key_column_master, 'column', f'{table1_name}_value', f'{table2_name}_value'],
        )

        # Null values in master table
        add_section(
            frame_records(result['null_values_master']),
            f'Null values in {table1_name}',
            f"No null values found in {table1_name}.",
            [key_column_master, 'column', table2_name],
        )

        # Null values in target table
        add_section(
            frame_records(result['null_values_target']),
            f'Null values in {table2_name}',
            f"No null values found in {table2_name}.",
            [key_column_target, 'column', table1_name],
        )

        # Duplicate keys in master table
        add_section(
            frame_records(result['duplicates_master']),
            f'Duplicate Keys in {table1_name}',
            "No duplicate keys found in master table.",
            [key_column_master, 'Difference in value'],
        )

        # Duplicate keys in target table with actual table name
        add_section(
            frame_records(result['duplicates_target']),
            f'Duplicate Keys in {table2_name}',
            f"No duplicate keys found in {table2_name}.",
            [key_column_target, 'Difference in value'],
        )

        # Data type issues
        add_section(
            frame_records(result['data_type_issues']),
            'Data Type Issues',
            "No data type issues found.",
            ['column_name', f'{table1_name}_data_type', f'{table2_name}_data_type'],
        )

        # Format issues in master table with target values
        add_section(
            frame_records(result['format_issues_master']),
            f'Format Issues in {table1_name}',
            f"No format issues found in {table1_name}.",
            [key_column_master, 'column', 'value', 'issue', f'{table2_name}_value'],
        )

        # Pincode Mapping Issues with target details
        add_section(
            frame_records(result['pincode_mapping_issues']),
            f'Pincode Mapping Issues in {table1_name}',
            "No pincode mapping issues found.",
            [key_column_master, 'pincode', 'state', 'city', 'issue', f'{table2_name}_details'],
        )

        # Non-matching keys in master DataFrame
        add_section(
            result['df_master_only_keys'],
            f'Keys only in {table1_name}',
            f"No keys found only in {table1_name}.",
            [key_column_master],
        )

        # Non-matching keys in target DataFrame
        add_section(
            result['df_target_only_keys'],
            f'Keys only in {table2_name}',
            f"No keys found only in {table2_name}.",
            [key_column_target],
        )

        doc.add_page_break()  # Optional: Add a page break between comparisons

    # Save the aggregated document to the current directory
    timestamp = generated_at.strftime('%Y%m%d_%H%M%S')
    report_filename = f"{base_name}_comparison_report_aggregated_{timestamp}.docx"
    doc.save(report_filename)
    logging.info(f"Saved aggregated comparison report as '{report_filename}'.")

    return report_filename  # Return the filename for further processing



def send_slack_alert(message):
    """
    Post a text message to the configured Slack channel.

    The call is skipped (with a warning) when no Slack client is available.
    Slack API failures are logged rather than raised.

    Args:
        message (str): The message to send.
    """
    if not slack_client:
        logging.warning("Slack client is not initialized. Skipping Slack notification.")
        return

    try:
        response = slack_client.chat_postMessage(channel=SLACK_CHANNEL, text=message)
    except SlackApiError as err:
        logging.error(f"Error sending message to Slack: {err.response['error']}")
    else:
        # Reached only when the post itself did not raise.
        logging.info(f"Message sent to {SLACK_CHANNEL}: {response['ts']}")

def upload_file_to_slack(filepath, title=None):
    """
    Send a local file to the configured Slack channel via files_upload_v2.

    The upload is skipped (with a warning) when no Slack client is available.
    Any failure, Slack-specific or otherwise, is logged rather than raised.

    Args:
        filepath (str): The path to the file to upload.
        title (str, optional): The title for the uploaded file. Defaults to the file's basename.
    """
    if not slack_client:
        logging.warning("Slack client is not initialized. Skipping file upload.")
        return

    try:
        with open(filepath, 'rb') as handle:
            file_name = os.path.basename(filepath)
            response = slack_client.files_upload_v2(
                channel=SLACK_CHANNEL,
                file=handle,
                filename=file_name,  # Keeps the extension visible in Slack
                title=title or file_name,
                initial_comment=title or "File uploaded.",
            )

        # Inspect the response to confirm the upload went through
        if not response.get('ok'):
            logging.error(f"Failed to upload file to Slack: {response}")
        else:
            logging.info(f"File uploaded to Slack channel '{SLACK_CHANNEL}': {response['file']['permalink']}")
    except SlackApiError as err:
        logging.error(f"Slack API Error during file upload: {err.response['error']}")
    except Exception as err:
        logging.error(f"Unexpected error during file upload: {err}")


def find_non_matching_keys(df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table):
    """
    Identify keys present in df_master but not in df_target and vice versa.

    Keys are compared as stripped strings. Each missing key is reported once,
    in the order it first appears in its own table, so results are
    reproducible from run to run.

    Args:
        df_master (pd.DataFrame): Source DataFrame.
        df_target (pd.DataFrame): Target DataFrame.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        duplicates_master (pd.DataFrame): Duplicate keys in master table (not used by this function).
        duplicates_target (pd.DataFrame): Duplicate keys in target table (not used by this function).
        master_table (str): Name of the master table, recorded in the error-log entries.
        target_table (str): Name of the target table, recorded in the error-log entries.

    Returns:
        tuple: (master_only_keys, target_only_keys, error_logs_m)
            - master_only_keys (list of dict): Keys present only in master DataFrame.
            - target_only_keys (list of dict): Keys present only in target DataFrame.
            - error_logs_m (list of dict): One error-log entry per missing key,
              master-only keys first.
    """
    master_values = df_master[master_key].astype(str).str.strip()
    target_values = df_target[target_key].astype(str).str.strip()
    keys_master = set(master_values)
    keys_target = set(target_values)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    master_only = [key for key in dict.fromkeys(master_values) if key not in keys_target]
    target_only = [key for key in dict.fromkeys(target_values) if key not in keys_master]

    logging.info(f"Found {len(master_only)} keys in source not in target and {len(target_only)} keys in target not in source.")

    # Convert to list of dictionaries for consistency
    master_only_keys = [{master_key: key} for key in master_only]
    target_only_keys = [{target_key: key} for key in target_only]

    def missing_key_entry(key, key_column, present_in, absent_from):
        # Error-log entry for a key found in ``present_in`` but not in ``absent_from``.
        return {
            # ``now`` is expected to be defined at module level elsewhere in the file.
            'time_stamp': now.strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'missing_key',
            'error_message': f"Key '{key_column}' with value '{key}' is present only in '{present_in}' and missing in '{absent_from}'.",
            'source_table': present_in,
            'target_table': absent_from,
            'issue_column': key_column,
            'unique_identifier': f"{key_column}: {key}"
        }

    # Log errors for keys only in master, then for keys only in target
    error_logs_m = [missing_key_entry(key, master_key, master_table, target_table) for key in master_only]
    error_logs_m.extend(missing_key_entry(key, target_key, target_table, master_table) for key in target_only)

    return master_only_keys, target_only_keys, error_logs_m



def _collect_null_details(df_source, df_other, source_key, other_key, source_table, other_table):
    """
    Report every null cell of ``df_source`` with the matching value from ``df_other``.

    Args:
        df_source (pd.DataFrame): Table whose null cells are being reported.
        df_other (pd.DataFrame): Table searched for the same key.
        source_key (str): Key column of ``df_source``.
        other_key (str): Key column of ``df_other``.
        source_table (str): Name of ``df_source``'s table (used in the error log).
        other_table (str): Name of ``df_other``'s table (used as a record key).

    Returns:
        tuple: (null_records, error_logs), both lists of dicts, one entry per null cell.
    """
    null_records = []
    error_logs = []

    rows_with_nulls = df_source[df_source.isnull().any(axis=1)]
    if rows_with_nulls.empty:
        return null_records, error_logs

    # Normalise the other table's keys once (instead of once per null cell) and
    # remember where each key first occurs; this mirrors taking `.iloc[0]` of
    # the rows whose stripped string key equals the looked-up key.
    other_keys = df_other[other_key].astype(str).str.strip()
    first_position = {}
    for position, other_value in enumerate(other_keys):
        first_position.setdefault(other_value, position)

    for _, row in rows_with_nulls.iterrows():
        key_value = str(row[source_key]).strip()
        for column in df_source.columns:
            if column == source_key or column.startswith('_boltic_'):
                continue  # Skip key column and columns starting with '_boltic_'
            if pd.isnull(row[column]):
                position = first_position.get(key_value)
                if position is not None:
                    other_row = df_other.iloc[position]
                    other_value = other_row[column] if column in other_row else "Column not present"
                else:
                    other_value = f"'{other_key}' not present"

                error_logs.append({
                    'time_stamp': now.strftime('%Y-%m-%d %H:%M:%S'),
                    'issue': 'null',
                    'error_message': 'Null in columns',
                    'source_table': f'{source_table}',
                    'target_table': '',
                    'issue_column': column,
                    'unique_identifier': f'{source_key} : {key_value}'
                })
                null_records.append({
                    source_key: key_value,
                    'column': column,
                    other_table: other_value
                })

    return null_records, error_logs


def find_detailed_nulls(df_master, df_target, master_key, target_key, master_table, target_table):
    """
    Identify null values in both master and target tables and fetch corresponding values or indicate missing keys.

    Columns named like the key column or starting with '_boltic_' are ignored.
    Keys are compared as stripped strings; when a key occurs several times in
    the other table, the first occurrence supplies the comparison value.

    Args:
        df_master (pd.DataFrame): Source DataFrame (master_hub_ table).
        df_target (pd.DataFrame): Target DataFrame (prefixed table).
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.
        master_table (str): Name of the master table.
        target_table (str): Name of the target table.

    Returns:
        tuple: (null_values_master, null_values_target, error_logs_m)
            - null_values_master (list of dict): One record per null cell in the master table,
              holding the key, the column name and the target table's value for that key/column.
            - null_values_target (list of dict): Same, for null cells in the target table.
            - error_logs_m (list of dict): Error-log entries, master entries first.
    """
    null_values_master, master_logs = _collect_null_details(
        df_master, df_target, master_key, target_key, master_table, target_table
    )
    null_values_target, target_logs = _collect_null_details(
        df_target, df_master, target_key, master_key, target_table, master_table
    )
    error_logs_m = master_logs + target_logs

    logging.info(f"Found {len(null_values_master)} null values in master table '{master_table}'.")
    logging.info(f"Found {len(null_values_target)} null values in target table '{target_table}'.")
    return null_values_master, null_values_target, error_logs_m


def validate_pincode_mapping(df_master, df_target, key_column, target_key, target_table, client, master_table):
    """
    Validate pincode mapping by comparing with the all_india_PO_list reference table.

    Every master row whose pincode is absent from the reference table, or whose
    city/state pair is not listed for that pincode, is reported together with
    the pincode/state/city of the first target row that shares the same key
    (keys are compared as stripped strings; city and state are compared
    lower-cased).

    Args:
        df_master (pd.DataFrame): The master DataFrame to validate.
        df_target (pd.DataFrame): The target DataFrame to fetch corresponding values.
        key_column (str): The key column in the master DataFrame.
        target_key (str): The key column in the target DataFrame.
        target_table (str): The name of the target table.
        client (bigquery.Client): Initialized BigQuery client.
        master_table (str): Name of the master table.

    Returns:
        tuple: (pincode_mapping_issues, error_logs_m)
            - pincode_mapping_issues (pd.DataFrame): One row per issue with the key, pincode,
              state, city, issue text and '<target_table>_details'. A column-less empty
              DataFrame is returned when the master table lacks pincode/city/state or the
              reference table cannot be loaded.
            - error_logs_m (list of dict): One error-log entry per issue.
    """
    error_logs_m = []

    # Check the local precondition first so no reference query is issued when
    # validation cannot run anyway.
    required_columns = {'pincode', 'city', 'state'}
    if not required_columns.issubset(df_master.columns):
        logging.info(f"DataFrame does not have required columns for pincode mapping validation: {required_columns}")
        return pd.DataFrame(), error_logs_m

    # Read the reference table from Analytics dataset
    try:
        reference_table = "all_india_PO_list"
        reference_dataset = "analytics_data"
        query = f"SELECT pincode, city, state FROM `{PROJECT_ID}.{reference_dataset}.{reference_table}`"
        reference_df = client.query(query).to_dataframe()
        reference_df['pincode'] = reference_df['pincode'].astype(str).str.strip()
        reference_df['city'] = reference_df['city'].astype(str).str.strip().str.lower()
        reference_df['state'] = reference_df['state'].astype(str).str.strip().str.lower()
        logging.info(f"Loaded reference pincode mapping from '{reference_table}' in '{reference_dataset}' dataset.")
    except Exception as e:
        logging.error(f"Failed to load reference pincode mapping: {e}")
        return pd.DataFrame(), error_logs_m

    details_column = f'{target_table}_details'
    # dict.fromkeys keeps the order but drops a repeated label, which would
    # otherwise occur if key_column is itself 'pincode', 'state' or 'city'.
    issue_columns = list(dict.fromkeys(
        [key_column, 'pincode', 'state', 'city', 'issue', details_column]
    ))
    # Issues are collected in a list and turned into a DataFrame once at the
    # end; growing a DataFrame with pd.concat per row re-copies it every time.
    issue_records = []

    # Normalise target keys once and remember the first row position per key.
    target_keys = df_target[target_key].astype(str).str.strip()
    first_target_position = {}
    for position, target_value in enumerate(target_keys):
        first_target_position.setdefault(target_value, position)

    # Iterate over each row in df_master to validate pincode mapping
    for _, row in df_master.iterrows():
        key_value = str(row[key_column]).strip()
        pincode = str(row['pincode']).strip()
        city = str(row['city']).strip().lower()
        state = str(row['state']).strip().lower()

        # Fetch corresponding target row if exists
        position = first_target_position.get(key_value)
        if position is not None:
            target_row = df_target.iloc[position]
            target_pincode = target_row['pincode'] if 'pincode' in target_row and pd.notnull(target_row['pincode']) else "Pincode missing"
            target_state = target_row['state'] if 'state' in target_row and pd.notnull(target_row['state']) else "State missing"
            target_city = target_row['city'] if 'city' in target_row and pd.notnull(target_row['city']) else "City missing"
            target_details = f"Pincode: {target_pincode}, State: {target_state}, City: {target_city}"
        else:
            target_details = f"Key '{key_column}' with value '{key_value}' not present in target table '{target_table}'."

        ref_matches = reference_df[reference_df['pincode'] == pincode]
        if ref_matches.empty:
            # Pincode is unknown to the reference table
            issue = f"Invalid pincode ({pincode})."
        else:
            exact_match = ref_matches[
                (ref_matches['city'] == city) & (ref_matches['state'] == state)
            ]
            if not exact_match.empty:
                continue  # No issue, mapping is correct

            state_matches = ref_matches[ref_matches['state'] == state]
            city_matches = ref_matches[ref_matches['city'] == city]

            if state_matches.empty and city_matches.empty:
                # Both state and city do not match
                expected_entries = ref_matches[['state', 'city']].drop_duplicates()
                expected_states_str = ', '.join(expected_entries['state'].tolist())
                expected_cities_str = ', '.join(expected_entries['city'].tolist())
                issue = f"Pincode {pincode} does not match state '{state}' and city '{city}'. Expected states: {expected_states_str}; Expected cities: {expected_cities_str}."
            elif state_matches.empty:
                # State does not match
                expected_states_str = ', '.join(ref_matches['state'].unique().tolist())
                issue = f"Pincode {pincode} does not match state '{state}'. Expected states: {expected_states_str}."
            elif city_matches.empty:
                # City does not match; list the cities known for the matching state
                expected_cities_str = ', '.join(state_matches['city'].unique().tolist())
                issue = f"Pincode {pincode} does not match city '{city}'. Expected cities: {expected_cities_str}."
            else:
                # State and city each match some reference row, but never the same one
                issue = f"Pincode {pincode} has a mapping inconsistency."

        issue_records.append({
            key_column: key_value,
            'pincode': pincode,
            'state': state,
            'city': city,
            'issue': issue,
            details_column: target_details
        })
        error_logs_m.append({
            'time_stamp': now.strftime('%Y-%m-%d %H:%M:%S'),
            'issue': 'pincode_mapping',
            'error_message': f"{issue}. {target_table} Details: {target_details}",
            'source_table': master_table,
            'target_table': target_table,
            'issue_column': 'pincode',
            'unique_identifier': f'{key_column}: {key_value}'
        })

    pincode_mapping_issues = pd.DataFrame(issue_records, columns=issue_columns)

    logging.info(f"Found {len(pincode_mapping_issues)} pincode mapping issues in master table '{master_table}'.")
    return pincode_mapping_issues, error_logs_m




def compare_tables(client, dataset_name, base_name, master_table, target_table, master_key, target_key):
    """
    Run every configured check between a master table and one target table.

    Both tables are loaded and standardized, the optional 'active_filter' from
    BASE_TABLES is applied to the master table, BIGNUMERIC columns are rendered
    as plain digit strings, and then (when 'perform_checks' is enabled for the
    base table) duplicates, mismatches, nulls, data types, formats and pincode
    mappings are validated. Non-matching keys are always computed. All error
    log entries produced along the way are appended to ERROR_LOG_M.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        dataset_name (str): The name of the dataset.
        base_name (str): The base name of the table.
        master_table (str): Name of the master_hub_ table.
        target_table (str): Name of the target prefixed table.
        master_key (str): The key column in the master table.
        target_key (str): The key column in the target table.

    Returns:
        dict | None: The comparison results, or None when either table is empty
        or the two tables share no columns.
    """
    logging.info(f"Starting comparison for base table '{base_name}': '{master_table}' vs '{target_table}'.")

    # Defaults reported when the optional checks are switched off
    mismatches = []
    null_values_master, null_values_target = [], []
    data_type_issues = pd.DataFrame()
    format_issues_master = pd.DataFrame()
    pincode_mapping_issues = pd.DataFrame()
    duplicates_master = pd.DataFrame()
    duplicates_target = pd.DataFrame()

    # Load both tables, then standardize everything except the key columns
    df_master = load_table_from_bigquery(client, dataset_name, master_table)
    df_target = load_table_from_bigquery(client, dataset_name, target_table)
    df_master = standardize_dataframe(df_master, exclude_columns=[master_key])
    df_target = standardize_dataframe(df_target, exclude_columns=[target_key])

    # Per-table configuration: optional active filter and the checks switch
    table_config = BASE_TABLES.get(base_name, {})
    perform_checks = table_config.get('perform_checks', True)
    active_filter = table_config.get('active_filter')

    if active_filter:
        filter_column = active_filter.get('column')
        filter_value = active_filter.get('value')
        if filter_column and filter_column in df_master.columns:
            rows_before = len(df_master)
            df_master = df_master[df_master[filter_column] == filter_value]
            rows_after = len(df_master)
            logging.info(f"Filtered '{base_name}' master table: {rows_before - rows_after} records excluded based on {filter_column} = {filter_value}.")
        else:
            logging.warning(f"Active filter specified but column '{filter_column}' not found in master table '{master_table}'.")

    if df_master.empty or df_target.empty:
        logging.warning(f"One of the tables '{master_table}' or '{target_table}' is empty. Skipping comparison.")
        return None

    # Render BIGNUMERIC values without a fractional part in both tables
    schema_master = get_table_schema(client, dataset_name, master_table)
    schema_target = get_table_schema(client, dataset_name, target_table)

    def _as_plain_digits(value):
        return format(value, '.0f') if pd.notnull(value) else value

    for frame, schema in ((df_master, schema_master), (df_target, schema_target)):
        for column_name, column_type in schema.items():
            if column_type == 'BIGNUMERIC' and column_name in frame.columns:
                frame[column_name] = frame[column_name].apply(_as_plain_digits)

    if perform_checks:
        # Duplicate keys, master first and then target
        duplicates_master, error_logs_m = find_duplicates(df_master, master_key, master_table)
        ERROR_LOG_M.extend(error_logs_m)
        duplicates_target, error_logs_m = find_duplicates(df_target, target_key, target_table)
        ERROR_LOG_M.extend(error_logs_m)

    if not duplicates_master.empty:
        logging.warning(f"Duplicate keys found in source table '{master_table}'. These will be reported but not used in mismatch comparison.")
    if not duplicates_target.empty:
        logging.warning(f"Duplicate keys found in target table '{target_table}'. These will be reported but not used in mismatch comparison.")

    # Columns shared by both tables; the per-table extras are not used here
    common_columns, _, _ = find_common_and_non_common_columns(df_master, df_target)
    if not common_columns:
        logging.warning(f"No common columns found between '{master_table}' and '{target_table}'. Skipping comparison.")
        return None

    # Non-important columns configured for this base table
    non_imp_columns = Non_imp_columns.get(base_name, [])

    if perform_checks:
        # Value mismatches (duplicate keys are handed over for exclusion)
        mismatches, error_logs_m = find_mismatches(
            df_master,
            df_target,
            common_columns,
            master_key,
            target_key,
            master_table,
            target_table,
            duplicates_master,
            duplicates_target,
            non_imp_columns
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Null cells in either table
        null_values_master, null_values_target, error_logs_m = find_detailed_nulls(
            df_master,
            df_target,
            master_key,
            target_key,
            master_table,
            target_table
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Data types declared by the two schemas
        data_type_issues, error_logs_m = validate_data_types(
            schema_master, schema_target, master_key, master_table, target_table
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Formats in the master table, reported with the target values
        format_issues_master, error_logs_m = validate_formats(
            df_master, df_target, master_key, target_key, target_table, master_table
        )
        ERROR_LOG_M.extend(error_logs_m)

        # Pincode mapping, only when the master table carries all three columns
        if {'pincode', 'city', 'state'}.issubset(df_master.columns):
            pincode_mapping_issues, error_logs_m = validate_pincode_mapping(
                df_master,
                df_target,
                master_key,
                target_key,
                target_table,
                client,
                master_table
            )
            ERROR_LOG_M.extend(error_logs_m)

    # Keys present in only one of the two tables (always computed)
    master_only_keys, target_only_keys, error_logs_m = find_non_matching_keys(
        df_master, df_target, master_key, target_key, duplicates_master, duplicates_target, master_table, target_table
    )
    ERROR_LOG_M.extend(error_logs_m)

    results = {
        'mismatches': mismatches,
        'null_values_master': pd.DataFrame(null_values_master),
        'null_values_target': pd.DataFrame(null_values_target),
        'duplicates_master': duplicates_master,
        'duplicates_target': duplicates_target,
        'data_type_issues': data_type_issues,
        'format_issues_master': format_issues_master,
        'pincode_mapping_issues': pincode_mapping_issues,
        'key_column_master': master_key,
        'key_column_target': target_key,
        'df_master_only_keys': master_only_keys,
        'df_target_only_keys': target_only_keys,
        'table1_name': master_table,
        'table2_name': target_table,
        'df_master': df_master,
        'df_target': df_target
    }

    logging.info(f"Completed comparison for '{master_table}' vs '{target_table}'.")
    return results

def generate_string_schema(df):
    """
    Build a BigQuery schema that declares every column of ``df`` as a nullable STRING.

    Args:
        df (pd.DataFrame): DataFrame whose column labels become the field names.

    Returns:
        list: One SchemaField per column, in the DataFrame's column order.
    """
    string_fields = []
    for column_name in df.columns:
        string_fields.append(SchemaField(column_name, "STRING", mode="NULLABLE"))
    return string_fields


def _upload_dataframe_to_bigquery(client, analytics_dataset, table_name, df):
    """
    Helper function to upload a DataFrame to BigQuery as an all-STRING table.

    The DataFrame is converted to strings, and the destination table
    '<client.project>.<analytics_dataset>.<table_name>' is overwritten
    (WRITE_TRUNCATE). Upload failures are logged and not re-raised.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the Analytics dataset.
        table_name (str): The name of the table to upload.
        df (pd.DataFrame): The DataFrame to upload. Nothing is uploaded when it is empty.

    Returns:
        None
    """
    if df.empty:
        logging.info(f"No data to upload for '{table_name}'. Skipping.")
        return

    # Convert all columns to string type
    df = df.astype(str)

    # Generate BigQuery schema with all fields as STRING
    schema = generate_string_schema(df)

    # Ensure table name doesn't exceed BigQuery's maximum length (1,024 characters).
    # The name is cut without an ellipsis suffix: '.' is not a legal table-name
    # character and would also break the dotted project.dataset.table id below.
    if len(table_name) > 1024:
        original_table_name = table_name
        table_name = table_name[:1024]
        logging.warning(f"Table name truncated from '{original_table_name}' to '{table_name}' due to length constraints.")

    # Define the full table ID
    table_id = f"{client.project}.{analytics_dataset}.{table_name}"

    # Upload the DataFrame to BigQuery
    try:
        job = client.load_table_from_dataframe(
            df,
            table_id,
            job_config=bigquery.LoadJobConfig(
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
                schema=schema  # Using the provided schema with all fields as STRING
            )
        )
        job.result()  # Wait for the job to complete
        logging.info(f"Successfully uploaded '{table_id}' with {len(df)} records.")
    except Exception as e:
        # Best effort: a failed upload is reported but must not abort the run
        logging.error(f"Failed to upload '{table_id}' to BigQuery: {e}")

def upload_comparison_results_to_bigquery(client, analytics_dataset, ERROR_LOG_M):
    """
    Upload the collected error log to the 'error_logs' table of the Analytics dataset.

    Args:
        client (bigquery.Client): Initialized BigQuery client.
        analytics_dataset (str): The name of the Analytics dataset.
        ERROR_LOG_M (pd.DataFrame | list | None): The error log data. A list is
            converted to a DataFrame first; None is ignored; any other type is
            skipped with a warning.

    Returns:
        None
    """
    # Nothing was collected, so there is nothing to upload
    if ERROR_LOG_M is None:
        return

    if isinstance(ERROR_LOG_M, pd.DataFrame):
        error_frame = ERROR_LOG_M
    elif isinstance(ERROR_LOG_M, list):
        error_frame = pd.DataFrame(ERROR_LOG_M)
    else:
        logging.warning("Unsupported data type for ERROR_LOG. Skipping upload.")
        return

    _upload_dataframe_to_bigquery(client, analytics_dataset, "error_logs", error_frame)
    



def main():
    """
    Main function to orchestrate the comparison of multiple base tables against their master_hub_ counterparts.

    For every base table returned by find_common_tables_with_master_hub that has
    an entry in BASE_TABLES, each configured target prefix is compared with the
    master table through compare_tables, and a summary of that comparison is
    sent with send_slack_alert. After all comparisons, ERROR_LOG_M is uploaded
    to the 'analytics_data' dataset.

    Any unexpected exception is logged with its traceback, reported through
    send_slack_alert, and the process exits with status 1.
    """
    try:
        # Initialize BigQuery client
        try:
            client = get_bigquery_client(PROJECT_ID)
        except Exception:
            # Keep the traceback so the cause of the failed start is not lost
            logging.error("Exiting due to BigQuery client initialization failure.", exc_info=True)
            return

        # Find common tables with 'master_hub_' and other prefixes, passing BASE_TABLES
        common_tables = find_common_tables_with_master_hub(client, DATASET_ID, PREFIXES, BASE_TABLES)

        if not common_tables:
            logging.info("No common tables found with 'master_hub_' and the specified prefixes.")
            return

        # Iterate over each base table and perform comparisons
        for base_name, tables in common_tables.items():
            base_table_info = BASE_TABLES.get(base_name)
            if not base_table_info:
                logging.warning(f"No configuration found for base table '{base_name}'. Skipping.")
                continue

            master_key = base_table_info.get('master_key')
            target_tables = base_table_info.get('targets', {})

            master_table = tables.get('master_hub_')
            if not master_table:
                logging.warning(f"Master table 'master_hub_{base_name}' not found. Skipping.")
                continue

            # Iterate through each prefix and its corresponding target_key
            for prefix, target_key in target_tables.items():
                target_table = tables.get(prefix)
                if not target_table:
                    logging.warning(f"Target table with prefix '{prefix}' for base table '{base_name}' not found. Skipping.")
                    continue

                comparison_result = compare_tables(
                    client,
                    DATASET_ID,
                    base_name,
                    master_table,
                    target_table,
                    master_key,
                    target_key  # Pass the correct target_key per prefix
                )
                # Results are summarised immediately and not retained: each one
                # carries both full DataFrames, so keeping them would only hold
                # memory for data nothing reads again.
                if not comparison_result:
                    continue

                # Prepare and send a separate Slack message for each comparison
                total_mismatches = len(comparison_result['mismatches'])
                total_nulls_master = len(comparison_result['null_values_master'])
                total_nulls_target = len(comparison_result['null_values_target'])
                total_dup_master = len(comparison_result['duplicates_master'])
                total_dup_target = len(comparison_result['duplicates_target'])
                total_data_type_issues = len(comparison_result['data_type_issues'])
                total_format_issues_master = len(comparison_result['format_issues_master'])
                total_pincode_issues = len(comparison_result['pincode_mapping_issues'])
                total_non_matching_source = len(comparison_result.get('df_master_only_keys', []))
                total_non_matching_target = len(comparison_result.get('df_target_only_keys', []))

                message = (
                    f"✅ *Comparison Report Generated for `{base_name}`*\n"
                    f"*Tables Compared: `{comparison_result['table1_name']}` vs `{comparison_result['table2_name']}`*\n"
                    f"- *Total Mismatches between values of same column name of both tables : `{total_mismatches}`*\n"
                    f"- *Total Null Values in `{comparison_result['table1_name']}`: `{total_nulls_master}`*\n"
                    f"- *Total Null Values in `{comparison_result['table2_name']}`: `{total_nulls_target}`*\n"
                    f"- *Duplicate `{master_key}` in `{comparison_result['table1_name']}`: `{total_dup_master}`*\n"
                    f"- *Duplicate `{target_key}` in `{comparison_result['table2_name']}`: `{total_dup_target}`*\n"
                    f"- *Total Data Type Issues(mismatch between datatype in columns with same name of both tables): `{total_data_type_issues}`*\n"
                    f"- *Total Format/Value Issues(gstin, email, pincode) in `{comparison_result['table1_name']}`: `{total_format_issues_master}`*\n"
                    f"- *Total Pincode Mapping Issues in `{comparison_result['table1_name']}`: `{total_pincode_issues}`*\n"
                     "- *Non-Matching Keys*:\n"
                    f"--*`{master_key}` only in `{comparison_result['table1_name']}` and not in `{comparison_result['table2_name']}`:`{total_non_matching_source}`,*\n"
                    f"--*`{target_key}` only in `{comparison_result['table2_name']}` and not in `{comparison_result['table1_name']}`:`{total_non_matching_target}`*"
                )

                send_slack_alert(message)

        # Upload error logs to BigQuery after all comparisons
        upload_comparison_results_to_bigquery(
            client,
            'analytics_data',
            ERROR_LOG_M
        )

        logging.info("All comparisons completed.")
    except Exception as e:
        # Capture the full traceback
        tb = traceback.format_exc()
        logging.error("An unexpected error occurred in the main process.", exc_info=True)

        # Prepare a detailed error message for Slack
        error_message = (
            f"❌ *Comparison Process Failed*\n"
            f"*Error:* {str(e)}\n"
            f"*Traceback:*\n```{tb}```"
        )
        send_slack_alert(error_message)

        # Exit the script with a non-zero status so the failure is visible to the caller
        sys.exit(1)

# Entry point: run the full comparison pipeline as soon as this code is executed.
main()