## Find Conflicting Values Across Datasets

**Description**: You have two datasets: `crm_customers.csv` and `erp_customers.csv`. Find the customers that appear in both datasets but whose "email" values conflict.

In [1]:
import pandas as pd

def find_conflicting_emails(crm_customers_path, erp_customers_path):
    """
    Find customers whose 'email' differs between the CRM and ERP datasets.

    Both CSV files must have 'customer_id' and 'email' columns. Only customers
    present in both files are compared (inner join on 'customer_id'). Rows
    whose 'customer_id' is missing are ignored, because they do not identify
    a customer. Emails are compared exactly as read from the files: two
    missing emails count as matching, while a missing email on one side and a
    value on the other counts as a conflict.

    Status and error messages are printed to stdout.

    Args:
        crm_customers_path (str): The file path to the CRM customer data CSV.
        erp_customers_path (str): The file path to the ERP customer data CSV.

    Returns:
        pandas.DataFrame: A DataFrame with the columns 'customer_id',
                          'email_crm' and 'email_erp', holding one row per
                          conflicting CRM/ERP pairing. It is empty (but keeps
                          the same three columns) if no conflicts are found,
                          if the files cannot be read, or if required columns
                          are missing.
    """
    key_column = 'customer_id'
    required_columns = [key_column, 'email']
    # Every return path uses this schema so callers can index the result by
    # column name without first checking whether an error occurred.
    result_columns = [key_column, 'email_crm', 'email_erp']

    try:
        crm_df = pd.read_csv(crm_customers_path)
        erp_df = pd.read_csv(erp_customers_path)
    except FileNotFoundError:
        print("Error: One or both customer CSV files not found. Please check paths.")
        return pd.DataFrame(columns=result_columns)
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported and turned
        # into an empty result, as documented above.
        print(f"Error reading CSV files: {e}")
        return pd.DataFrame(columns=result_columns)

    # Ensure required columns exist in both DataFrames (CRM is checked first).
    for path, df in ((crm_customers_path, crm_df), (erp_customers_path, erp_df)):
        if not all(col in df.columns for col in required_columns):
            print(f"Error: '{path}' must contain 'customer_id' and 'email' columns.")
            return pd.DataFrame(columns=result_columns)

    # Keep only the columns being compared, and drop rows without a
    # customer_id: pandas matches null join keys against each other, which
    # would pair up unrelated records and report them as conflicts.
    crm_emails = crm_df[required_columns].dropna(subset=[key_column])
    erp_emails = erp_df[required_columns].dropna(subset=[key_column])

    # Inner merge: only customers present in both systems are compared.
    merged_df = pd.merge(
        crm_emails,
        erp_emails,
        on=key_column,
        how='inner',
        suffixes=('_crm', '_erp')
    )

    # NaN != NaN evaluates to True, so rows where both emails are missing
    # must be excluded explicitly to treat them as matching.
    at_least_one_present = merged_df['email_crm'].notna() | merged_df['email_erp'].notna()
    emails_differ = merged_df['email_crm'] != merged_df['email_erp']
    conflicting_emails_df = merged_df[at_least_one_present & emails_differ]

    if conflicting_emails_df.empty:
        print("No conflicting email addresses found for common customers.")
    else:
        print("Conflicting email addresses found:")

    return conflicting_emails_df[result_columns]

# --- Example Usage (writes dummy CSV files into a temporary directory) ---
if __name__ == "__main__":
    import os
    import tempfile

    def _write_csv(directory, filename, data):
        """Write `data` as a CSV file named `filename` in `directory`; return its path."""
        path = os.path.join(directory, filename)
        pd.DataFrame(data).to_csv(path, index=False)
        return path

    # The fixtures live in a temporary directory so the demo never overwrites
    # or deletes real 'crm_customers.csv' / 'erp_customers.csv' files in the
    # working directory, and so cleanup happens even if a step raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Test Case 1: C002 and C003 are in both systems with different emails.
        crm_path = _write_csv(tmp_dir, 'crm_customers.csv', {
            'customer_id': ['C001', 'C002', 'C003', 'C004', 'C005'],
            'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
            'email': ['alice@crm.com', 'bob@crm.com', 'charlie@crm.com', 'david@crm.com', 'eve@crm.com'],
        })
        erp_path = _write_csv(tmp_dir, 'erp_customers.csv', {
            'customer_id': ['C001', 'C002', 'C003', 'C006', 'C007'],
            'name': ['Alice', 'Robert', 'Charles', 'Frank', 'Grace'],
            'email': ['alice@crm.com', 'bob@erp.com', 'charlie@erp.com', 'frank@erp.com', 'grace@erp.com'],
        })

        print("--- Finding Conflicting Values Across Datasets ---")

        conflicts_df = find_conflicting_emails(crm_path, erp_path)
        print(conflicts_df)

        print("\n--- Test Case 2: No conflicts (common customers have same email) ---")
        crm_no_conflict_path = _write_csv(tmp_dir, 'crm_no_conflict.csv', {
            'customer_id': ['D001', 'D002'],
            'name': ['Xavier', 'Yvonne'],
            'email': ['xavier@mail.com', 'yvonne@mail.com'],
        })
        erp_no_conflict_path = _write_csv(tmp_dir, 'erp_no_conflict.csv', {
            'customer_id': ['D001', 'D002', 'D003'],
            'name': ['Xavier', 'Yvonne', 'Zoe'],
            'email': ['xavier@mail.com', 'yvonne@mail.com', 'zoe@mail.com'],
        })

        no_conflicts_df = find_conflicting_emails(crm_no_conflict_path, erp_no_conflict_path)
        print(no_conflicts_df)

        print("\n--- Test Case 3: Handling missing emails (one side has NaN, other has value) ---")
        # pd.NA is written to the CSV as an empty field, i.e. a missing email.
        crm_missing_email_path = _write_csv(tmp_dir, 'crm_missing_email.csv', {
            'customer_id': ['E001', 'E002'],
            'email': ['e001@crm.com', pd.NA],
        })
        erp_missing_email_path = _write_csv(tmp_dir, 'erp_missing_email.csv', {
            'customer_id': ['E001', 'E002'],
            'email': ['e001@erp.com', 'e002@erp.com'],
        })

        missing_email_conflicts = find_conflicting_emails(crm_missing_email_path, erp_missing_email_path)
        print(missing_email_conflicts)

--- Finding Conflicting Values Across Datasets ---
Conflicting email addresses found:
  customer_id        email_crm        email_erp
1        C002      bob@crm.com      bob@erp.com
2        C003  charlie@crm.com  charlie@erp.com

--- Test Case 2: No conflicts (common customers have same email) ---
No conflicting email addresses found for common customers.
Empty DataFrame
Columns: [customer_id, email_crm, email_erp]
Index: []

--- Test Case 3: Handling missing emails (one side has NaN, other has value) ---
Conflicting email addresses found:
  customer_id     email_crm     email_erp
0        E001  e001@crm.com  e001@erp.com
1        E002           NaN  e002@erp.com
