## Ensuring Consistency in Multi-source Data Integration

**Description**: Validate the integration of two datasets, `products_A.csv` and `products_B.csv`, and ensure that the product "category" information is consistent between them.

In [1]:
# Write your code from here
# Write your code from here
import pandas as pd

def validate_category_consistency(file_a, file_b, join_column='product_id'):
    """
    Report products whose "category" differs between two CSV datasets.

    Both files are read with pandas and inner-joined on ``join_column``, so
    only keys present in both files are compared. A row is reported when the
    two category values differ. A category that is missing (NaN) in both
    files is treated as consistent; a category missing in only one file is
    reported as a conflict.

    Args:
        file_a (str): Path to the CSV file for dataset A.
        file_b (str): Path to the CSV file for dataset B.
        join_column (str, optional): The column to join the two datasets on.
                                     Defaults to 'product_id'.

    Returns:
        pandas.DataFrame or None:
            - None if either file cannot be found, or if ``join_column`` or
              'category' is missing from either file.
            - An empty DataFrame if the files share no join keys or if no
              conflicting categories are found.
            - Otherwise a DataFrame with the columns ``join_column``,
              'category_A' and 'category_B', one row per conflicting match.
    """
    try:
        df_a = pd.read_csv(file_a)
        df_b = pd.read_csv(file_b)
    except FileNotFoundError as e:
        print(f"Error: One or both files not found: {e}")
        return None

    # Check that the required columns exist in both DataFrames.
    for path, frame in ((file_a, df_a), (file_b, df_b)):
        if join_column not in frame.columns or 'category' not in frame.columns:
            print(f"Error: '{join_column}' or 'category' column missing in {path}.")
            return None

    # Merge only the columns that are compared, so unrelated columns in the
    # inputs (e.g. one already named 'category_A') cannot collide with the
    # suffixed category columns.
    merged_df = pd.merge(
        df_a[[join_column, 'category']],
        df_b[[join_column, 'category']],
        on=join_column,
        suffixes=('_A', '_B'),
    )
    if merged_df.empty:
        print(f"No matching entries found between {file_a} and {file_b} based on '{join_column}'.")
        return pd.DataFrame()

    category_a = merged_df['category_A']
    category_b = merged_df['category_B']
    # NaN != NaN evaluates to True, so a plain inequality would flag products
    # whose category is missing in both files; mask those rows out.
    both_missing = category_a.isna() & category_b.isna()
    inconsistent_categories_df = merged_df[category_a.ne(category_b) & ~both_missing]

    if inconsistent_categories_df.empty:
        print("No inconsistencies found in product category information across the datasets.")
        return pd.DataFrame()

    print("Products with inconsistent category information:")
    return inconsistent_categories_df[[join_column, 'category_A', 'category_B']]
# Sample catalogue A: ids 1-5, with a name and a category per product.
data_a = {
    'product_id': [1, 2, 3, 4, 5],
    'name_A': [
        'Product 1A',
        'Product 2A',
        'Product 3A',
        'Product 4A',
        'Product 5A',
    ],
    'category': ['Electronics', 'Books', 'Electronics', 'Clothing', 'Books'],
}
# Sample catalogue B: id 6 replaces id 4, and id 3 carries a different category.
data_b = {
    'product_id': [1, 2, 3, 6, 5],
    'description_B': [
        'Desc 1B',
        'Desc 2B',
        'Desc 3B',
        'Desc 6B',
        'Desc 5B',
    ],
    'category': ['Electronics', 'Books', 'Appliances', 'Clothing', 'Books'],
}

df_a, df_b = pd.DataFrame(data_a), pd.DataFrame(data_b)

# Persist both samples as CSV so the validator can load them from disk.
for frame, path in ((df_a, 'products_A.csv'), (df_b, 'products_B.csv')):
    frame.to_csv(path, index=False)

# Validate category consistency; None signals a missing file or column.
inconsistent_report = validate_category_consistency('products_A.csv', 'products_B.csv')

if inconsistent_report is not None:
    print(inconsistent_report)


Products with inconsistent category information:
   product_id   category_A  category_B
2           3  Electronics  Appliances
