## Ensuring Consistency in Multi-source Data Integration

**Description**: Validate the integration of two datasets, `products_A.csv` and `products_B.csv`, by ensuring that each product's "category" information is consistent between them.

In [1]:
import pandas as pd

def validate_category_consistency(file_a, file_b, product_id_col="product_id", category_col="category"):
    """
    Report products whose category differs between two CSV datasets.

    Both files are loaded, inner-joined on the product ID column, and the
    category value from each side is compared row by row. Products that
    appear in only one of the files are dropped by the inner join and are
    therefore never reported.

    Args:
        file_a (str): Path to the CSV file for dataset A.
        file_b (str): Path to the CSV file for dataset B.
        product_id_col (str, optional): Name of the product ID column. Defaults to "product_id".
        category_col (str, optional): Name of the category column. Defaults to "category".

    Returns:
        pandas.DataFrame: The merged rows whose categories disagree. Category
            columns are suffixed with '_A' and '_B'. The frame is empty when
            no inconsistencies are found. A category that is missing on
            exactly one side counts as inconsistent; a category that is
            missing on both sides counts as consistent.
        None: If a file cannot be read or a required column is missing
            (an error message is printed in that case).
    """
    try:
        df_a = pd.read_csv(file_a)
        df_b = pd.read_csv(file_b)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return None
    except Exception as e:
        # Any other read/parse failure (empty file, malformed CSV, ...) is
        # reported to the caller through the documented None return.
        print(f"Error reading CSV file: {e}")
        return None

    # Both inputs must expose the join key and the compared column.
    for label, frame in (("A", df_a), ("B", df_b)):
        if product_id_col not in frame.columns or category_col not in frame.columns:
            print(f"Error: Missing required columns in file {label}. Expected '{product_id_col}' and '{category_col}'.")
            return None

    # Inner join: only products present in both datasets are compared.
    merged_df = pd.merge(df_a, df_b, on=product_id_col, suffixes=('_A', '_B'))

    categories_a = merged_df[f'{category_col}_A']
    categories_b = merged_df[f'{category_col}_B']

    # NaN never compares equal to NaN, so a plain `!=` would flag products
    # whose category is blank in *both* files. Treat that case as consistent.
    both_missing = categories_a.isna() & categories_b.isna()
    consistent = (categories_a == categories_b) | both_missing

    return merged_df[~consistent]



def main():
    """
    Run the category consistency validation on the demo files and print the results.

    Small demonstration CSV files are created when they do not already exist;
    files already present at the configured paths are left untouched so real
    data is never overwritten.
    """
    # Provide the paths to your CSV files
    file_a = 'products_A.csv'  # Replace with your actual file path
    file_b = 'products_B.csv'  # Replace with your actual file path

    # Dummy CSV content used only for demonstration.
    demo_files = (
        (file_a, "product_id,category,name\n1,Electronics,Product1\n2,Clothing,Product2\n3,Home Goods,Product3\n4,Electronics,Product4"),
        (file_b, "product_id,category,description\n1,Electronics,Description1\n2,Apparel,Description2\n3,Home Goods,Description3\n4,Electronics,Description4"),
    )
    for path, content in demo_files:
        try:
            # Mode 'x' (exclusive create) fails if the file exists, which is
            # what makes the FileExistsError handler below reachable.
            with open(path, 'x') as f:
                f.write(content)
        except FileExistsError:
            # Deliberate: keep the user's existing file instead of clobbering it.
            pass

    # Validate category consistency
    inconsistent_categories_df = validate_category_consistency(file_a, file_b)

    # Print the results
    if inconsistent_categories_df is None:
        print("Error occurred during category consistency validation. Please check the file paths and column names.")
    elif inconsistent_categories_df.empty:
        print("No products with inconsistent category information found.")
    else:
        print("Products with inconsistent category information:")
        print(inconsistent_categories_df.to_string(index=False))



# Run the demonstration only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Products with inconsistent category information:
 product_id category_A     name category_B  description
          2   Clothing Product2    Apparel Description2
