## Ensuring Consistency in Multi-source Data Integration

**Description**: Validate the integration of two datasets, `products_A.csv` and `products_B.csv`, and ensure that the product `category` information is consistent between them.

In [1]:
import pandas as pd

# -------------------------------
# 1) Simulate / Load your data
# -------------------------------
# Each source is written row by row as (product_id, product_name, category)
# so that a single product can be read across one line.

# Source A
products_A = pd.DataFrame(
    [
        (101, 'Widget', 'Tools'),
        (102, 'Gadget', 'Electronics'),
        (103, 'Doodad', 'Tools'),
        (104, 'Thingamajig', 'Gadgets'),
    ],
    columns=['product_id', 'product_name', 'category'],
)

# Source B -- same products as A; only the category of 103 differs
products_B = pd.DataFrame(
    [
        (101, 'Widget', 'Tools'),
        (102, 'Gadget', 'Electronics'),
        (103, 'Doodad', 'Hardware'),  # mismatch at 103
        (104, 'Thingamajig', 'Gadgets'),
    ],
    columns=['product_id', 'product_name', 'category'],
)

# To run against real CSV files, replace the two frames above with:
# products_A = pd.read_csv('products_A.csv')
# products_B = pd.read_csv('products_B.csv')


# -------------------------------
# 2) Merge on product_id
# -------------------------------

# The inner join below keeps only product_ids found in BOTH sources, so an ID
# that exists in just one of them would otherwise disappear without a trace.
# Report such IDs first; nothing is printed when the two sources line up.
only_in_A = products_A.loc[
    ~products_A['product_id'].isin(products_B['product_id']), 'product_id'
].tolist()
only_in_B = products_B.loc[
    ~products_B['product_id'].isin(products_A['product_id']), 'product_id'
].tolist()

if only_in_A:
    print(f"WARNING: product_id(s) only in source A (not compared): {only_in_A}")
if only_in_B:
    print(f"WARNING: product_id(s) only in source B (not compared): {only_in_B}")

# Non-key columns shared by both frames get the _A / _B suffixes
# (product_name_A, category_A, product_name_B, category_B).
merged = pd.merge(
    products_A,
    products_B,
    on='product_id',
    suffixes=('_A', '_B'),
    how='inner',            # unmatched IDs are reported above rather than merged in
    validate='one_to_one'   # raises MergeError if product_id repeats on either side
)


# -------------------------------
# 3) Check category consistency
# -------------------------------

category_a = merged['category_A']
category_b = merged['category_B']

# A plain `==` reports NaN != NaN, which would flag a product whose category is
# missing in BOTH sources as a conflict even though the sources agree.
# Treat "missing on both sides" as consistent; missing on one side only is
# still a mismatch.
both_missing = category_a.isna() & category_b.isna()

# fillna/astype keep the flag a plain bool column even if the inputs use a
# nullable dtype, where the comparison itself can yield <NA>.
same_value = category_a.eq(category_b).fillna(False).astype(bool)

merged['category_match'] = same_value | both_missing

# Extract the inconsistent rows (name taken from source A for display)
inconsistent = merged.loc[~merged['category_match'],
                          ['product_id', 'product_name_A', 'category_A', 'category_B']]

# Rename for clarity
inconsistent = inconsistent.rename(columns={
    'product_name_A': 'product_name',
    'category_A': 'category_source_A',
    'category_B': 'category_source_B'
})


# -------------------------------
# 4) Summary statistics
# -------------------------------

total_records      = len(merged)
matching_records   = int(merged['category_match'].sum())
mismatch_records   = total_records - matching_records

# If the two sources share no product_id, `merged` is empty and the ratio would
# divide by zero. Keep the rate defined (NaN) and report it as n/a instead.
consistency_rate   = (
    matching_records / total_records * 100 if total_records else float('nan')
)

print(f"Total products compared : {total_records}")
print(f"Matching categories     : {matching_records}")
print(f"Mismatched categories   : {mismatch_records}")
if total_records:
    print(f"Consistency rate        : {consistency_rate:.1f}%")
else:
    print("Consistency rate        : n/a (no products to compare)")

if not inconsistent.empty:
    print("\n🚩 Inconsistent Category Entries:")
    print(inconsistent.to_string(index=False))
elif total_records:
    print("\n✅ All categories are consistent.")
else:
    # Zero mismatches out of zero comparisons is not evidence of consistency.
    print("\nNo products were compared, so category consistency could not be checked.")


# -------------------------------
# 5) Export mismatches for review
# -------------------------------

# Only write the report when there is at least one mismatching row;
# a clean run leaves no file behind.
has_mismatches = not inconsistent.empty

if has_mismatches:
    inconsistent.to_csv(
        'inconsistent_categories_report.csv',
        index=False,
    )
    print(
        "\nThe mismatches have been saved to 'inconsistent_categories_report.csv' for manual review."
    )


Total products compared : 4
Matching categories     : 3
Mismatched categories   : 1
Consistency rate        : 75.0%

🚩 Inconsistent Category Entries:
 product_id product_name category_source_A category_source_B
        103       Doodad             Tools          Hardware

The mismatches have been saved to 'inconsistent_categories_report.csv' for manual review.
