## Ensuring Consistency in Multi-source Data Integration

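**Description**: Validate the integration of two datasets, `products_A.csv` and `products_B.csv`, by checking that every product has a consistent `category` value in both sources. The sample CSV files are generated by the data-creation cell at the end of this notebook, so run that cell first on a fresh kernel.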

In [2]:
# Write your code from here

import pandas as pd

def standardize_category(categories):
    """Return category labels lowercased with surrounding whitespace removed.

    Parameters
    ----------
    categories : pandas.Series
        Raw category labels (strings). Missing values stay missing.

    Returns
    -------
    pandas.Series
        The normalized labels, aligned with the input index.
    """
    return categories.str.lower().str.strip()


def find_category_inconsistencies(frame_a, frame_b, key='product_id'):
    """Compare the standardized categories of two product tables.

    Parameters
    ----------
    frame_a, frame_b : pandas.DataFrame
        Tables that each contain the ``key`` column and a ``category_std``
        column (see ``standardize_category``).
    key : str
        Name of the join column shared by both tables.

    Returns
    -------
    merged : pandas.DataFrame
        One row per key found in *either* table, with columns ``key``,
        ``category_std_A``, ``category_std_B`` and pandas' ``_merge``
        indicator (``'both'``, ``'left_only'`` = only in ``frame_a``,
        ``'right_only'`` = only in ``frame_b``).
    inconsistent : pandas.DataFrame
        The rows of ``merged`` (without ``_merge``) where the product is
        missing from one table or the two categories differ.

    Notes
    -----
    Duplicate keys are not rejected; ``pd.merge`` yields one row per pairing.
    The outer join orders rows by key rather than by ``frame_a``'s row order.
    """
    # An outer join keeps products that exist in only one source; an inner
    # join would silently drop them and hide that integration problem.
    merged = pd.merge(
        frame_a[[key, 'category_std']],
        frame_b[[key, 'category_std']],
        on=key,
        how='outer',
        suffixes=('_A', '_B'),
        indicator=True,
    )

    cat_a = merged['category_std_A']
    cat_b = merged['category_std_B']

    # NaN never compares equal to NaN, so a category that is missing in both
    # sources must be treated as agreeing explicitly.
    same_category = (cat_a == cat_b) | (cat_a.isna() & cat_b.isna())
    in_both = merged['_merge'] == 'both'

    inconsistent = merged.loc[~(in_both & same_category)].drop(columns='_merge')
    return merged, inconsistent


# `__name__` is "__main__" inside a notebook kernel, so this block runs as a
# normal cell while the helpers above stay importable without the CSV files.
if __name__ == "__main__":
    # Load datasets
    df_A = pd.read_csv('products_A.csv')
    df_B = pd.read_csv('products_B.csv')

    # Assume the key is 'product_id' and category columns are named 'category' in both
    key = 'product_id'

    # Standardize category names: lowercase and strip whitespace
    df_A['category_std'] = standardize_category(df_A['category'])
    df_B['category_std'] = standardize_category(df_B['category'])

    # Merge datasets on the key and identify inconsistencies
    merged_df, inconsistent = find_category_inconsistencies(df_A, df_B, key)

    # Report products that could not be matched at all (only printed when any exist)
    only_in_A = merged_df.loc[merged_df['_merge'] == 'left_only', key].tolist()
    only_in_B = merged_df.loc[merged_df['_merge'] == 'right_only', key].tolist()
    if only_in_A:
        print(f"Products missing from products_B.csv: {only_in_A}")
    if only_in_B:
        print(f"Products missing from products_A.csv: {only_in_B}")

    # Output inconsistencies
    print("Inconsistent category entries:")
    print(inconsistent)

    # Optionally save the report
    inconsistent.to_csv('category_inconsistencies_report.csv', index=False)


Inconsistent category entries:
   product_id category_std_A category_std_B
2         103          audio  audio devices


In [1]:
import pandas as pd

# Catalogue A: five products with their categories as recorded by source A.
data_A = {
    'product_id': list(range(101, 106)),
    'product_name': [
        'Laptop',
        'Smartphone',
        'Headphones',
        'Monitor',
        'Keyboard',
    ],
    'category': [
        'Electronics',
        'Electronics',
        'Audio',
        'Electronics',
        'Computer Accessories',
    ],
}

# Catalogue B: the same products, but with different casing on two labels
# and a different category name for the headphones.
data_B = {
    'product_id': list(range(101, 106)),
    'product_name': [
        'Laptop',
        'Smartphone',
        'Headphones',
        'Monitor',
        'Keyboard',
    ],
    'category': [
        'electronics',
        'Electronics',
        'Audio Devices',
        'electronics',
        'Computer Accessories',
    ],
}

# Turn both catalogues into DataFrames.
df_A, df_B = pd.DataFrame(data_A), pd.DataFrame(data_B)

# Persist each frame next to the notebook, without the index column.
for frame, csv_path in ((df_A, 'products_A.csv'), (df_B, 'products_B.csv')):
    frame.to_csv(csv_path, index=False)

print("Datasets created: products_A.csv and products_B.csv")


Datasets created: products_A.csv and products_B.csv
