## Detect Schema Mismatches in Data Pipelines
**Objective**: Identify and resolve schema mismatches that commonly occur in data pipelines.

**Task**: Column Name Mismatch

**Steps**:
1. Load the source DataFrame with the following schema:
    - id : Integer
    - name : String
    - age : Integer
2. Load the target DataFrame with the following schema:
    - id : Integer
    - fullname : String
    - age : Integer
3. Use a schema comparison tool or write a simple function to detect mismatches in column names.
4. Resolve the mismatch by renaming the `fullname` column in the target DataFrame to `name`.

In [1]:
import pandas as pd

# Sample data: two rows per frame, given row-wise with explicit column order.
# The source frame carries the canonical column names.
df_source = pd.DataFrame(
    [(1, 'Alice', 25), (2, 'Bob', 30)],
    columns=['id', 'name', 'age'],
)

# The target frame uses 'fullname' where the source uses 'name'.
df_target = pd.DataFrame(
    [(1, 'Alice A', 25), (2, 'Bob B', 30)],
    columns=['id', 'fullname', 'age'],
)

# Expected schema: column name -> dtype string, compared against str(dtype).
expected_schema = dict(id='int64', name='object', age='int64')

# Function to check if dataframe is empty
def validate_non_empty(df, name="DataFrame"):
    """Raise ``ValueError`` when *df* reports itself as empty.

    Args:
        df: Object exposing an ``empty`` attribute (a pandas DataFrame here).
        name: Label inserted into the error message.

    Raises:
        ValueError: If ``df.empty`` is truthy.
    """
    has_data = not df.empty
    if has_data:
        return
    raise ValueError(f"❌ {name} is empty.")

# Function to detect column mismatches
def detect_column_mismatches(df1, df2):
    """Compare the column names of two DataFrames.

    Args:
        df1: Source DataFrame (the reference).
        df2: Target DataFrame.

    Returns:
        A ``(missing_in_target, extra_in_target)`` tuple of sets: column
        names present only in *df1*, and column names present only in *df2*.
    """
    reference = set(df1.columns)
    candidate = set(df2.columns)
    return reference.difference(candidate), candidate.difference(reference)

# Function to reconcile column name mismatch
def reconcile_columns(df, rename_map):
    """Rename columns of *df* in place according to *rename_map*.

    Every source column is checked before anything is renamed, so a failing
    call leaves *df* untouched. The renames are then applied in a single
    step against the original column names, which lets a map such as
    ``{'a': 'b', 'b': 'a'}`` swap two columns.

    Args:
        df: DataFrame to modify in place.
        rename_map: Mapping of existing column name -> new column name.

    Raises:
        KeyError: If any key of *rename_map* is not a column of *df*.
    """
    # Validate first: renaming one column at a time would leave the frame
    # half-renamed if a later key turned out to be missing.
    for old_col in rename_map:
        if old_col not in df.columns:
            raise KeyError(f"❌ Column '{old_col}' not found in DataFrame. Cannot rename.")

    if rename_map:
        df.rename(columns=dict(rename_map), inplace=True)

# Function to validate column data types
def validate_schema(df, expected_schema):
    """Check that *df* has every expected column with the expected dtype.

    Args:
        df: DataFrame to inspect.
        expected_schema: Mapping of column name -> dtype string; each value
            is compared against ``str(df[col].dtype)``.

    Raises:
        ValueError: If an expected column is absent from *df*.
        TypeError: If a column's dtype string differs from the expected one.
    """
    for column, wanted in expected_schema.items():
        column_present = column in df.columns
        if not column_present:
            raise ValueError(f"❌ Column '{column}' missing from DataFrame.")

        found = str(df[column].dtype)
        if found == wanted:
            continue
        raise TypeError(f"❌ Data type mismatch for column '{column}': expected {wanted}, got {found}")

# ---- Execution ----

try:
    # 1. Validate non-empty DataFrames
    validate_non_empty(df_source, "Source")
    validate_non_empty(df_target, "Target")

    # 2. Detect column mismatches
    missing, extra = detect_column_mismatches(df_source, df_target)
    print(f"🔍 Missing in Target: {missing}")
    print(f"🔍 Extra in Target: {extra}")

    # 3. Rename only when the mismatch is actually present; an unconditional
    #    rename would raise KeyError on a target that already uses 'name'.
    if 'name' in missing and 'fullname' in extra:
        reconcile_columns(df_target, {'fullname': 'name'})

    # 4. Validate updated schema
    validate_schema(df_target, expected_schema)

    print("✅ Schema reconciliation completed successfully.")
except (KeyError, TypeError, ValueError) as e:
    # Only the errors raised by the helpers above are reported here; anything
    # else is unexpected and should surface with its traceback.
    # str(KeyError) is the repr of its argument (extra quotes), so print the
    # raw message instead.
    print(e.args[0] if e.args else e)

🔍 Missing in Target: {'name'}
🔍 Extra in Target: {'fullname'}
✅ Schema reconciliation completed successfully.
