## Detect Schema Mismatches in Data Pipelines
**Objective**: Identify and resolve schema mismatches that commonly occur in data pipelines.

**Task**: Missing Column

1. Load the source DataFrame with the following schema:
    - id : Integer
    - email : String
    - signup_date : Date
2. Load the target DataFrame with the following schema:
    - id : Integer
    - email : String
3. Implement a check to identify any columns that are present in the source DataFrame but missing in the target.
4. Add the missing `signup_date` column to the target DataFrame.

In [1]:
import pandas as pd
import numpy as np

# --- Step 1: Load Source and Target DataFrames ---
# Source frame: carries the full schema (id, email, signup_date).
# Rows are given as records; the date strings are parsed into datetimes below.
df_source = pd.DataFrame(
    data=[
        (1, 'a@example.com', '2023-01-01'),
        (2, 'b@example.com', '2023-01-02'),
        (3, 'c@example.com', '2023-01-03'),
    ],
    columns=['id', 'email', 'signup_date'],
)
df_source['signup_date'] = pd.to_datetime(df_source['signup_date'])

# Target frame: same rows, but without the signup_date column.
df_target = pd.DataFrame(
    data=[
        (1, 'a@example.com'),
        (2, 'b@example.com'),
        (3, 'c@example.com'),
    ],
    columns=['id', 'email'],
)

# --- Step 2: Schema Comparison Function ---
def get_missing_columns(source_df, target_df):
    """List the source column labels that the target does not have.

    The labels are returned in the order they appear in ``source_df``.
    """
    target_labels = target_df.columns
    absent = []
    for label in source_df.columns:
        if label in target_labels:
            continue
        absent.append(label)
    return absent

def get_mismatched_dtypes(source_df, target_df):
    """Report dtype disagreements for columns present in both frames.

    Returns a list of ``(column, source_dtype, target_dtype)`` tuples, with
    both dtypes rendered as strings, in source column order. Columns that
    exist in only one of the frames are ignored.
    """
    shared = (name for name in source_df.columns if name in target_df.columns)
    dtype_pairs = (
        (name, source_df[name].dtype, target_df[name].dtype) for name in shared
    )
    return [
        (name, str(src_dtype), str(tgt_dtype))
        for name, src_dtype, tgt_dtype in dtype_pairs
        if src_dtype != tgt_dtype
    ]

# --- Step 3: Reconciliation with Validation ---
def reconcile_schema(source_df, target_df):
    """Add to ``target_df`` every column that ``source_df`` has and it lacks.

    Prints a report of the missing columns and of dtype mismatches on shared
    columns, then adds each missing column filled with missing values.
    Dtype mismatches are only reported, not corrected.

    ``target_df`` is modified in place and is also returned.

    Each added column keeps the source column's dtype when that dtype can
    represent missing values (for example datetime64 -> NaT, float -> NaN,
    object -> NaN). For dtypes that cannot (for example int64 or bool),
    pandas upcasts the new column instead of raising.

    Raises:
        ValueError: If either DataFrame is empty.
    """
    if source_df.empty or target_df.empty:
        raise ValueError("One of the DataFrames is empty.")

    missing_cols = get_missing_columns(source_df, target_df)
    mismatched_dtypes = get_mismatched_dtypes(source_df, target_df)

    print(f"✅ Missing Columns: {missing_cols}")
    print(f"⚠️  Mismatched Data Types: {mismatched_dtypes}")

    for col in missing_cols:
        # Reindexing a zero-row slice of the source column onto the target's
        # index yields an all-missing column. Building a one-element Series
        # of NaN with the source dtype fails for dtypes that cannot hold NaN
        # (e.g. int64), and assigning a bare NaN scalar loses the source
        # dtype for object columns.
        filler = source_df[col].iloc[:0].reindex(target_df.index)
        target_df[col] = filler
        # Report the dtype the new column actually has, which may be an
        # upcast of the source dtype.
        print(f"➕ Added column: {col} (dtype: {target_df[col].dtype})")

    return target_df

# --- Step 4: Run Reconciliation ---
# Reconcile the target against the source, then show the resulting schema.
df_target = reconcile_schema(source_df=df_source, target_df=df_target)
print(
    "\n🧾 Final Reconciled Target DataFrame:\n",
    df_target.dtypes,
    sep=" ",
)

✅ Missing Columns: ['signup_date']
⚠️  Mismatched Data Types: []
➕ Added column: signup_date (dtype: datetime64[ns])

🧾 Final Reconciled Target DataFrame:
 id                      int64
email                  object
signup_date    datetime64[ns]
dtype: object
