In [1]:
"""
Example 1: Handling Added Columns in CSV Files
----------------------------------------------
Demonstrates how to align a newer DataFrame to an existing schema:
columns missing from the new data are added (filled with None) and
columns that exist only in the new data are dropped.
"""

import pandas as pd
import logging

# Configure the root logger for this example: INFO level, "<timestamp> - <message>" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

def align_schemas(old_df, new_df):
    """Return a copy of ``new_df`` reshaped to the column layout of ``old_df``.

    Columns present in ``old_df`` but absent from ``new_df`` are added and
    filled with ``None``; columns that exist only in ``new_df`` are dropped;
    the resulting columns follow the order of ``old_df.columns``.

    Args:
        old_df: DataFrame whose columns define the target schema.
        new_df: DataFrame to align. It is not modified.

    Returns:
        A new DataFrame holding the rows of ``new_df`` with exactly the
        columns of ``old_df``, in the same order.
    """
    # Work on a copy: assigning columns directly on ``new_df`` would leak the
    # placeholder columns into the caller's DataFrame.
    aligned = new_df.copy()
    for col in old_df.columns:
        if col not in aligned.columns:
            aligned[col] = None  # placeholder for a column the new data lacks
    # Selecting by the old column index reorders the columns and drops any
    # column the old schema does not know about.
    return aligned[old_df.columns]

if __name__ == "__main__":
    # Baseline dataset: its columns define the schema the newer data is aligned to.
    old_data = pd.DataFrame(
        dict(order_id=[1, 2], region=["APAC", "EMEA"], amount=[250, 400])
    )

    # Newer dataset: the same three columns plus an extra "delivery_eta" column.
    new_data = pd.DataFrame(
        dict(
            order_id=[3, 4],
            region=["US", "LATAM"],
            amount=[320, 210],
            delivery_eta=["2 days", "3 days"],
        )
    )

    # Align the newer data to the baseline schema, then report and show the result.
    aligned_df = align_schemas(old_data, new_data)
    logging.info("✅ Schema aligned successfully!")
    print(aligned_df)


2025-11-12 16:14:24,049 - ✅ Schema aligned successfully!


   order_id region  amount
0         3     US     320
1         4  LATAM     210


In [2]:
"""
Example 2: Schema Evolution with Column Renames
-----------------------------------------------
Uses a mapping dictionary to handle renamed columns across versions.
"""

import pandas as pd
import json
import logging

# Same logging setup as the first example so this cell can run on its own;
# basicConfig does nothing if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

def apply_schema_mapping(df, mapping_file):
    """Rename the columns of ``df`` using a JSON mapping file.

    Args:
        df: DataFrame whose columns should be renamed. It is not modified.
        mapping_file: Path to a JSON file containing a single object that
            maps old column names to new column names.

    Returns:
        A new DataFrame with the mapped columns renamed; columns that are
        not keys of the mapping keep their names.

    Raises:
        TypeError: If the JSON document is not an object (dict).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which would misread non-ASCII column names on some systems.
    with open(mapping_file, "r", encoding="utf-8") as f:
        mapping = json.load(f)
    # Fail with a clear message instead of an obscure error from ``rename``.
    if not isinstance(mapping, dict):
        raise TypeError(
            f"Schema mapping in {mapping_file!r} must be a JSON object, "
            f"got {type(mapping).__name__}"
        )
    return df.rename(columns=mapping)

if __name__ == "__main__":
    # Write the rename mapping to disk to stand in for a real mapping file
    # (normally stored in S3 or repo).
    mapping = dict(ship_id="shipment_id", region_code="region")
    with open("schema_mapping.json", "w") as f:
        f.write(json.dumps(mapping, indent=4))

    # Sample data that still uses the old column names.
    df = pd.DataFrame(
        dict(
            ship_id=[101, 102],
            region_code=["APAC", "EMEA"],
            weight=[20.5, 18.3],
        )
    )

    # Rename via the mapping file, then report and show the result.
    updated_df = apply_schema_mapping(df, "schema_mapping.json")
    logging.info("✅ Schema mapping applied successfully!")
    print(updated_df)


2025-11-12 16:14:24,120 - ✅ Schema mapping applied successfully!


   shipment_id region  weight
0          101   APAC    20.5
1          102   EMEA    18.3
