# In [None]:
# How to Trace a Record Through a Pipeline
# ----------------------------------------

import pandas as pd
import logging
from datetime import datetime

def missing_stages_for(shipment_id, row_stages, pre_agg_df, aggregated_df):
    """Return the names of the pipeline stages in which a shipment is missing.

    Args:
        shipment_id: Value to look for in each stage's ``shipment_id`` column.
        row_stages: Mapping of stage name -> row-level DataFrame that has a
            ``shipment_id`` column.
        pre_agg_df: Row-level DataFrame that was fed into the aggregation; it
            must have ``shipment_id`` and ``destination`` columns.
        aggregated_df: Aggregated DataFrame with a ``destination`` column.

    Returns:
        A list of stage names (in ``row_stages`` order, followed by
        ``"aggregated"`` if applicable) where the shipment could not be
        found. An empty list means the record survived every stage.
    """
    missing = [
        stage_name
        for stage_name, stage_df in row_stages.items()
        if not stage_df["shipment_id"].eq(shipment_id).any()
    ]

    # The aggregate has no shipment_id column, so the record is traced via
    # its group key. groupby() drops rows whose key is missing by default,
    # which is exactly the kind of silent data loss this check should catch.
    destinations = pre_agg_df.loc[
        pre_agg_df["shipment_id"] == shipment_id, "destination"
    ]
    if destinations.empty or not destinations.isin(aggregated_df["destination"]).all():
        missing.append("aggregated")
    return missing


# 1. Set up logging
logging.basicConfig(
    filename="trace_record.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

print("🚀 Pipeline trace started...")
logging.info("Trace started")

# 2. Create sample data (raw layer)
raw_data = [
    {"shipment_id": "SHIP123", "origin": "Chennai", "destination": "Delhi", "weight": 12.5, "status": "Received", "timestamp": "2025-11-05 10:00"},
    {"shipment_id": "SHIP124", "origin": "Hyderabad", "destination": "Bangalore", "weight": 8.0, "status": "Received", "timestamp": "2025-11-05 10:05"},
]
raw_df = pd.DataFrame(raw_data)
print("\n✅ Step 1: Raw Data")
print(raw_df)
logging.info("Step 1: Loaded %d records", len(raw_df))

# 3. Pick a record to trace
record_id = "SHIP123"
trace_df = raw_df[raw_df["shipment_id"] == record_id]
print(f"\n🔍 Tracing record: {record_id}")
print(trace_df)
logging.info("Tracing record %s from raw data", record_id)

# 4. Step 2: Clean the data (work on a copy so the raw layer stays intact)
cleaned_df = raw_df.copy()
cleaned_df["status"] = cleaned_df["status"].replace("Received", "Validated")
print("\n✅ Step 2: Cleaned Data")
print(cleaned_df[cleaned_df["shipment_id"] == record_id])
logging.info("Record %s cleaned successfully", record_id)

# 5. Step 3: Transformation (add delivery time)
# Key the values by shipment_id instead of relying on row position, so the
# assignment stays correct if rows are reordered, filtered, or added.
delivery_time_hr_by_shipment = {"SHIP123": 5, "SHIP124": 2}
cleaned_df["delivery_time_hr"] = cleaned_df["shipment_id"].map(
    delivery_time_hr_by_shipment
)
print("\n✅ Step 3: Transformed Data")
print(cleaned_df[cleaned_df["shipment_id"] == record_id])
logging.info("Record %s transformed successfully", record_id)

# 6. Step 4: Aggregation (group by destination)
agg_df = cleaned_df.groupby("destination").agg(
    total_shipments=("shipment_id", "count"),
    avg_weight=("weight", "mean"),
).reset_index()
print("\n✅ Step 4: Aggregated Data")
print(agg_df)
logging.info("Aggregation completed")

# 7. Step 5: Verify the trace across every stage, including the aggregate
missing_stages = missing_stages_for(
    record_id,
    {"raw": raw_df, "cleaned/transformed": cleaned_df},
    cleaned_df,
    agg_df,
)
trace_verified = not missing_stages
if trace_verified:
    print(f"\n🔁 Record {record_id} found throughout all pipeline stages.")
    logging.info("Record %s verified successfully across stages.", record_id)
else:
    print(f"❌ Record {record_id} missing after transformation!")
    logging.error("Record %s missing in output.", record_id)
    logging.error(
        "Record %s not found in stage(s): %s",
        record_id,
        ", ".join(missing_stages),
    )

# 8. Completion (only claim success when the trace actually verified)
if trace_verified:
    print("\n🎉 Record trace completed successfully!")
else:
    print("\n❌ Record trace completed with missing stages.")
logging.info("Record trace completed")

# 9. Summary
# - This simulates tracing one record through multiple ETL steps.
# - Helps debug issues like data loss or wrong transformations.
# - Use filters + logging for every stage in production pipelines.
