In [1]:
# Printing / Logging Data at Each Step
# ------------------------------------

import pandas as pd
import logging
from datetime import datetime

# 1. Configure logging
# NOTE: logging.basicConfig() only takes effect while the root logger has no
# handlers; if logging was already configured in this session, the existing
# configuration stays in place and this call is a no-op.
logging.basicConfig(
    filename="etl_pipeline.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Named logger for this script; its records propagate to the root handler
# configured above, so the log line format is unchanged.
logger = logging.getLogger(__name__)

print("🚀 Starting pipeline execution...")
logger.info("Pipeline started")

# 2. Step 1: Read sample data
data = [
    {"name": "Aisha", "role": "Manager", "region": "APAC"},
    {"name": "Rahul", "role": "Developer", "region": "EMEA"},
    {"name": "Fatima", "role": "Analyst", "region": "APAC"},
]

df = pd.DataFrame(data)
print("\n✅ Step 1: Data loaded successfully")
print(df)
# Lazy %-style arguments: the message is only formatted if the record is emitted.
logger.info("Step 1 completed: Loaded %s records", len(df))

# 3. Step 2: Transform data
df["role"] = df["role"].str.upper()
# One timestamp string is computed once and broadcast to every row.
df["load_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print("\n✅ Step 2: Data transformed")
print(df)
logger.info("Step 2 completed: Columns transformed %s", list(df.columns))

# 4. Step 3: Analyze metrics
unique_regions = df["region"].nunique()
print("\n✅ Step 3: Metrics analyzed")
print(f"Unique Regions: {unique_regions}")
logger.info("Unique regions: %s", unique_regions)

# 5. Step 4: Simulate output write
# Nothing is written here; the path is only reported to show what would be logged.
output_path = "s3://training-bucket/output/employees.parquet"
print(f"\n✅ Step 4: Data ready for write → {output_path}")
logger.info("Data ready for S3 write: %s", output_path)

# 6. Step 5: Error example
# The sample data has no 'salary' column, so this lookup deliberately raises
# KeyError to demonstrate reporting a handled error via print and the log.
print("\nChecking for missing 'salary' column...")
try:
    df["salary"].mean()
except KeyError as e:
    print(f"⚠️ Warning: Missing column - {e}")
    logger.error("Missing column error: %s", e)

# 7. Completion
print("\n🎉 Pipeline execution finished successfully!")
logger.info("Pipeline finished successfully")

# 8. Summary
# Kept as a bare string expression: it has no effect in a script, but as the
# last expression of a notebook cell it is echoed as the cell's output.
"""
- print() → use for local visibility
- logging → use for production tracking
- Always log key data (rows, columns, output paths)
"""


🚀 Starting pipeline execution...

✅ Step 1: Data loaded successfully
     name       role region
0   Aisha    Manager   APAC
1   Rahul  Developer   EMEA
2  Fatima    Analyst   APAC

✅ Step 2: Data transformed
     name       role region            load_time
0   Aisha    MANAGER   APAC  2025-11-07 15:06:54
1   Rahul  DEVELOPER   EMEA  2025-11-07 15:06:54
2  Fatima    ANALYST   APAC  2025-11-07 15:06:54

Unique Regions: 2

✅ Step 3: Data ready for write → s3://training-bucket/output/employees.parquet

Checking for missing 'salary' column...

🎉 Pipeline execution finished successfully!


'\n- print() → use for local visibility\n- logging → use for production tracking\n- Always log key data (rows, columns, output paths)\n'