# In [None]:  (Jupyter cell marker left over from the notebook export)
"""
Topic: Data Pipeline Concepts (End-to-End Flow)
===============================================
Simulates a simple ETL pipeline using Python & boto3.
Shows how AWS services like S3, Glue, and Athena can
integrate to automate the full data flow.
"""

import boto3
import json
import logging
import time
from datetime import datetime

# -------------------------------------------------------------------
# Step 1 – Initialize Clients
# -------------------------------------------------------------------
# Configure logging once for the whole script; each record is emitted as
# "<timestamp> - <message>" at INFO level and above.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("data_pipeline_demo")

# One boto3 client per AWS service used below, created in the order
# S3, Glue, Athena.
s3, glue, athena = (
    boto3.client(service_name) for service_name in ("s3", "glue", "athena")
)

# -------------------------------------------------------------------
# Step 2 – Ingestion Simulation (Raw Data Upload)
# -------------------------------------------------------------------
def ingest_data(bucket, key, data):
    """Simulate raw data ingestion by writing ``data`` to S3 as JSON.

    Args:
        bucket: Name of the destination S3 bucket.
        key: Object key to write inside ``bucket``.
        data: Payload to upload; it is serialized with ``json.dumps``.

    Returns:
        None. The object is written as a side effect.

    Raises:
        Any exception raised by ``json.dumps`` or by the S3 client's
        ``put_object`` call propagates to the caller unchanged.
    """
    logger.info("📥 Ingesting data into raw zone...")
    s3.put_object(Bucket=bucket, Key=key, Body=json.dumps(data))
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.info("✅ Data uploaded to s3://%s/%s", bucket, key)

# -------------------------------------------------------------------
# Step 3 – Trigger Glue Job (Transformation)
# -------------------------------------------------------------------
def run_glue_job(job_name):
    """Start an AWS Glue job run and return its run ID.

    This only submits the run via ``start_job_run``; it does not wait for
    the job to finish. Callers that need the transformed data must poll the
    run's state themselves before reading the job's output.

    Args:
        job_name: Name of the Glue job to start.

    Returns:
        The ``JobRunId`` reported by Glue for the newly started run.

    Raises:
        Any exception raised by the Glue client's ``start_job_run`` call
        propagates to the caller unchanged.
    """
    logger.info("⚙️ Starting Glue Job: %s", job_name)
    job_run_id = glue.start_job_run(JobName=job_name)["JobRunId"]
    logger.info("🆔 Glue JobRun ID: %s", job_run_id)
    return job_run_id

# -------------------------------------------------------------------
# Step 4 – Run Athena Query (Analytics Layer)
# -------------------------------------------------------------------
def run_athena_query(database, query, output_s3, poll_interval=2, timeout=None):
    """Run an Athena query to completion and return its execution ID.

    The query is submitted, then its status is polled until Athena reports
    a terminal state. The result rows are NOT fetched here; they are written
    by Athena to ``output_s3`` and can be read using the returned ID.

    Args:
        database: Athena database used as the query execution context.
        query: SQL text to execute.
        output_s3: S3 location (``s3://...``) where Athena writes results.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum number of seconds to wait for a terminal state, or
            ``None`` (the default) to wait indefinitely.

    Returns:
        The ``QueryExecutionId`` of the successfully completed query.

    Raises:
        RuntimeError: If the query ends in ``FAILED`` or ``CANCELLED``.
        TimeoutError: If ``timeout`` elapses before a terminal state.
    """
    logger.info("🔍 Running Athena query...")
    exec_id = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={"Database": database},
        ResultConfiguration={"OutputLocation": output_s3},
    )["QueryExecutionId"]

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout

    # Wait for completion
    while True:
        response = athena.get_query_execution(QueryExecutionId=exec_id)
        status = response["QueryExecution"]["Status"]
        state = status["State"]
        if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Athena query {exec_id} still {state} after {timeout} seconds"
            )
        time.sleep(poll_interval)

    if state != "SUCCEEDED":
        # Surface the failure instead of reporting it with a success marker;
        # otherwise callers would carry on as if the results existed.
        reason = status.get("StateChangeReason", "no reason reported")
        logger.error("❌ Query Status: %s (%s)", state, reason)
        raise RuntimeError(f"Athena query {exec_id} ended in {state}: {reason}")

    logger.info("✅ Query Status: %s", state)
    return exec_id

# -------------------------------------------------------------------
# Step 5 – Orchestration Simulation
# -------------------------------------------------------------------
def _wait_for_glue_job(job_name, job_run_id, poll_interval=10):
    """Poll a Glue job run until it reaches a terminal state; return that state."""
    terminal_states = {
        "SUCCEEDED", "FAILED", "STOPPED", "TIMEOUT", "ERROR", "EXPIRED",
    }
    while True:
        job_run = glue.get_job_run(JobName=job_name, RunId=job_run_id)["JobRun"]
        state = job_run["JobRunState"]
        if state in terminal_states:
            return state
        time.sleep(poll_interval)


def main():
    """Run the demo pipeline end-to-end: ingest, transform, then query.

    Uploads one sample shipment record to S3, runs the Glue transformation
    job and waits for it to finish, then runs an aggregate Athena query.

    Raises:
        RuntimeError: If the Glue job run or the Athena query does not end
            in the ``SUCCEEDED`` state.
    """
    bucket = "supplychain-data-demo"
    raw_key = "raw/shipments.json"
    glue_job_name = "glue_transform_shipments"
    output_s3 = "s3://aws-athena-query-results-demo/"

    # Step 1: Ingest Data
    sample_data = {
        "shipment_id": "SHP001",
        "status": "Delivered",
        "region": "APAC",
        "weight": 12.4,
        "timestamp": datetime.now().isoformat(),
    }
    ingest_data(bucket, raw_key, sample_data)

    # Step 2: Transform Data (Glue Job)
    # Starting a job run is asynchronous, so wait for it to finish; querying
    # immediately would read data that has not been transformed yet.
    job_run_id = run_glue_job(glue_job_name)
    glue_state = _wait_for_glue_job(glue_job_name, job_run_id)
    if glue_state != "SUCCEEDED":
        raise RuntimeError(
            f"Glue job {glue_job_name} run {job_run_id} ended in {glue_state}"
        )

    # Step 3: Query Data (Athena)
    query = """
        SELECT region, COUNT(*) AS delivered_shipments
        FROM trusted_shipments
        WHERE status = 'Delivered'
        GROUP BY region;
    """
    exec_id = run_athena_query("supplychain_catalog", query, output_s3)

    # Confirm the query really succeeded before reporting overall success.
    query_state = athena.get_query_execution(QueryExecutionId=exec_id)[
        "QueryExecution"
    ]["Status"]["State"]
    if query_state != "SUCCEEDED":
        raise RuntimeError(f"Athena query {exec_id} ended in {query_state}")

    logger.info("🏁 Data Pipeline executed successfully end-to-end.")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
