# In [None]:
"""
Topic: AWS Glue Jobs (Cataloging & Light Transformations)
=========================================================
Simulates an AWS Glue ETL job performing light transformations
and writing processed data back to S3.
"""

import sys
import boto3
import json
import logging
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# -------------------------------------------------------------------
# Step 1 – Initialize Spark Session (for Glue)
# -------------------------------------------------------------------
print("🚀 Starting Glue ETL Job...")

# Reuse an active session if one exists, otherwise create one for this job.
spark = SparkSession.builder.appName("Glue_Transformations_Demo").getOrCreate()

# Named logger used for all progress messages in this job.
logger = logging.getLogger("glue_demo")
logger.setLevel(logging.INFO)
# Without any handler, Python falls back to its last-resort handler, which
# only emits WARNING and above, so every logger.info() call in this job would
# be silently dropped. Attach a stream handler only when neither this logger
# nor an ancestor already has one, so a pre-configured environment (or
# re-running this cell) does not produce duplicate lines.
if not logger.hasHandlers():
    logger.addHandler(logging.StreamHandler())

# -------------------------------------------------------------------
# Step 2 – Read Data from Raw Zone
# -------------------------------------------------------------------
# S3 prefix holding the raw shipment records (read as JSON below).
raw_path = "s3://supplychain-data-demo/raw/shipments/"
logger.info(f"📥 Reading data from: {raw_path}")

# No explicit schema is supplied, so Spark infers it from the JSON data.
df_raw = spark.read.json(raw_path)

logger.info("✅ Sample schema:")
# printSchema() writes straight to stdout rather than through the logger.
df_raw.printSchema()

# -------------------------------------------------------------------
# Step 3 – Light Transformations
# -------------------------------------------------------------------
logger.info("⚙️ Performing data cleaning and transformations...")

# Normalize the column names BEFORE filtering: the filters below reference
# the snake_case names, which do not exist yet when the raw data uses
# "ShipmentID" / "Weight" / "Status". withColumnRenamed is a no-op when the
# source column is absent, so input that already uses the target names
# still passes through unchanged.
df_cleaned = (
    df_raw
    .withColumnRenamed("ShipmentID", "shipment_id")
    .withColumnRenamed("Weight", "weight")
    .withColumnRenamed("Status", "status")
    # Drop records that have no shipment identifier.
    .filter(col("shipment_id").isNotNull())
    # Drop cancelled shipments.
    # NOTE(review): under SQL null semantics this comparison also removes
    # rows whose status is null — confirm that is intended.
    .filter(col("status") != "Cancelled")
)

# count() is a Spark action: it runs the whole read/transform pipeline once
# here, in addition to the run triggered by the write in the next step.
logger.info(f"✅ Cleaned record count: {df_cleaned.count()}")

# -------------------------------------------------------------------
# Step 4 – Write to Trusted Zone
# -------------------------------------------------------------------
# S3 prefix that receives the cleaned data as Parquet.
trusted_path = "s3://supplychain-data-demo/trusted/shipments/"
logger.info(f"📤 Writing cleaned data to: {trusted_path}")
(
    df_cleaned
    # Collapse to a single partition so the output is written as one file.
    .repartition(1)
    # "overwrite" replaces whatever already exists at trusted_path.
    .write.mode("overwrite")
    .parquet(trusted_path)
)

logger.info("✅ Data successfully written to Trusted Zone.")

# -------------------------------------------------------------------
# Step 5 – Update Glue Data Catalog (Simulation)
# -------------------------------------------------------------------
glue = boto3.client("glue")
table_name = "shipments_trusted"
database_name = "supplychain_catalog"

try:
    logger.info("🗂️ Registering table in Glue Data Catalog...")
    glue.create_table(
        DatabaseName=database_name,
        TableInput={
            "Name": table_name,
            # The data lives in S3 and is not managed by the catalog.
            "TableType": "EXTERNAL_TABLE",
            "Parameters": {"classification": "parquet"},
            "StorageDescriptor": {
                # NOTE(review): hard-coded schema; assumes these types match
                # what Spark inferred for the cleaned DataFrame — confirm.
                "Columns": [
                    {"Name": "shipment_id", "Type": "string"},
                    {"Name": "weight", "Type": "double"},
                    {"Name": "status", "Type": "string"},
                ],
                "Location": trusted_path,
                # The trusted zone is written as Parquet, so the input format
                # and SerDe must be the Parquet ones; a text input format
                # would make the table unreadable for catalog consumers.
                "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
                "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
                "SerdeInfo": {
                    "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
                    "Parameters": {"serialization.format": "1"},
                },
            },
        },
    )
    logger.info(f"✅ Table '{table_name}' registered in Glue Catalog.")
except glue.exceptions.AlreadyExistsException:
    # Expected on re-runs: the table was registered by an earlier execution.
    logger.info(f"ℹ️ Table '{table_name}' already exists in Glue Catalog; leaving it unchanged.")
except Exception as e:
    # Registration is best-effort in this demo: report the failure and carry
    # on instead of failing the job after the data has been written.
    logger.warning(f"⚠️ Skipping catalog registration: {e}")

# -------------------------------------------------------------------
# Step 6 – Stop Spark Session
# -------------------------------------------------------------------
# Shut down the Spark session now that all reads and writes are done.
spark.stop()
logger.info("🏁 Glue job completed successfully!")
