In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ETL cell: join the two class tables on Name, derive BMI and a BMI category,
# then publish three Delta tables (summary statistics, BMI category
# frequencies, and a per-row report card). `spark` is not defined in this cell
# and is expected to be supplied by the runtime. Any failure is logged with its
# traceback and re-raised so the job is marked as failed.
try:
    # Load data from Unity Catalog tables
    logger.info("Loading data from Unity Catalog tables...")
    class1_df = spark.table("catalog.source_db.class1")
    class2_df = spark.table("catalog.source_db.class2")

    # Sort datasets by Name
    # NOTE(review): the join below matches rows on the Name key and does not
    # rely on input ordering; this sort may be carry-over from a sort-then-merge
    # workflow. Confirm and consider removing it.
    logger.info("Sorting datasets by Name...")
    class1_df = class1_df.orderBy("Name")
    class2_df = class2_df.orderBy("Name")

    # Merge datasets on Name (inner join: names present in only one table are dropped)
    logger.info("Merging datasets on Name...")
    merged_df = class1_df.join(class2_df, on="Name", how="inner")

    # Calculate BMI
    # NOTE(review): the 703 factor is the pounds/inches form of the BMI
    # formula, so this presumably expects Weight in lb and Height in inches.
    # TODO confirm against the source tables.
    logger.info("Calculating BMI...")
    merged_df = merged_df.withColumn("BMI", (F.col("Weight") / (F.col("Height") ** 2)) * 703)

    # Define BMI categories
    def categorize_bmi(bmi):
        """Map a BMI value to its weight-status category.

        The bands are contiguous half-open intervals, so every value falls in
        exactly one of them:

            bmi < 18.5        -> 'Underweight'
            18.5 <= bmi < 25  -> 'Normal'
            25 <= bmi < 30    -> 'Overweight'
            bmi >= 30         -> 'Obese'

        Returns None when ``bmi`` is None. Spark hands SQL NULL to a Python UDF
        as None, so a missing BMI yields a null category rather than a
        TypeError that would abort the whole job.
        """
        if bmi is None:
            return None
        if bmi < 18.5:
            return 'Underweight'
        if bmi < 25:
            return 'Normal'
        if bmi < 30:
            return 'Overweight'
        return 'Obese'

    categorize_bmi_udf = F.udf(categorize_bmi, StringType())

    # Apply BMI categorization
    logger.info("Applying BMI categorization...")
    merged_df = merged_df.withColumn("BMI_Category", categorize_bmi_udf(F.col("BMI")))

    # Generate summary statistics
    # One output row with Mean_/Std_/Min_/Max_ columns for each measure, in the
    # order Age, Height, Weight, BMI.
    logger.info("Generating summary statistics...")
    summary_stats_df = merged_df.select(*[
        aggregate(measure).alias(f"{prefix}_{measure}")
        for measure in ("Age", "Height", "Weight", "BMI")
        for prefix, aggregate in (
            ("Mean", F.mean),
            ("Std", F.stddev),
            ("Min", F.min),
            ("Max", F.max),
        )
    ])

    # Generate frequency distribution of BMI categories
    logger.info("Generating frequency distribution of BMI categories...")
    bmi_freq_df = merged_df.groupBy("BMI_Category").count()

    # Generate class report card
    logger.info("Generating class report card...")
    report_card_df = merged_df.select(
        "Name", "Age", "Sex", "Height", "Weight", "Grade", "GPA", "BMI", "BMI_Category"
    )

    # Write outputs to Unity Catalog tables
    # NOTE(review): dropping a table right before an overwrite leaves a window
    # in which the table does not exist, and a failed write leaves it missing.
    # mode("overwrite") alone may be sufficient; confirm whether the drop is
    # needed (e.g. for schema changes) before removing it.
    logger.info("Writing outputs to Unity Catalog tables...")
    for output_df, table_name in (
        (summary_stats_df, "catalog.target_db.summary_statistics"),
        (bmi_freq_df, "catalog.target_db.bmi_frequency"),
        (report_card_df, "catalog.target_db.class_report_card"),
    ):
        spark.sql(f"DROP TABLE IF EXISTS {table_name}")
        output_df.write.format("delta").mode("overwrite").saveAsTable(table_name)

    logger.info("ETL process completed successfully.")

except Exception as e:
    # logger.exception records the traceback along with the message; re-raise
    # so the caller/scheduler still sees the failure.
    logger.exception("An error occurred during the ETL process: %s", e)
    raise
