# In [None]:
import logging
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    # Load the two source tables from Unity Catalog.
    class1_df = spark.table("postgresql_catalog.sas.class1")
    class2_df = spark.table("postgresql_catalog.sas.class2")

    # Inner-join on Name, keeping only rows whose Name appears in both tables.
    # The inputs are deliberately not sorted first: a Spark join does not need
    # ordered inputs and does not guarantee any output order, so an orderBy
    # ahead of the join would only add work.
    merged_df = class1_df.join(class2_df, on="Name", how="inner")

    # BMI = 703 * Weight / Height^2.
    # NOTE(review): 703 is the factor used with pounds and inches -- confirm
    # that the source tables store Weight and Height in those units.
    bmi_df = merged_df.withColumn("BMI", (F.col("Weight") / (F.col("Height") ** 2)) * 703)

    # Define BMI categories
    def bmi_category(bmi):
        """Classify a BMI value into a weight-status category.

        The ranges are contiguous, so every number maps to exactly one label:
        below 18.5 is "Underweight", 18.5 up to (not including) 25 is
        "Normal", 25 up to (not including) 30 is "Overweight", and 30 or
        more is "Obese".

        Args:
            bmi: The BMI as a number, or None when no BMI is available.

        Returns:
            The category label, or None when ``bmi`` is None or NaN.
        """
        # Ordering comparisons against None raise TypeError, and NaN fails
        # every comparison (it would fall through to the last label), so
        # treat both as "no category". `bmi != bmi` is true only for NaN.
        if bmi is None or bmi != bmi:
            return None
        if bmi < 18.5:
            return "Underweight"
        if bmi < 25:
            return "Normal"
        if bmi < 30:
            return "Overweight"
        return "Obese"

    # Wrap the Python classifier as a string-returning Spark UDF, then apply
    # it to the BMI column to produce the BMI_Category column.
    bmi_category_udf = F.udf(bmi_category, returnType=StringType())
    classified_df = bmi_df.withColumn(
        colName="BMI_Category",
        col=bmi_category_udf(F.col("BMI")),
    )

    # Summary statistics: a single row with the mean, standard deviation,
    # minimum and maximum of each measure, in columns named <Stat>_<Measure>
    # (e.g. Mean_Age, Std_Age, Min_Age, Max_Age, Mean_Height, ...).
    stat_functions = (("Mean", F.mean), ("Std", F.stddev), ("Min", F.min), ("Max", F.max))
    summary_columns = [
        stat(measure).alias(f"{label}_{measure}")
        for measure in ("Age", "Height", "Weight", "BMI")
        for label, stat in stat_functions
    ]
    summary_stats_df = classified_df.select(*summary_columns)

    # Frequency distribution: number of rows in each BMI category.
    freq_dist_df = (
        classified_df
        .groupBy("BMI_Category")
        .count()
    )

    # Class report card: the per-row detail columns, in report order.
    report_columns = [
        "Name", "Age", "Sex", "Height", "Weight",
        "Grade", "GPA", "BMI", "BMI_Category",
    ]
    report_card_df = classified_df.select(*report_columns)

    # Persist each result as a Delta table in Unity Catalog, overwriting any
    # existing table contents. The tables are written in the order listed.
    output_tables = (
        (summary_stats_df, "postgresql_catalog.sas.summary_statistics"),
        (freq_dist_df, "postgresql_catalog.sas.bmi_frequency_distribution"),
        (report_card_df, "postgresql_catalog.sas.class_report_card"),
    )
    for output_df, table_name in output_tables:
        output_df.write.format("delta").mode("overwrite").saveAsTable(table_name)

    logger.info("Data processing and migration completed successfully.")

except Exception as e:
    logger.error(f"An error occurred during the data processing: {e}")
