In [None]:
%run oeai_py

In [None]:
# Create an instance of the OEAI class; the later cells use it to fetch secrets.
# NOTE(review): no platform ("Synapse" or "Fabric") is passed here, so whatever
# OEAI() does by default applies — confirm that is the intended platform.
oeai = OEAI()

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
# Both values are placeholders and must be replaced before the notebook is run.
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"
keyvault_linked_service = "INSERT_YOUR_KEYVAULT_LINKED_SERVICE_NAME_HERE"

# Synapse OEA environment paths.
# Neither path is hard-coded: each is looked up through oeai.get_secret, under the
# secret names "wonde-silver" and "gold-path", using the key vault and linked
# service named in this cell. The report functions receive them as base paths.
silver_path = oeai.get_secret(spark, "wonde-silver", keyvault_linked_service, keyvault)
gold_path = oeai.get_secret(spark, "gold-path", keyvault_linked_service, keyvault)

In [None]:
def create_attendance_report(spark, silver_path, gold_path, year_groups=(7, 8, 9, 10, 11)):
    """
    Generates an aggregated attendance report from Delta Lake tables.

    Reads the 'dim_Organisation', 'dim_Student' and 'fact_AttendanceSummary' Delta
    tables from the silver layer, keeps only students whose 'Current_Year' is in
    ``year_groups``, joins students to their organisation and attendance rows, and
    aggregates attendance once per school and once across all schools. The
    all-schools row is labelled organisationkey "Trust" / Organisation_Name
    "All Schools". The combined result overwrites 'agg_attendance.parquet' in the
    gold layer.

    Each output row carries organisationkey, Organisation_Name,
    avg_percentage_attendance, rate_persistent_absence and rate_severe_absence.
    The two rates are the sum of the respective flag divided by the count of
    joined student keys.

    Args:
        spark (SparkSession): Active Spark session.
        silver_path (str): Base path of the silver layer where the source Delta
            tables are stored. A trailing "/" is optional.
        gold_path (str): Base path of the gold layer where the report is saved.
            A trailing "/" is optional.
        year_groups (Iterable[int], optional): 'Current_Year' values to include.
            Defaults to years 7-11, the cohort this report has always used.

    Returns:
        None. The report is written to storage as a side effect.
    """
    # NOTE(review): `col`, `avg`, `sum`, `count` and `lit` must be the Spark SQL
    # functions (presumably brought into scope by `%run oeai_py`); Python's builtin
    # `sum` would fail on the column names used below — confirm.
    import posixpath  # function-scope import keeps this definition self-contained

    def silver_table(table_name):
        # posixpath.join inserts the "/" separator only when silver_path lacks one,
        # so the base path works with or without a trailing slash.
        return spark.read.format("delta").load(posixpath.join(silver_path, table_name))

    # Read delta tables; the aliases let the join keys be referenced unambiguously
    organisations = silver_table("dim_Organisation").alias("org")
    students = silver_table("dim_Student").alias("student")
    attendance = silver_table("fact_AttendanceSummary").alias("attendance")

    # Filter to the requested year groups, then join students to school and attendance
    students = students.filter(col("Current_Year").isin(list(year_groups)))
    joined_df = (
        students
        .join(organisations, col("student.organisationkey") == col("org.organisationkey"))
        .join(attendance, col("student.studentkey") == col("attendance.studentkey"))
    )

    # The same three metrics are computed at school level and at trust level
    metrics = [
        avg("attendance.Percentage_Attendance").alias("avg_percentage_attendance"),
        (sum("attendance.Is_Persistently_Absent") / count("student.studentkey")).alias("rate_persistent_absence"),
        (sum("attendance.Is_Severely_Absent") / count("student.studentkey")).alias("rate_severe_absence"),
    ]

    school_agg_df = joined_df.groupBy("org.organisationkey", "org.Organisation_Name").agg(*metrics)

    # Trust level: no grouping, with literal key/name columns so it lines up with the school rows
    trust_agg_df = (
        joined_df.agg(*metrics)
        .withColumn("organisationkey", lit("Trust"))
        .withColumn("Organisation_Name", lit("All Schools"))
    )

    # Union (matching columns by name) and write to Parquet
    final_df = trust_agg_df.unionByName(school_agg_df)
    final_df.write.mode("overwrite").format("parquet").save(
        posixpath.join(gold_path, "agg_attendance.parquet")
    )

In [None]:
def create_attendance_report_by_year(spark, silver_path, gold_path, year_groups=(7, 8, 9, 10, 11, 12, 13)):
    """
    Generates an aggregated attendance report by year from Delta Lake tables.

    Reads the 'dim_Organisation', 'dim_Student' and 'fact_AttendanceSummary' Delta
    tables from the silver layer, casts the students' 'Current_Year' to integer,
    keeps only the year groups in ``year_groups``, joins students to their
    organisation and attendance rows, and averages 'Percentage_Attendance' per
    school and year as well as per year across all schools. The all-schools rows
    are labelled organisationkey "Trust" / Organisation_Name "All Schools". The
    combined result overwrites 'agg_attendance_by_year.parquet' in the gold layer.

    Each output row carries organisationkey, Organisation_Name, Current_Year and
    avg_percentage_attendance.

    Args:
        spark (SparkSession): Active Spark session.
        silver_path (str): Base path of the silver layer where the source Delta
            tables are stored. A trailing "/" is optional.
        gold_path (str): Base path of the gold layer where the report is saved.
            A trailing "/" is optional.
        year_groups (Iterable[int], optional): 'Current_Year' values to include.
            Defaults to years 7-13, the cohort this report has always used.

    Returns:
        None. The report is written to storage as a side effect.
    """
    # NOTE(review): `col`, `avg` and `lit` must be the Spark SQL functions
    # (presumably brought into scope by `%run oeai_py`) — confirm.
    import posixpath  # function-scope import keeps this definition self-contained

    def silver_table(table_name):
        # posixpath.join inserts the "/" separator only when silver_path lacks one,
        # so the base path works with or without a trailing slash.
        return spark.read.format("delta").load(posixpath.join(silver_path, table_name))

    # Read delta tables; the aliases let the join keys be referenced unambiguously
    organisations = silver_table("dim_Organisation").alias("org")
    students = (
        silver_table("dim_Student")
        .alias("student")
        # Cast first so the year filter and the grouping both operate on integers
        .withColumn("Current_Year", col("Current_Year").cast("integer"))
        .filter(col("Current_Year").isin(list(year_groups)))
    )
    attendance = silver_table("fact_AttendanceSummary").alias("attendance")

    joined_df = (
        students
        .join(organisations, col("student.organisationkey") == col("org.organisationkey"))
        .join(attendance, col("student.studentkey") == col("attendance.studentkey"))
    )

    # The same average is computed per school/year and per year across the trust
    avg_attendance = avg("attendance.Percentage_Attendance").alias("avg_percentage_attendance")

    school_year_agg_df = (
        joined_df
        .groupBy("org.organisationkey", "org.Organisation_Name", "Current_Year")
        .agg(avg_attendance)
    )

    # Trust level: grouped by year only, with literal key/name columns so it lines up with the school rows
    trust_year_agg_df = (
        joined_df
        .groupBy("Current_Year")
        .agg(avg_attendance)
        .withColumn("organisationkey", lit("Trust"))
        .withColumn("Organisation_Name", lit("All Schools"))
    )

    # Union (matching columns by name) and write to Parquet
    final_year_df = trust_year_agg_df.unionByName(school_year_agg_df)
    final_year_df.write.mode("overwrite").format("parquet").save(
        posixpath.join(gold_path, "agg_attendance_by_year.parquet")
    )

In [None]:
def create_attendance_report_by_gender(spark, silver_path, gold_path, year_groups=(7, 8, 9, 10, 11)):
    """
    Generates an aggregated attendance report by gender from Delta Lake tables.

    Reads the 'dim_Organisation', 'dim_Student' and 'fact_AttendanceSummary' Delta
    tables from the silver layer, keeps only students whose 'Current_Year' is in
    ``year_groups``, joins students to their organisation and attendance rows, and
    pivots on the students' 'Gender' so that each gender value gets its own set of
    metric columns. The metrics are computed once per school and once across all
    schools. The all-schools row is labelled organisationkey "Trust" /
    Organisation_Name "All Schools". The combined result overwrites
    'agg_attendance_by_gender.parquet' in the gold layer.

    The metrics per gender are avg_percentage_attendance, rate_persistent_absence
    and rate_severe_absence. The two rates are the sum of the respective flag
    divided by the count of joined student keys.

    Args:
        spark (SparkSession): Active Spark session.
        silver_path (str): Base path of the silver layer where the source Delta
            tables are stored. A trailing "/" is optional.
        gold_path (str): Base path of the gold layer where the report is saved.
            A trailing "/" is optional.
        year_groups (Iterable[int], optional): 'Current_Year' values to include.
            Defaults to years 7-11, the cohort this report has always used.

    Returns:
        None. The report is written to storage as a side effect.
    """
    # NOTE(review): `col`, `avg`, `sum`, `count` and `lit` must be the Spark SQL
    # functions (presumably brought into scope by `%run oeai_py`); Python's builtin
    # `sum` would fail on the column names used below — confirm.
    import posixpath  # function-scope import keeps this definition self-contained

    def silver_table(table_name):
        # posixpath.join inserts the "/" separator only when silver_path lacks one,
        # so the base path works with or without a trailing slash.
        return spark.read.format("delta").load(posixpath.join(silver_path, table_name))

    # Read delta tables; the aliases let the join keys be referenced unambiguously
    organisations = silver_table("dim_Organisation").alias("org")
    students = (
        silver_table("dim_Student")
        .alias("student")
        .filter(col("Current_Year").isin(list(year_groups)))
    )
    attendance = silver_table("fact_AttendanceSummary").alias("attendance")

    # Join and keep only the columns the aggregations need
    joined_df = (
        students
        .join(organisations, col("student.organisationkey") == col("org.organisationkey"))
        .join(attendance, col("student.studentkey") == col("attendance.studentkey"))
        .select("student.Gender", "student.studentkey", "org.organisationkey",
                "org.Organisation_Name", "attendance.Percentage_Attendance",
                "attendance.Is_Persistently_Absent", "attendance.Is_Severely_Absent")
    )

    # The same three metrics are computed at school level and at trust level
    metrics = [
        avg("attendance.Percentage_Attendance").alias("avg_percentage_attendance"),
        (sum("attendance.Is_Persistently_Absent") / count("student.studentkey")).alias("rate_persistent_absence"),
        (sum("attendance.Is_Severely_Absent") / count("student.studentkey")).alias("rate_severe_absence"),
    ]

    school_agg_df = (
        joined_df
        .groupBy("org.organisationkey", "org.Organisation_Name")
        .pivot("student.Gender")
        .agg(*metrics)
    )

    # Trust level: empty groupBy() aggregates over every row; literal key/name
    # columns are added so the result lines up with the school rows
    trust_agg_df = (
        joined_df
        .groupBy()
        .pivot("student.Gender")
        .agg(*metrics)
        .withColumn("organisationkey", lit("Trust"))
        .withColumn("Organisation_Name", lit("All Schools"))
    )

    # Union (matching columns by name) and write to Parquet
    final_df = trust_agg_df.unionByName(school_agg_df)
    final_df.write.mode("overwrite").format("parquet").save(
        posixpath.join(gold_path, "agg_attendance_by_gender.parquet")
    )

In [None]:
# Build every aggregation table in turn; all builders share the same Spark session,
# silver source path and gold destination path.
for build_report in (
    create_attendance_report,
    create_attendance_report_by_year,
    create_attendance_report_by_gender,
):
    build_report(spark, silver_path, gold_path)