In [None]:
%run oeai_py

In [None]:
# Create the OEAI helper instance used below to read secrets from Key Vault.
# The OEAI class is expected to come from the `%run oeai_py` cell above.
# NOTE(review): no platform ("Synapse" or "Fabric") is passed here - confirm whether
# OEAI() detects it automatically or needs an explicit argument.
oeai = OEAI()

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"
keyvault_linked_service = "INSERT_YOUR_KEYVAULT_LINKED_SERVICE_NAME_HERE"

# Storage locations are read from Key Vault secrets instead of being hard-coded.
silver_path = oeai.get_secret(spark, "bromcom-silver", keyvault_linked_service, keyvault)
gold_path = oeai.get_secret(spark, "gold-path", keyvault_linked_service, keyvault)


def _read_delta(table_path):
    """Load the Delta table stored at ``table_path`` into a DataFrame."""
    return spark.read.format("delta").load(table_path)


# Locations of the silver tables (table name appended directly to the silver path)
path_dim_student = silver_path + "dim_Student"
path_dim_studentextended = silver_path + "dim_StudentExtended"
path_attendancesummary = silver_path + "fact_AttendanceSummary"

# Raw DataFrames, one per silver table
df_dim_student = _read_delta(path_dim_student)
df_dim_studentextended = _read_delta(path_dim_studentextended)
df_attendancesummary = _read_delta(path_attendancesummary)

# Aliased views so joined columns can be referenced as "<alias>.<column>"
df_student = df_dim_student.alias("student")
df_extended = df_dim_studentextended.alias("extended")
df_attendance = df_attendancesummary.alias("attendance")


In [None]:
'''
This cell builds the DataFrame used for prediction, then trains and evaluates the attendance-risk model.
The predicted risk for each student is saved as Parquet files in the Gold zone.
'''
# Join the three silver tables on the student key and keep only the columns the model
# needs. Every column is qualified by its DataFrame alias to avoid ambiguous names.
combined_df = (
    df_student
    .join(df_extended, col("student.unique_key") == col("extended.unique_key"), "inner")
    .join(df_attendance, col("student.unique_key") == col("attendance.unique_key"), "inner")
    .select(
        col("student.unique_key"),
        col("student.Gender"),
        col("extended.Pupil_Premium_Indicator"),
        col("extended.SEN_Status"),
        col("extended.Current_Year"),
        col("extended.English_As_Additional_Language"),
        col("attendance.Attendance_Mark_String"),
    )
)

# Convert 'Pupil_Premium_Indicator' to an integer feature column.
# NOTE(review): this gives 1/0 only if the source column is boolean - confirm its type.
combined_df = combined_df.withColumn("Pupil_Premium_Indicator_Num", col("Pupil_Premium_Indicator").cast("integer"))

# StringIndexer for categorical columns
indexers = [
    StringIndexer(inputCol="Gender", outputCol="Gender_Index"),
    StringIndexer(inputCol="SEN_Status", outputCol="SEN_Status_Index"),
    StringIndexer(inputCol="Current_Year", outputCol="Current_Year_Index"),  # Optional: treat as categorical
    StringIndexer(inputCol="English_As_Additional_Language", outputCol="EAL_Index"),
]

# Fit the indexers and apply them so the *_Index columns exist.
# The result, `combined_df_transformed`, is the DataFrame the rest of the notebook works on.
combined_df_transformed = Pipeline(stages=indexers).fit(combined_df).transform(combined_df)

# Function (wrapped as a UDF below) that counts absences in the last 30 marks, ignoring '#'
def count_absences_last_30_chars(mark_string):
    """Count the non-attendance marks among the last 30 marks of an attendance string.

    '#' characters are removed first, then the final 30 remaining characters are
    inspected. Any character that is not one of the recognised attendance marks
    below is counted as an absence.

    Args:
        mark_string: Attendance mark string, or None when the source value is null.

    Returns:
        int: Number of non-attendance marks in the window; 0 for None or an empty string.
    """
    # A null column value reaches a Python UDF as None; treat it like an empty string
    # instead of failing the whole Spark job on `.replace`.
    if mark_string is None:
        return 0

    attendance_marks = {'/', '\\', 'Z', 'L', 'B', 'D', 'J', 'P', 'V', 'W', 'X', 'Y'}
    recent_marks = mark_string.replace("#", "")[-30:]  # Strip '#' before taking the window
    return sum(1 for mark in recent_marks if mark not in attendance_marks)

# Wrap the Python function as a Spark UDF that returns an integer column
count_absences_last_30_chars_udf = udf(count_absences_last_30_chars, IntegerType())

# Build the absence-count column from the attendance string and attach it as a feature
absence_count_col = count_absences_last_30_chars_udf(F.col("Attendance_Mark_String"))
combined_df_transformed = combined_df_transformed.withColumn("count_last_30_absences", absence_count_col)

# Quick look at the new column
combined_df_transformed.select("unique_key", "count_last_30_absences").show()

# Keep a reference to the frame at this point (features present, no label yet);
# it is the input for the final risk predictions further down.
df_for_prediction = combined_df_transformed

# Define the UDF to calculate risk based on the last 20 marks of the attendance string
def calculate_risk(mark_string):
    # Get the last 20 characters (10 days) of the string
    future_marks = mark_string.replace("#", "")[-20:]  # Remove '#' and consider the last 20 characters
    # Count non-attendance marks
    non_attendance_count = 0
    for mark in future_marks:
        if mark not in ['/', '\\', 'Z', 'L', 'B', 'D', 'J', 'P', 'V', 'W', 'X', 'Y']:
            non_attendance_count += 1
    # Determine if the student is at risk
    return 1 if non_attendance_count >= 2 else 0

# Wrap calculate_risk as a Spark UDF that returns an integer column
calculate_risk_udf = udf(calculate_risk, IntegerType())

# Derive the binary training label from the attendance string
risk_label_col = calculate_risk_udf(col("Attendance_Mark_String"))
combined_df_transformed = combined_df_transformed.withColumn("label", risk_label_col)

# --- Model definition: predict the risk of absence for each student ---

# Columns fed to the VectorAssembler, in this order
feature_columns = [
    "Gender_Index",
    "SEN_Status_Index",
    "Current_Year_Index",
    "EAL_Index",
    "Pupil_Premium_Indicator_Num",
    "count_last_30_absences",
]

# Remove any 'features' column already present so the assembler can create its own
combined_df_transformed = combined_df_transformed.drop('features')

# The assembler packs the feature columns into one vector; a random forest is the
# binary classifier trained on that vector.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# Chaining both stages in a pipeline lets the same steps run on training and prediction data
pipeline = Pipeline(stages=[assembler, rf])

# Hold back 20% of the rows so the model can be scored on data it has not seen
train_data, test_data = combined_df_transformed.randomSplit([0.8, 0.2], seed=42)

# Fit the assembler + random-forest pipeline on the training split
model = pipeline.fit(train_data)

# Score the held-out split and measure the area under the ROC curve
predictions = model.transform(test_data)
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)

# The ROC curve plots the true positive rate (TPR) against the false positive rate (FPR)
# across the possible decision thresholds; the metric is the area under that curve.
print("Test Area Under ROC: ", auc)

# Hyper-parameter search: cross-validate the pipeline over a small random-forest grid.
# `cvModel` must be created here; the lines below read its best model.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# Illustrative grid values - tune them for your data volume and cluster size.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5, 10])
    .addGrid(rf.numTrees, [20, 50])
    .build()
)

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,  # fixed seed so the fold assignment is reproducible
)
cvModel = crossval.fit(train_data)

# Extract Best Model Parameters
bestPipeline = cvModel.bestModel
bestRFModel = bestPipeline.stages[-1]  # The last stage in the pipeline is the RandomForest model
print("Best Max Depth: ", bestRFModel.getMaxDepth())
# `getNumTrees` is a property on the fitted model giving the number of trees;
# `numTrees` would be the Param descriptor rather than its value.
print("Best Num Trees: ", bestRFModel.getNumTrees)

# Evaluate Best Model Performance on the held-out split
bestModelPerformance = evaluator.evaluate(bestPipeline.transform(test_data))
print("Best Model Test Area Under ROC: ", bestModelPerformance)

# Take the feature names from the fitted pipeline's VectorAssembler (its first stage).
# The importance vector is indexed in the assembler's input order, so reading the
# names from the assembler keeps each score paired with the right feature.
feature_columns = model.stages[0].getInputCols()

# Extract feature importances from the random-forest model (the last stage)
feature_importances = model.stages[-1].featureImportances.toArray()

# Map feature names to their importance scores
importances = dict(zip(feature_columns, feature_importances))

# Sort and print feature importances, most important first
sorted_importances = sorted(importances.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")


# Predict the risk of absence for each student and save the results as Parquet in the Gold zone.
# Make predictions
predictions = model.transform(df_for_prediction)  # df_for_prediction is the dataframe we created earlier

# Select the relevant columns (student key and prediction)
output_data = predictions.select(col("unique_key"), col("prediction").alias("predicted_label"))

# Write to Parquet, replacing the output of any previous run. Without an explicit
# mode the write fails once the target path exists, so the notebook could not be re-run.
output_data.write.mode("overwrite").parquet(gold_path + "ml_fact_AttendanceRisk/")