In [None]:
%run oeai_py

In [None]:
# Create the shared OEAI helper instance; later cells call oeai.get_secret() on it.
# NOTE(review): OEAI is not imported in this notebook — presumably it is defined by
# the preceding `%run oeai_py` cell; confirm.
# NOTE(review): no platform argument ("Synapse" or "Fabric") is passed here, so
# whatever default the class uses applies — confirm it matches your environment.
oeai = OEAI()

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import lit, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType

In [None]:
# --- Key Vault settings: replace both placeholders before running the notebook ---
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"
keyvault_linked_service = "INSERT_YOUR_KEYVAULT_LINKED_SERVICE_NAME_HERE"  # Synapse linked service name

# Both lookups use the same linked service / vault pair, so pass it as one tuple.
secret_store = (keyvault_linked_service, keyvault)

# Storage locations for the OEA environment, read from Key Vault secrets.
# NOTE(review): later cells append table/folder names directly to these values,
# so they presumably end with a path separator — confirm.
silver_path = oeai.get_secret(spark, "bromcom-silver", *secret_store)
gold_path = oeai.get_secret(spark, "gold-path", *secret_store)

In [None]:
# Load the three silver-layer Delta tables used by this notebook. For each table:
# build its path, read it, then give the DataFrame a short alias so the join cell
# can refer to columns as "<alias>.<column>".

# Student dimension
path_dim_student = silver_path + "dim_Student"
df_dim_student = spark.read.format("delta").load(path_dim_student)
df_student = df_dim_student.alias("student")

# Extended student attributes
path_dim_studentextended = silver_path + "dim_StudentExtended"
df_dim_studentextended = spark.read.format("delta").load(path_dim_studentextended)
df_extended = df_dim_studentextended.alias("extended")

# Attendance summary facts
path_attendancesummary = silver_path + "fact_AttendanceSummary"
df_attendancesummary = spark.read.format("delta").load(path_attendancesummary)
df_attendance = df_attendancesummary.alias("attendance")

In [None]:
# Inner-join the three aliased tables on unique_key and keep only the columns the
# model needs. Columns are referenced through their table alias so the repeated
# unique_key column is not ambiguous.
student_to_extended = F.col("student.unique_key") == F.col("extended.unique_key")
student_to_attendance = F.col("student.unique_key") == F.col("attendance.unique_key")

model_source_columns = [
    "student.unique_key",
    "student.Gender",
    "extended.Pupil_Premium_Indicator",
    "extended.SEN_Status",
    "extended.Current_Year",
    "extended.English_As_Additional_Language",
    "attendance.Attendance_Mark_String",
]

combined_df = (
    df_student
    .join(df_extended, on=student_to_extended, how="inner")
    .join(df_attendance, on=student_to_attendance, how="inner")
    .select(*[F.col(column_name) for column_name in model_source_columns])
)

In [None]:
# Cast the two indicator columns to integers so they can be used as numeric features.
# NOTE(review): assumes both source columns are boolean (true -> 1, false -> 0) — confirm.
combined_df = (
    combined_df
    .withColumn("Pupil_Premium_Indicator_Num", F.col("Pupil_Premium_Indicator").cast("integer"))
    .withColumn("English_As_Additional_Language_Num", F.col("English_As_Additional_Language").cast("integer"))
)

# One StringIndexer per categorical column; each writes "<column>_Index".
# English_As_Additional_Language is not indexed here because its integer cast
# above is used instead.
# NOTE(review): the indexers and the assembler use their default handleInvalid
# setting — confirm null categories/flags cannot occur, or set it explicitly.
categorical_columns = ["Gender", "SEN_Status", "Current_Year"]
indexers = [
    StringIndexer(inputCol=column_name, outputCol=f"{column_name}_Index")
    for column_name in categorical_columns
]

# Collect the index columns plus the two integer flags into a "features" vector.
# The classifier trained later reads a separate "assembled_features" column, and
# the names `assembler` and `pipeline` are rebound in the training cell.
indexed_columns = [stage.getOutputCol() for stage in indexers]
numeric_flag_columns = ["Pupil_Premium_Indicator_Num", "English_As_Additional_Language_Num"]
assembler = VectorAssembler(
    inputCols=indexed_columns + numeric_flag_columns,
    outputCol="features",
)

# Run the indexers followed by the assembler as a single pipeline.
pipeline = Pipeline(stages=[*indexers, assembler])
feature_pipeline_model = pipeline.fit(combined_df)
combined_df_transformed = feature_pipeline_model.transform(combined_df)

# combined_df_transformed now carries the *_Index columns and the "features"
# vector (category indices and 0/1 flags; no scaling is applied).


In [None]:
# Return type of the absence-count UDF: a struct holding the count and the
# trimmed mark string it was computed from. Both fields are declared non-nullable.
schema = (
    StructType()
    .add("count", IntegerType(), False)
    .add("filtered_string", StringType(), False)
)

# Plain-Python body of the absence-count UDF; returns (count, filtered_string).
def count_absences_last_40_chars(mark_string):
    """Count the marks in the last 40 positions that are not in the excluded set.

    All '#' characters are removed first, then the final 40 characters of what
    remains are examined. Every character that is not one of
    '/', '\\', 'Z', 'L', 'B', 'D', 'J', 'P', 'V', 'W', 'X', 'Y' adds one to the count.
    NOTE(review): the excluded codes are presumably the marks that do not count
    as an absence — confirm against the attendance code list in use.

    Args:
        mark_string: Attendance mark string, or None when the source value is null.

    Returns:
        tuple: (count, filtered_string) where filtered_string is the (up to)
        40-character window that was examined. A None input yields (0, "").
    """
    if mark_string is None:
        # A null column value reaches a Python UDF as None; calling .replace on it
        # would raise and fail the whole Spark job. Treat it as "no marks".
        return (0, "")

    not_counted = frozenset(['/', '\\', 'Z', 'L', 'B', 'D', 'J', 'P', 'V', 'W', 'X', 'Y'])
    filtered_string = mark_string.replace("#", "")[-40:]
    count = sum(1 for char in filtered_string if char not in not_counted)
    return (count, filtered_string)

# Wrap the Python function as a Spark UDF whose return type is the struct in `schema`.
count_absences_last_40_chars_udf = udf(count_absences_last_40_chars, schema)

# Evaluate the UDF into a temporary struct column, lift its two fields into
# top-level columns, then discard the struct.
absence_struct = F.col("absence_info")
combined_df_transformed = (
    combined_df_transformed
    .withColumn("absence_info", count_absences_last_40_chars_udf(F.col("Attendance_Mark_String")))
    .withColumn("count_last_40_absences", absence_struct["count"])
    .withColumn("last_40_mark_string", absence_struct["filtered_string"])
    .drop("absence_info")
)

# Quick look at the derived columns.
preview_columns = ["unique_key", "count_last_40_absences", "last_40_mark_string"]
combined_df_transformed.select(*preview_columns).show()

In [None]:
# Keep a handle on the feature DataFrame as it stands at this point; the final
# cell passes it to model.transform() to produce the saved predictions. Later
# cells rebind combined_df_transformed with extra columns (e.g. "label"), which
# this reference does not pick up.
df_for_prediction = combined_df_transformed

In [None]:
# Return type of the risk UDF: a struct holding the 0/1 risk flag and the
# trailing marks it was derived from. Both fields are declared non-nullable.
schema = (
    StructType()
    .add("risk", IntegerType(), False)
    .add("future_marks", StringType(), False)
)

# Plain-Python body of the risk UDF; returns (risk, future_marks).
def calculate_risk(mark_string):
    """Derive a 0/1 risk flag from the last 10 attendance marks.

    All '#' characters are removed first, then the final 10 characters of what
    remains are examined. Every character that is not one of
    '/', '\\', 'Z', 'L', 'B', 'D', 'J', 'P', 'V', 'W', 'X', 'Y' counts as a
    non-attendance; two or more of them give risk 1, otherwise risk 0.

    Args:
        mark_string: Attendance mark string, or None when the source value is null.

    Returns:
        tuple: (risk, future_marks) where future_marks is the (up to)
        10-character window that was examined. A None input yields (0, "").
    """
    if mark_string is None:
        # A null column value reaches a Python UDF as None; calling .replace on it
        # would raise and fail the whole Spark job. Treat it as "no marks".
        return (0, "")

    not_counted = frozenset(['/', '\\', 'Z', 'L', 'B', 'D', 'J', 'P', 'V', 'W', 'X', 'Y'])
    future_marks = mark_string.replace("#", "")[-10:]
    non_attendance_count = sum(1 for mark in future_marks if mark not in not_counted)
    risk = 1 if non_attendance_count >= 2 else 0
    return (risk, future_marks)

# Wrap the Python function as a Spark UDF whose return type is the struct in `schema`.
calculate_risk_udf = udf(calculate_risk, schema)

# Evaluate the UDF into a temporary struct column, expose the risk flag as the
# training "label" and the examined marks as "training_future_marks", then
# discard the struct.
risk_struct = F.col("risk_info")
combined_df_transformed = (
    combined_df_transformed
    .withColumn("risk_info", calculate_risk_udf(F.col("Attendance_Mark_String")))
    .withColumn("label", risk_struct["risk"])
    .withColumn("training_future_marks", risk_struct["future_marks"])
    .drop("risk_info")
)

# Preview the first 50 rows, which now include the label columns.
combined_df_transformed.show(50)


In [None]:
# Columns fed to the classifier; the order here is the order inside the feature vector.
feature_columns = [
    "Pupil_Premium_Indicator_Num",
    "Gender_Index",
    "SEN_Status_Index",
    "Current_Year_Index",
    "English_As_Additional_Language_Num",
    "count_last_40_absences",
]

# NOTE(review): "label" is derived from the last 10 marks of Attendance_Mark_String
# while "count_last_40_absences" counts over the last 40 marks of the same string,
# so the feature window contains the label window. Confirm this overlap is
# intended — it likely inflates the AUC reported below.

# Reproducible 80/20 train/test split.
train_data, test_data = combined_df_transformed.randomSplit([0.8, 0.2], seed=42)

# Two-stage pipeline: assemble the feature vector, then fit a random forest on it.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="assembled_features")
rf = RandomForestClassifier(labelCol="label", featuresCol="assembled_features")
pipeline = Pipeline(stages=[assembler, rf])
model = pipeline.fit(train_data)

# Score the held-out rows and report the area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
predictions = model.transform(test_data)
auc = evaluator.evaluate(predictions)
print("Test Area Under ROC: ", auc)

In [None]:
# Optional hyperparameter tuning, currently disabled: everything between the
# triple quotes is a string literal, so none of it executes. Remove the two '''
# lines to enable it.
# When enabled it cross-validates the training pipeline (5 folds) over a
# 3 x 3 grid of numTrees and maxDepth values, fitting on train_data.
'''
# Hyperparameter tuning (optional)
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=5)

cvModel = crossval.fit(train_data)
bestModel = cvModel.bestModel
'''

In [None]:
# Companion to the optional tuning cell, also disabled (the whole body is a string
# literal). It reads `cvModel`, which is only created by the tuning cell, so both
# cells must be enabled together.
# When enabled it prints the selected maxDepth / numTrees and the best model's
# area under ROC on test_data.
'''
# Extract Best Model Parameters
bestPipeline = cvModel.bestModel
bestRFModel = bestPipeline.stages[-1]  # The last stage in the pipeline is the RandomForest model
print("Best Max Depth: ", bestRFModel.getMaxDepth())
print("Best Num Trees: ", bestRFModel.numTrees)  # Access as attribute

# Evaluate Best Model Performance
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
bestModelPerformance = evaluator.evaluate(bestPipeline.transform(test_data))
print("Best Model Test Area Under ROC: ", bestModelPerformance)
'''

In [None]:
# Take the feature names from the fitted VectorAssembler (first stage of the
# trained pipeline) rather than repeating the list by hand. A hand-maintained
# copy can drift from the assembler's inputCols, and zip() below would then
# silently pair names with the wrong scores or drop some of them.
feature_columns = list(model.stages[0].getInputCols())

# Per-feature importance scores from the fitted random forest (last pipeline stage).
feature_importances = model.stages[-1].featureImportances.toArray()

# Defensive check: each assembled column is expected to contribute one vector slot.
if len(feature_columns) != len(feature_importances):
    print("Warning: Mismatch in the number of features")

# Map feature names to their importance scores.
importances = dict(zip(feature_columns, feature_importances))

# Print features from most to least important; sorted_importances is reused by
# the cell that saves the importances.
sorted_importances = sorted(importances.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")

In [None]:


# Number of rows in the labelled dataset, stored next to the importances.
dataset_size = combined_df_transformed.count()

# Schema for the (feature name, importance) rows only; the timestamp and dataset
# size columns are appended afterwards.
schema = (
    StructType()
    .add("Feature", StringType(), True)
    .add("Importance", DoubleType(), True)
)

# Convert each score to a plain Python float so it matches DoubleType.
rows = [(feature_name, float(score)) for feature_name, score in sorted_importances]

# Build the DataFrame and stamp every row with the run time and dataset size.
importances_df = (
    spark.createDataFrame(rows, schema)
    .withColumn("TrainingDateTime", current_timestamp())
    .withColumn("DatasetSize", lit(dataset_size))
)

# Write to the gold layer as Parquet, replacing whatever a previous run left there.
output_path = gold_path + "ml_fact_AttendanceRisk_explanation/"
importances_df.write.mode("overwrite").parquet(output_path)


In [None]:
# Destination for the scored dataset in the gold layer.
predictions_path = gold_path + "ml_fact_AttendanceRisk/"

# Score the feature snapshot captured before the label columns were added,
# using the trained pipeline.
predictions = model.transform(df_for_prediction)

# Persist as Parquet, replacing any output from a previous run.
predictions.write.parquet(predictions_path, mode="overwrite")
