**Gold Transformation**

Assemble our clean Silver layer tables into the final Star Schema. The Gold layer tables are what Power BI will consume for analysis.

In [0]:
from pyspark.sql.functions import col

# Create the Gold schema inside the catalog that the fact tables in this notebook
# are written to (databricks_student.gold.*). An unqualified "gold" would be
# created in whatever catalog the session currently uses, which is not guaranteed
# to be databricks_student. IF NOT EXISTS keeps the cell safe to re-run.
spark.sql("CREATE SCHEMA IF NOT EXISTS databricks_student.gold")

DataFrame[]

**Create gold.Fact_Performance**


In [0]:
# --- Silver inputs (fully qualified with the databricks_student catalog) ---
df_fact_source = spark.table("databricks_student.silver.student_assessment_cleaned")
df_dim_student = spark.table("databricks_student.silver.Dim_Student")
df_dim_assessment = spark.table("databricks_student.silver.Dim_Assessment")
df_dim_time = spark.table("databricks_student.silver.Dim_Time")

# --- Join the assessment facts to their dimensions ---
# The two inner joins drop fact rows whose id_student / id_assessment has no match
# in the corresponding dimension. The time join is a left join, so rows whose
# date_submitted has no matching day_key are kept with a null day_key.
# NOTE(review): an inner join multiplies fact rows if a dimension key is not
# unique — confirm the Silver Dim_* tables hold one row per key.
submitted_on_day = df_fact_source["date_submitted"] == df_dim_time["day_key"]
df_performance_joined = (
    df_fact_source
    .join(df_dim_student, on="id_student", how="inner")
    .join(df_dim_assessment, on="id_assessment", how="inner")
    .join(df_dim_time, on=submitted_on_day, how="left")
)

# --- Keep only the foreign keys and the measures ---
df_gold_fact_performance = df_performance_joined.select(
    "id_student",
    "id_assessment",
    "day_key",
    "score",
    "is_banked",
)

# --- Persist as a Delta table, replacing both the data and the schema ---
(
    df_gold_fact_performance.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("databricks_student.gold.Fact_Performance")
)

print("Table 'gold.Fact_Performance' created successfully.")
display(df_gold_fact_performance.limit(10))

Table 'gold.Fact_Performance' created successfully.


id_student,id_assessment,day_key,score,is_banked
11391,1752,18,78,0
28400,1752,22,70,0
31604,1752,17,72,0
32885,1752,26,69,0
38053,1752,19,79,0
45462,1752,20,70,0
45642,1752,18,72,0
52130,1752,19,72,0
53025,1752,9,71,0
57506,1752,18,68,0


**Create gold.Fact_Engagement**


In [0]:
# --- Silver inputs ---
df_fact_source_eng = spark.table("databricks_student.silver.student_vle_cleaned")
df_dim_student = spark.table("databricks_student.silver.Dim_Student")
df_dim_module = spark.table("databricks_student.silver.Dim_Module")
df_dim_vle = spark.table("databricks_student.silver.Dim_VLE_Activity")
df_dim_time = spark.table("databricks_student.silver.Dim_Time")

# --- Join the VLE interaction facts to their dimensions ---
# Student, module (via the single 'module_key') and VLE activity are inner joins,
# so fact rows without a matching dimension row are dropped. The time join is a
# left join on the fact's 'date' column, so rows without a matching day_key are
# kept with a null day_key.
# NOTE(review): an inner join multiplies fact rows if a dimension key is not
# unique — confirm the Silver Dim_* tables hold one row per key.
clicked_on_day = df_fact_source_eng["date"] == df_dim_time["day_key"]
df_engagement_joined = (
    df_fact_source_eng
    .join(df_dim_student, on="id_student", how="inner")
    .join(df_dim_module, on="module_key", how="inner")
    .join(df_dim_vle, on="id_site", how="inner")
    .join(df_dim_time, on=clicked_on_day, how="left")
)

# --- Keep only the foreign keys and the click-count measure ---
df_gold_fact_engagement = df_engagement_joined.select(
    "id_student",
    "module_key",
    "id_site",
    "day_key",
    "sum_click",
)

# --- Persist as a Delta table, replacing both the data and the schema ---
(
    df_gold_fact_engagement.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("databricks_student.gold.Fact_Engagement")
)

print("Table 'gold.Fact_Engagement' (Fixed) created successfully.")
display(df_gold_fact_engagement.limit(10))

Table 'gold.Fact_Engagement' (Fixed) created successfully.


id_student,module_key,id_site,day_key,sum_click
28400,AAA_2013J,546652,-10,4
28400,AAA_2013J,546652,-10,1
28400,AAA_2013J,546652,-10,1
28400,AAA_2013J,546614,-10,11
28400,AAA_2013J,546714,-10,1
28400,AAA_2013J,546652,-10,8
28400,AAA_2013J,546876,-10,2
28400,AAA_2013J,546688,-10,15
28400,AAA_2013J,546662,-10,17
28400,AAA_2013J,546890,-10,1


**Create gold.Fact_Inscription**


In [0]:
# --- Silver inputs ---
# Dim_Time is loaded twice under different aliases because it is joined twice:
# once for the registration date and once for the unregistration date.
df_fact_source_reg = spark.table("databricks_student.silver.student_registration_cleaned")
df_dim_student = spark.table("databricks_student.silver.Dim_Student")
df_dim_module = spark.table("databricks_student.silver.Dim_Module")
df_dim_time_reg = spark.table("databricks_student.silver.Dim_Time").alias("time_reg")
df_dim_time_unreg = spark.table("databricks_student.silver.Dim_Time").alias("time_unreg")

# --- Join the registration facts to their dimensions ---
# Student and module (via the single 'module_key') are inner joins. Both time
# joins are left joins, so a registration row is kept even when one of its dates
# has no matching day_key; the corresponding key column is then null.
registered_on_day = df_fact_source_reg["date_registration"] == df_dim_time_reg["day_key"]
unregistered_on_day = df_fact_source_reg["date_unregistration"] == df_dim_time_unreg["day_key"]
df_inscription_joined = (
    df_fact_source_reg
    .join(df_dim_student, on="id_student", how="inner")
    .join(df_dim_module, on="module_key", how="inner")
    .join(df_dim_time_reg, on=registered_on_day, how="left")
    .join(df_dim_time_unreg, on=unregistered_on_day, how="left")
)

# --- Keep the foreign keys plus the registration attributes ---
# The two day_key columns are disambiguated through the Dim_Time aliases and
# renamed so each one says which date it refers to.
registration_day_key = col("time_reg.day_key").alias("date_registration_key")
unregistration_day_key = col("time_unreg.day_key").alias("date_unregistration_key")
df_gold_fact_inscription = df_inscription_joined.select(
    "id_student",
    "module_key",
    registration_day_key,
    unregistration_day_key,
    "final_result",
    "date_unregistration",
)

# --- Persist as a Delta table, replacing both the data and the schema ---
(
    df_gold_fact_inscription.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("databricks_student.gold.Fact_Inscription")
)

print("Table 'gold.Fact_Inscription' (Fixed) created successfully.")
display(df_gold_fact_inscription.limit(10))

Table 'gold.Fact_Inscription' (Fixed) created successfully.


id_student,module_key,date_registration_key,date_unregistration_key,final_result,date_unregistration
11391,AAA_2013J,,,PASS,
28400,AAA_2013J,-53.0,,PASS,
30268,AAA_2013J,-92.0,12.0,WITHDRAWN,12.0
31604,AAA_2013J,-52.0,,PASS,
32885,AAA_2013J,,,PASS,
38053,AAA_2013J,,,PASS,
45462,AAA_2013J,-67.0,,PASS,
45642,AAA_2013J,-29.0,,PASS,
52130,AAA_2013J,-33.0,,PASS,
53025,AAA_2013J,,,PASS,
