Ingestion & Time Fields

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Location of the raw course-enrollment CSV.
enrollments_path = "file:/Workspace/Shared/course_enrollments.csv"

# Load the CSV: the first row supplies column names and Spark infers the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(enrollments_path)
)

# Convert both date columns with to_date, then derive DaysToComplete as the
# integer day difference between completion and enrollment.
df = (
    df.withColumn("EnrollDate", to_date("EnrollDate"))
    .withColumn("CompletionDate", to_date("CompletionDate"))
    .withColumn(
        "DaysToComplete",
        datediff("CompletionDate", "EnrollDate").cast("int"),
    )
)


User Learning Path Progress

In [0]:

# Per-user summary: number of enrollment rows and mean ProgressPercent.
user_courses = (
    df.groupBy("UserID")
    .agg(
        count("*").alias("CoursesEnrolled"),
        avg("ProgressPercent").alias("AvgProgress"),
    )
)

# Flag rows whose progress is exactly 100.
df = df.withColumn(
    "IsCompleted",
    col("ProgressPercent") == 100,
)


Engagement Scoring

In [0]:
# Replace NULL ratings with 0, then score each enrollment as
# ProgressPercent * Rating.
# NOTE: after this step unrated rows carry Rating = 0, so any average taken
# over Rating counts them as zeros rather than ignoring them.
df = (
    df.withColumn("Rating", coalesce(col("Rating"), lit(0)))
    .withColumn("EngagementScore", col("ProgressPercent") * col("Rating"))
)


Identify Drop-offs

In [0]:
# A drop-off is an enrollment with no completion date and under 50% progress.
dropouts = df.where(
    (col("ProgressPercent") < 50)
    & col("CompletionDate").isNull()
)

# Make the drop-off rows queryable from SQL under the name "Dropouts".
dropouts.createOrReplaceTempView("Dropouts")


Joins with Metadata

In [0]:

# Location of the course catalog CSV (one row of metadata per course).
catalog_path = "file:/Workspace/Shared/course_catalog.csv"

# Explicit schema: the catalog is read with fixed column types instead of
# relying on schema inference.
catalog_schema = StructType([
    StructField("CourseID", StringType(), True),
    StructField("Instructor", StringType(), True),
    StructField("DurationHours", IntegerType(), True),
    StructField("Level", StringType(), True)
])

catalog_df = spark.read.option("header", True).schema(catalog_schema).csv(catalog_path)

# Left join keeps every enrollment row; rows whose CourseID has no catalog
# match get NULL catalog columns and are grouped under a NULL Instructor.
joined_df = df.join(catalog_df, on="CourseID", how="left")
progress_by_instructor = joined_df.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress"))

# Course with the most enrollment rows. CourseID is a secondary sort key so
# that when several courses tie on the top count, the same course is chosen
# on every run instead of an arbitrary one.
most_enrolled = (
    df.groupBy("CourseID")
    .count()
    .orderBy(col("count").desc(), col("CourseID").asc())
    .limit(1)
)

# Attach the catalog metadata to the winning course.
most_enrolled_course = most_enrolled.join(catalog_df, "CourseID", "left")


Delta Lake Practice

In [0]:

# Target location of the Delta table.
delta_path = "file:/Workspace/Shared/enrollments_delta"

# Write the current enrollments DataFrame in Delta format, using overwrite mode.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

from delta.tables import DeltaTable

# Handle on the table just written, used for the in-place update and delete below.
delta_table = DeltaTable.forPath(spark, delta_path)

# Set Rating to 5 on every 'Python Basics' row. The value is passed as the
# string "5", exactly as the Delta update API receives it here.
delta_table.update(
    condition="CourseName = 'Python Basics'",
    set={"Rating": "5"},
)

# Remove enrollments that show zero progress.
delta_table.delete("ProgressPercent = 0")

# Display the table's commit history.
delta_table.history().show()


+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      3|2025-06-19 04:58:...|7868838587549447|azuser3557_mml.lo...| OPTIMIZE|{predicate -> [],...|NULL|{1222209826929636}|0619-042535-5t46f450|          1|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      2|2025-06-19 04:5

Window Functions

In [0]:
from pyspark.sql.window import Window
# Rank courses by number of enrollment rows. dense_rank is evaluated over a
# window ordered by count descending; the window has no partitionBy, so it
# spans all of course_counts.
course_counts = df.groupBy("CourseID").count()
window_spec = Window.orderBy(col("count").desc())
course_counts = course_counts.withColumn("Rank", dense_rank().over(window_spec))

# For each enrollment, look up the CourseID of the same user's next enrollment.
# EnrollID is a secondary sort key so that enrollments sharing an EnrollDate
# are ordered the same way on every run; ordering by EnrollDate alone leaves
# lead() free to pick either row in a tie.
# NOTE(review): assumes EnrollID is unique per enrollment row; confirm.
user_window = Window.partitionBy("UserID").orderBy("EnrollDate", "EnrollID")
df = df.withColumn("NextCourseID", lead("CourseID").over(user_window))


SQL Logic for Dashboard Views

In [0]:
# Expose the enrollments DataFrame to SQL under the name "enrollments".
df.createOrReplaceTempView("enrollments")

# View: number of enrollment rows per EnrollDate, ordered by date.
daily_enrollments_sql = """
CREATE OR REPLACE TEMP VIEW daily_enrollments AS
SELECT EnrollDate, COUNT(*) AS TotalEnrollments
FROM enrollments
GROUP BY EnrollDate
ORDER BY EnrollDate
"""
spark.sql(daily_enrollments_sql)

# View: average Rating per Category.
category_performance_sql = """
CREATE OR REPLACE TEMP VIEW category_performance AS
SELECT Category, AVG(Rating) AS AvgRating
FROM enrollments
GROUP BY Category
"""
spark.sql(category_performance_sql)

# View: the three course names with the most enrollment rows.
top_3_courses_sql = """
CREATE OR REPLACE TEMP VIEW top_3_courses AS
SELECT CourseName, COUNT(*) AS Enrollments
FROM enrollments
GROUP BY CourseName
ORDER BY Enrollments DESC
LIMIT 3
"""
spark.sql(top_3_courses_sql)


DataFrame[]

Time Travel & Export Reporting

In [0]:

# Time travel: read the Delta table as of version 0 and display it.
df_version_0 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(delta_path)
)
df_version_0.show()

# Export the current enrollments as JSON, partitioned on disk by Category.
(
    df.write
    .mode("overwrite")
    .partitionBy("Category")
    .json("file:/Workspace/Shared/output_json")
)

# Per-course report: enrollment count, mean rating and mean progress.
summary_df = (
    df.groupBy("CourseName")
    .agg(
        count("*").alias("TotalEnrollments"),
        avg("Rating").alias("AvgRating"),
        avg("ProgressPercent").alias("AvgProgress"),
    )
)

# Persist the report as Parquet, replacing any previous export.
(
    summary_df.write
    .mode("overwrite")
    .parquet("file:/Workspace/Shared/course_summary")
)


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|       true|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|      false|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|      false|              0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 