In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("Online Course Learning Platform Analytics").getOrCreate()
spark

In [0]:
df_infra = spark.read.csv("dbfs:/FileStore/shared_uploads/azuser3559_mml.local@techademy.com/course_enrollments_.csv", header=True, inferSchema=True)
df_infra.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|     4|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+



In [0]:
df_date = df_infra.withColumn("EnrollDate", to_date("EnrollDate"))  .withColumn("CompletionDate", to_date("CompletionDate"))
df_date.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|     4|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+



In [0]:
df_com = df_date.withColumn("DaysToComplete", datediff("CompletionDate", "EnrollDate"))
df_com.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|          NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|          NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|     4|            11|
+--------+------+-------

In [0]:
df_TC = df_com.groupBy("UserID").agg(count("*").alias("TotalCourses"))
df_TC.show()

+------+------------+
|UserID|TotalCourses|
+------+------------+
|  U004|           1|
|  U002|           1|
|  U003|           1|
|  U001|           2|
+------+------------+



In [0]:
df_AP = df_com.groupBy("UserID").agg(avg("ProgressPercent").alias("AvgProgress"))
df_AP.show()

+------+-----------+
|UserID|AvgProgress|
+------+-----------+
|  U004|      100.0|
|  U002|       45.0|
|  U003|      100.0|
|  U001|       65.0|
+------+-----------+



In [0]:
df_IC = df_com.withColumn("IsCompleted", (col("ProgressPercent") == 100))
df_IC.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|       true|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|          NULL|      false|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|          NULL|      false|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|       true|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-

In [0]:
df_NN = df_IC.withColumn("Rating", when(col("Rating").isNull(), 0.0).otherwise(col("Rating")))
df_NN.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|   5.0|            16|       true|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-

In [0]:
df_ES = df_NN.withColumn("EngagementScore", col("ProgressPercent") * col("Rating"))
df_ES.show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|          400.0|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

In [0]:
df_dropout = df_ES.filter((col("ProgressPercent") < 50) & col("CompletionDate").isNull())
df_dropout.createOrReplaceTempView("Dropouts")
df_dropout.show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+



In [0]:
df1 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/azuser3559_mml.local@techademy.com/course_catalog.csv")
df1.show()

+--------+-------------+-------------+------------+
|CourseID|   Instructor|DurationHours|       Level|
+--------+-------------+-------------+------------+
|    C001|Abdullah Khan|            8|    Beginner|
|    C002|   Sana Gupta|            5|    Beginner|
|    C003| Ibrahim Khan|           10|Intermediate|
|    C004|  Zoya Sheikh|            6|    Beginner|
+--------+-------------+-------------+------------+



In [0]:
df_join = df_ES.join(df1, "CourseID")
df_join.show()
df_API = df_join.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress"))
df_API.show()

+--------+--------+------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+-------------+-------------+------------+
|CourseID|EnrollID|UserID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|   Instructor|DurationHours|       Level|
+--------+--------+------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+-------------+-------------+------------+
|    C001|    E001|  U001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|          400.0|Abdullah Khan|            8|    Beginner|
|    C002|    E002|  U002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|   Sana Gupta|            5|    Beginner|
|    C003|    E003|  U001|  ML with PySpark|D

In [0]:
df_MSE = df_join.groupBy("CourseID").count().orderBy(col("count").desc()).limit(1).join(df1, "CourseID").select("Instructor")
df_MSE.show()


+-------------+
|   Instructor|
+-------------+
|Abdullah Khan|
+-------------+



In [0]:
df_ES.write.format("delta").mode("overwrite").save("/delta/enrollments_delta")
spark.sql("CREATE TABLE IF NOT EXISTS enrollments_delta USING DELTA LOCATION '/delta/enrollments_delta'")

spark.read.format("delta").load("/delta/enrollments_delta").show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|          400.0|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

In [0]:
from delta.tables import DeltaTable

dt = DeltaTable.forPath(spark, "/delta/enrollments_delta")

dt.update(
    condition="CourseName = 'Python Basics'",
    set={"Rating": "5"}
)

display(dt.toDF())

dt.delete("ProgressPercent = 0")

display(dt.toDF())


EnrollID,UserID,CourseID,CourseName,Category,EnrollDate,CompletionDate,ProgressPercent,Rating,DaysToComplete,IsCompleted,EngagementScore
E002,U002,C002,Excel for Finance,Productivity,2024-04-02,,45,0.0,,False,0.0
E003,U001,C003,ML with PySpark,Data Science,2024-04-03,,30,0.0,,False,0.0
E005,U004,C004,Digital Marketing,Marketing,2024-04-05,2024-04-16,100,4.0,11.0,True,400.0
E001,U001,C001,Python Basics,Programming,2024-04-01,2024-04-10,100,5.0,9.0,True,400.0
E004,U003,C001,Python Basics,Programming,2024-04-04,2024-04-20,100,5.0,16.0,True,500.0


EnrollID,UserID,CourseID,CourseName,Category,EnrollDate,CompletionDate,ProgressPercent,Rating,DaysToComplete,IsCompleted,EngagementScore
E001,U001,C001,Python Basics,Programming,2024-04-01,2024-04-10,100,5.0,9.0,True,400.0
E004,U003,C001,Python Basics,Programming,2024-04-04,2024-04-20,100,5.0,16.0,True,500.0
E002,U002,C002,Excel for Finance,Productivity,2024-04-02,,45,0.0,,False,0.0
E003,U001,C003,ML with PySpark,Data Science,2024-04-03,,30,0.0,,False,0.0
E005,U004,C004,Digital Marketing,Marketing,2024-04-05,2024-04-16,100,4.0,11.0,True,400.0


In [0]:
spark.sql("DESCRIBE HISTORY enrollments_delta").show()


+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|     14|2025-06-19 08:55:59|6267232536945943|azuser3559_mml.lo...| OPTIMIZE|{predicate -> [],...|NULL|{4060256036899042}|0612-123310-2108yh11|         12|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|     13|2025-06-19 08:55:58

In [0]:
from pyspark.sql.window import Window

win_rank = Window.orderBy(col("count").desc())
df_ES.groupBy("CourseID").count().withColumn("Rank", dense_rank().over(win_rank)).show()

+--------+-----+----+
|CourseID|count|Rank|
+--------+-----+----+
|    C001|    2|   1|
|    C004|    1|   2|
|    C003|    1|   2|
|    C002|    1|   2|
+--------+-----+----+



In [0]:
win_lead = Window.partitionBy("UserID").orderBy("EnrollDate")
df_WL = df_ES.withColumn("NextCourse", lead("CourseName").over(win_lead))
df_WL.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|     NextCourse|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|          400.0|ML with PySpark|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|           NULL|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|           NULL|
|   

In [0]:
df_ES.createOrReplaceTempView("enrollments")
spark.sql("""
    CREATE OR REPLACE TEMP VIEW daily_enrollments AS
    SELECT EnrollDate, COUNT(*) AS EnrollCount
    FROM enrollments
    GROUP BY EnrollDate
""")

spark.sql("SELECT * FROM daily_enrollments").show()

+----------+-----------+
|EnrollDate|EnrollCount|
+----------+-----------+
|2024-04-02|          1|
|2024-04-01|          1|
|2024-04-04|          1|
|2024-04-05|          1|
|2024-04-03|          1|
+----------+-----------+



In [0]:
spark.sql("""
    CREATE OR REPLACE TEMP VIEW category_performance AS
    SELECT Category, AVG(Rating) AS AvgRating
    FROM enrollments
    GROUP BY Category
""")

spark.sql("SELECT * FROM category_performance").show()

+------------+---------+
|    Category|AvgRating|
+------------+---------+
| Programming|      4.5|
|Productivity|      0.0|
|   Marketing|      4.0|
|Data Science|      0.0|
+------------+---------+



In [0]:
spark.sql("""
    CREATE OR REPLACE TEMP VIEW top_3_courses AS
    SELECT CourseName, COUNT(*) AS EnrollCount
    FROM enrollments
    GROUP BY CourseName
    ORDER BY EnrollCount DESC
    LIMIT 3
""")

spark.sql("SELECT * FROM top_3_courses").show()

+-----------------+-----------+
|       CourseName|EnrollCount|
+-----------------+-----------+
|    Python Basics|          2|
|Digital Marketing|          1|
|Excel for Finance|          1|
+-----------------+-----------+



In [0]:
spark.read.format("delta").option("versionAsOf", 0).load("/delta/enrollments_delta").show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|          400.0|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

In [0]:
spark.read.format("delta").option("timestampAsOf", "2025-06-19 08:51:54").load("/delta/enrollments_delta").show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|          400.0|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

In [0]:
df_ES.write.mode("overwrite").partitionBy("Category").json("/export/enrollments_json")

In [0]:
df_summ = df_ES.groupBy("CourseName").agg(count("*").alias("TotalEnrollments"),avg("Rating").alias("AvgRating"),avg("ProgressPercent").alias("AvgProgress"))
df_summ.show()

+-----------------+----------------+---------+-----------+
|       CourseName|TotalEnrollments|AvgRating|AvgProgress|
+-----------------+----------------+---------+-----------+
|Digital Marketing|               1|      4.0|      100.0|
|    Python Basics|               2|      4.5|      100.0|
|Excel for Finance|               1|      0.0|       45.0|
|  ML with PySpark|               1|      0.0|       30.0|
+-----------------+----------------+---------+-----------+



In [0]:
df_summ.write.mode("overwrite").parquet("/export/summary_courses")