In [0]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("CourseAnalytics")
    .getOrCreate()
)

# Load the raw enrollment records: the first CSV row supplies the column
# names, and column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:/Workspace/Shared/course_enrollments.csv")
)


In [0]:
from pyspark.sql.functions import col, datediff, to_date, when
from pyspark.sql.types import IntegerType

# Parse both date columns into DateType using the yyyy-MM-dd pattern.
for date_column in ("EnrollDate", "CompletionDate"):
    df = df.withColumn(date_column, to_date(date_column, "yyyy-MM-dd"))

# DaysToComplete = days between enrollment and completion.
# Rows without a CompletionDate get NULL: `when` with no `otherwise`
# branch yields NULL for every row that fails the condition.
has_completion_date = col("CompletionDate").isNotNull()
df = df.withColumn(
    "DaysToComplete",
    when(has_completion_date, datediff(col("CompletionDate"), col("EnrollDate"))),
)
df.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|          1|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|          0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|          0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|          1|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-

In [0]:
from pyspark.sql.functions import avg, count, expr, when, col

# IsCompleted flag: 1 when ProgressPercent is exactly 100, otherwise 0.
completed_flag = when(col("ProgressPercent") == 100, 1).otherwise(0)
df = df.withColumn("IsCompleted", completed_flag)

# Per-user rollup:
#   CoursesEnrolled  - number of enrollment rows for the user
#   AvgProgress      - mean ProgressPercent across those enrollments
#   CompletedCourses - how many of them carry IsCompleted = 1
per_user_metrics = [
    count("*").alias("CoursesEnrolled"),
    avg("ProgressPercent").alias("AvgProgress"),
    expr("sum(IsCompleted)").alias("CompletedCourses"),
]
user_progress = df.groupBy("UserID").agg(*per_user_metrics)
user_progress.show()

+------+---------------+-----------+----------------+
|UserID|CoursesEnrolled|AvgProgress|CompletedCourses|
+------+---------------+-----------+----------------+
|  U004|              1|      100.0|               1|
|  U002|              1|       45.0|               0|
|  U003|              1|      100.0|               1|
|  U001|              2|       65.0|               1|
+------+---------------+-----------+----------------+



In [0]:
# 3. Engagement Scoring

# Treat a missing Rating as 0 so the multiplication below never sees NULL.
rating_or_zero = when(col("Rating").isNotNull(), col("Rating")).otherwise(0)
df = df.withColumn("Rating", rating_or_zero)

# EngagementScore = ProgressPercent * Rating, cast to an integer.
# The scored frame is only displayed here; `df` itself is not given the
# extra column.
engagement_score = (col("ProgressPercent") * col("Rating")).cast(IntegerType())
df.withColumn("EngagementScore", engagement_score).display()


EnrollID,UserID,CourseID,CourseName,Category,EnrollDate,CompletionDate,ProgressPercent,Rating,DaysToComplete,IsCompleted,EngagementScore
E001,U001,C001,Python Basics,Programming,2024-04-01,2024-04-10,100,4,9.0,1,400
E002,U002,C002,Excel for Finance,Productivity,2024-04-02,,45,0,,0,0
E003,U001,C003,ML with PySpark,Data Science,2024-04-03,,30,0,,0,0
E004,U003,C001,Python Basics,Programming,2024-04-04,2024-04-20,100,5,16.0,1,500
E005,U004,C004,Digital Marketing,Marketing,2024-04-05,2024-04-16,100,4,11.0,1,400


In [0]:
# Identify drop-offs: enrollments that are under 50% progress AND have no
# completion date, then expose them to SQL as the temp view "Dropouts".
is_low_progress = col("ProgressPercent") < 50
is_unfinished = col("CompletionDate").isNull()

dropouts = df.where(is_low_progress & is_unfinished)
dropouts.createOrReplaceTempView("Dropouts")

# Read the view back through SQL to confirm it is registered.
spark.sql("SELECT * FROM Dropouts").show()



+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|          0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|          0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+



In [0]:
# Joins with Metadata
# Create course_catalog.csv :
# CourseID,Instructor,DurationHours,Level
# C001,Abdullah Khan,8,Beginner
# C002,Sana Gupta,5,Beginner
# C003,Ibrahim Khan,10,Intermediate
# C004,Zoya Sheikh,6,Beginner
from pyspark.sql.functions import avg, count

# Left join keeps every enrollment even if its course is missing from the
# catalog (Instructor is NULL for such rows).
catalog_df = spark.read.csv("file:/Workspace/Shared/course_catalog.csv", header=True, inferSchema=True)
joined_df = df.join(catalog_df, on="CourseID", how="left")

# Average progress per instructor
avg_progress = joined_df.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress"))
avg_progress.show()

# Show who teaches the most enrolled course
most_enrolled = df.groupBy("CourseID").agg(count("*").alias("Enrollments"))

# Break ties on CourseID so the "top" course is the same on every run;
# ordering by Enrollments alone lets limit(1) return any of the tied courses.
top_course = (
    most_enrolled
    .orderBy(col("Enrollments").desc(), col("CourseID").asc())
    .limit(1)
)

instructor_df = top_course.join(catalog_df, on="CourseID", how="left")
instructor_df.select("CourseID", "Instructor", "Enrollments").show()

+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
|  Zoya Sheikh|      100.0|
|   Sana Gupta|       45.0|
| Ibrahim Khan|       30.0|
|Abdullah Khan|      100.0|
+-------------+-----------+

+--------+-------------+-----------+
|CourseID|   Instructor|Enrollments|
+--------+-------------+-----------+
|    C001|Abdullah Khan|          2|
+--------+-------------+-----------+



In [0]:
# Delta Lake Practice
from delta.tables import DeltaTable

# Save the current enrollments as the Delta table enrollments_delta,
# overwriting whatever data is already stored at that path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/data/enrollments_delta")
)

# Open the saved table for in-place DML. Despite its name, `delta_df` is a
# DeltaTable handle; call .toDF() to get a DataFrame.
delta_df = DeltaTable.forPath(spark, "/mnt/data/enrollments_delta")

# Update: set Rating to 5 on every 'Python Basics' row.
is_python_basics = col("CourseName") == "Python Basics"
delta_df.update(is_python_basics, {"Rating": expr("5")})

# Delete: drop every row whose ProgressPercent is 0.
delta_df.delete(col("ProgressPercent") == 0)

# Show the commit log (DESCRIBE HISTORY), then the table's current contents.
spark.sql("DESCRIBE HISTORY delta.`/mnt/data/enrollments_delta`").show(truncate=False)
delta_df.toDF().display()

+-------+-----------------------+----------------+----------------------------------+---------+------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp              |userId          |userName                          |operation|operationParameters                                                           |job |notebook          |clusterId           |readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                      

EnrollID,UserID,CourseID,CourseName,Category,EnrollDate,CompletionDate,ProgressPercent,Rating,DaysToComplete,IsCompleted
E002,U002,C002,Excel for Finance,Productivity,2024-04-02,,45,0,,0
E003,U001,C003,ML with PySpark,Data Science,2024-04-03,,30,0,,0
E005,U004,C004,Digital Marketing,Marketing,2024-04-05,2024-04-16,100,4,11.0,1
E001,U001,C001,Python Basics,Programming,2024-04-01,2024-04-10,100,5,9.0,1
E004,U003,C001,Python Basics,Programming,2024-04-04,2024-04-20,100,5,16.0,1


In [0]:

# 7. Window Functions
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank, count, lead

# Rank courses by number of enrollments with dense_rank(): courses with the
# same count share a rank and no rank numbers are skipped.
enrollments_per_course = count("*").alias("Enrollments")
course_counts = df.groupBy("CourseID").agg(enrollments_per_course)

# No partitionBy: the ranking is global across all courses.
rank_window = Window.orderBy(col("Enrollments").desc())
rank_courses = course_counts.select(
    "*",
    dense_rank().over(rank_window).alias("CourseRank"),
)
rank_courses.show()

# lead(): for each user, look one row ahead (ordered by EnrollDate) to find
# the course they enrolled in next; the user's last enrollment gets NULL.
next_course_window = Window.partitionBy("UserID").orderBy("EnrollDate")
next_course_df = df.select(
    "*",
    lead("CourseID").over(next_course_window).alias("NextCourseID"),
    lead("CourseName").over(next_course_window).alias("NextCourseName"),
)

next_course_columns = ["UserID", "CourseID", "CourseName", "EnrollDate", "NextCourseID", "NextCourseName"]
next_course_df.select(*next_course_columns).show()


+--------+-----------+----------+
|CourseID|Enrollments|CourseRank|
+--------+-----------+----------+
|    C001|          2|         1|
|    C004|          1|         2|
|    C003|          1|         2|
|    C002|          1|         2|
+--------+-----------+----------+

+------+--------+-----------------+----------+------------+---------------+
|UserID|CourseID|       CourseName|EnrollDate|NextCourseID| NextCourseName|
+------+--------+-----------------+----------+------------+---------------+
|  U001|    C001|    Python Basics|2024-04-01|        C003|ML with PySpark|
|  U001|    C003|  ML with PySpark|2024-04-03|        NULL|           NULL|
|  U002|    C002|Excel for Finance|2024-04-02|        NULL|           NULL|
|  U003|    C001|    Python Basics|2024-04-04|        NULL|           NULL|
|  U004|    C004|Digital Marketing|2024-04-05|        NULL|           NULL|
+------+--------+-----------------+----------+------------+---------------+



In [0]:
# 8
# SQL Logic for Dashboard Views
# Register the enrollments DataFrame so the views below can query it by name.
df.createOrReplaceTempView("enrollments")

# Each entry pairs the statement that (re)creates a temp view with the query
# used to preview it. Views are created and shown in this order:
#   daily_enrollments    - enrollment count per EnrollDate
#   category_performance - average rating per Category
#   top_3_courses        - three courses ranked by average rating, then by
#                          enrollment count
dashboard_views = [
    (
        """
CREATE OR REPLACE TEMP VIEW daily_enrollments AS
select EnrollDate, COUNT(*) AS TotalEnrollments
from enrollments GROUP BY EnrollDate
ORDER BY EnrollDate
""",
        "SELECT * FROM daily_enrollments",
    ),
    (
        """
CREATE OR REPLACE TEMP VIEW category_performance AS
select Category, ROUND(AVG(Rating), 2) AS AvgRating
from enrollments where Rating IS NOT NULL
GROUP BY Category;
""",
        "SELECT * FROM category_performance",
    ),
    (
        """
CREATE OR REPLACE TEMP VIEW top_3_courses AS
select CourseID, CourseName, ROUND(AVG(Rating), 2) AS AvgRating, COUNT(*) AS TotalEnrollments
from enrollments
GROUP BY CourseID, CourseName
ORDER BY AvgRating DESC, TotalEnrollments DESC
LIMIT 3;
""",
        "SELECT * FROM top_3_courses",
    ),
]

for create_view_sql, preview_sql in dashboard_views:
    spark.sql(create_view_sql)
    spark.sql(preview_sql).show()


+----------+----------------+
|EnrollDate|TotalEnrollments|
+----------+----------------+
|2024-04-01|               1|
|2024-04-02|               1|
|2024-04-03|               1|
|2024-04-04|               1|
|2024-04-05|               1|
+----------+----------------+

+------------+---------+
|    Category|AvgRating|
+------------+---------+
| Programming|      4.5|
|Productivity|      0.0|
|   Marketing|      4.0|
|Data Science|      0.0|
+------------+---------+

+--------+-----------------+---------+----------------+
|CourseID|       CourseName|AvgRating|TotalEnrollments|
+--------+-----------------+---------+----------------+
|    C001|    Python Basics|      4.5|               2|
|    C004|Digital Marketing|      4.0|               1|
|    C002|Excel for Finance|      0.0|               1|
+--------+-----------------+---------+----------------+



In [0]:
# 9. Time Travel
# View previous version before update/delete
path = "/mnt/data/enrollments_delta"

# Keep the history DataFrame so the timestamp below can be read from it
# instead of being copied by hand from a previous run's output.
history_df = spark.sql(f"DESCRIBE HISTORY delta.`{path}`")
history_df.show(truncate=False)

# Use VERSION AS OF
version_df = spark.read.format("delta").option("versionAsOf", 0).load(path)
version_df.show()

# Use TIMESTAMP AS OF
# A hard-coded timestamp only works for the table instance it was copied
# from: on a table created later it precedes the earliest commit and the
# read fails. Take the commit time of the earliest retained version from
# the table's own history instead. The cast to STRING is done in Spark so
# the value keeps its fractional seconds and is parsed back in the same
# session time zone.
earliest_commit = (
    history_df
    .orderBy("version")
    .selectExpr("CAST(timestamp AS STRING) AS commit_ts")
    .first()
)
timestamp_df = (
    spark.read.format("delta")
    .option("timestampAsOf", earliest_commit["commit_ts"])
    .load(path)
)
timestamp_df.show()



+-------+-----------------------+----------------+----------------------------------+---------+------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp              |userId          |userName                          |operation|operationParameters                                                           |job |notebook          |clusterId           |readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                      

In [0]:
# Export Reporting
# Spark's round is imported under an alias: a bare
# `from pyspark.sql.functions import round` would shadow Python's builtin
# round() for every cell that runs after this one.
from pyspark.sql.functions import count, avg
from pyspark.sql.functions import round as spark_round

# Write to JSON, partitioned by Category (one sub-directory per Category
# value); any existing output at the path is overwritten.
df.write.partitionBy("Category").mode("overwrite").json("/mnt/data/enrollments_by_category")

# Create summary DataFrame:
# CourseName, TotalEnrollments, AvgRating, AvgProgress
# Both averages are rounded to 2 decimal places.
summary_df = df.groupBy("CourseName").agg(
    count("*").alias("TotalEnrollments"),
    spark_round(avg("Rating"), 2).alias("AvgRating"),
    spark_round(avg("ProgressPercent"), 2).alias("AvgProgress")
)

# Save as Parquet
summary_df.write.mode("overwrite").parquet("/mnt/data/course_summary_parquet")
