In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, to_date

spark = SparkSession.builder.appName("CourseAnalytics").getOrCreate()

In [0]:
# Load course_enrollments (with space and brackets in name)
df = spark.read.option("header", True).option("inferSchema", True).csv("/Volumes/workspace/default/nithyashree/course_enrollments (1).csv")

# Load course_catalog
df_catalog = spark.read.option("header", True).option("inferSchema", True).csv("/Volumes/workspace/default/nithyashree/course_catalog.csv")

# Show data
df.show()
df_catalog.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|   5.0|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|   4.0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+

+--------+-------------+------------

Data Ingestion & Time Fields

In [0]:
# Load the enrollments dataset (replace with correct path if needed)
df = spark.read.option("header", True).option("inferSchema", True).csv(
    "/Volumes/workspace/default/nithyashree/course_enrollments (1).csv"
)

# Load the course catalog dataset
df_catalog = spark.read.option("header", True).option("inferSchema", True).csv(
    "/Volumes/workspace/default/nithyashree/course_catalog.csv"
)

# Show schemas
df.printSchema()
df_catalog.printSchema()

# Convert date fields to DateType
from pyspark.sql.functions import to_date

df = df.withColumn("EnrollDate", to_date("EnrollDate", "yyyy-MM-dd")) \
       .withColumn("CompletionDate", to_date("CompletionDate", "yyyy-MM-dd"))

df.show()

root
 |-- EnrollID: string (nullable = true)
 |-- UserID: string (nullable = true)
 |-- CourseID: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- CompletionDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)

root
 |-- CourseID: string (nullable = true)
 |-- Instructor: string (nullable = true)
 |-- DurationHours: integer (nullable = true)
 |-- Level: string (nullable = true)

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|
|    E002|  U002|    C002

Add DaysToComplete Column

In [0]:
from pyspark.sql.functions import datediff

# Add column only for completed courses
df = df.withColumn("DaysToComplete", datediff("CompletionDate", "EnrollDate"))

df.select("EnrollID", "UserID", "CourseName", "EnrollDate", "CompletionDate", "DaysToComplete").show()

+--------+------+-----------------+----------+--------------+--------------+
|EnrollID|UserID|       CourseName|EnrollDate|CompletionDate|DaysToComplete|
+--------+------+-----------------+----------+--------------+--------------+
|    E001|  U001|    Python Basics|2024-04-01|    2024-04-10|             9|
|    E002|  U002|Excel for Finance|2024-04-02|          NULL|          NULL|
|    E003|  U001|  ML with PySpark|2024-04-03|          NULL|          NULL|
|    E004|  U003|    Python Basics|2024-04-04|    2024-04-20|            16|
|    E005|  U004|Digital Marketing|2024-04-05|    2024-04-16|            11|
+--------+------+-----------------+----------+--------------+--------------+



 User Learning Path Progress

In [0]:
from pyspark.sql.functions import avg, count, when

df = df.withColumn("IsCompleted", (col("ProgressPercent") == 100))

user_progress = df.groupBy("UserID") \
    .agg(count("*").alias("TotalCourses"),
         avg("ProgressPercent").alias("AvgProgress"))

user_progress.show()

+------+------------+-----------+
|UserID|TotalCourses|AvgProgress|
+------+------------+-----------+
|  U002|           1|       45.0|
|  U001|           2|       65.0|
|  U004|           1|      100.0|
|  U003|           1|      100.0|
+------+------------+-----------+



Engagement Scoring

In [0]:
df = df.withColumn("Rating", when(col("Rating").isNull(), 0).otherwise(col("Rating")))
df = df.withColumn("EngagementScore", col("ProgressPercent") * col("Rating"))
df.select("EnrollID", "UserID", "CourseName", "EngagementScore").show()

+--------+------+-----------------+---------------+
|EnrollID|UserID|       CourseName|EngagementScore|
+--------+------+-----------------+---------------+
|    E001|  U001|    Python Basics|          400.0|
|    E002|  U002|Excel for Finance|            0.0|
|    E003|  U001|  ML with PySpark|            0.0|
|    E004|  U003|    Python Basics|          500.0|
|    E005|  U004|Digital Marketing|          400.0|
+--------+------+-----------------+---------------+



 Identify Drop-offs

In [0]:
dropouts = df.filter((col("ProgressPercent") < 50) & col("CompletionDate").isNull())
dropouts.createOrReplaceTempView("Dropouts")
spark.sql("SELECT * FROM Dropouts").show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+



Join with Metadata

In [0]:
joined = df.join(df_catalog, on="CourseID", how="left")
joined.createOrReplaceTempView("JoinedTable")

# Avg progress per instructor
spark.sql("""
    SELECT Instructor, AVG(ProgressPercent) AS AvgProgress
    FROM JoinedTable
    GROUP BY Instructor
""").show()

# Most enrolled course instructor
spark.sql("""
    SELECT CourseName, Instructor, COUNT(*) AS TotalEnrollments
    FROM JoinedTable
    GROUP BY CourseName, Instructor
    ORDER BY TotalEnrollments DESC
    LIMIT 1
""").show()


+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
| Ibrahim Khan|       30.0|
|  Zoya Sheikh|      100.0|
|Abdullah Khan|      100.0|
|   Sana Gupta|       45.0|
+-------------+-----------+

+-------------+-------------+----------------+
|   CourseName|   Instructor|TotalEnrollments|
+-------------+-------------+----------------+
|Python Basics|Abdullah Khan|               2|
+-------------+-------------+----------------+



Save as Delta Table

In [0]:
df.write.format("delta").mode("overwrite").saveAsTable("enrollments_delta")

Delta Table Updates

In [0]:
# Update ratings for Python Basics
spark.sql("""
MERGE INTO enrollments_delta AS target
USING (
  SELECT * FROM enrollments_delta WHERE CourseName = 'Python Basics'
) AS source
ON target.EnrollID = source.EnrollID
WHEN MATCHED THEN UPDATE SET Rating = 5
""")


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

Delta Table Deletes

In [0]:
# Delete where ProgressPercent is 0
spark.sql("""
DELETE FROM enrollments_delta WHERE ProgressPercent = 0
""")

DataFrame[num_affected_rows: bigint]

View Delta History

In [0]:
spark.sql("DESCRIBE HISTORY enrollments_delta").show(truncate=False)


+-------+-------------------+----------------+--------------------------+---------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+--------+------------------------+-----------+-----------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Window Functions – Rank by Enrollments

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

course_counts = df.groupBy("CourseName").count()
window = Window.orderBy(col("count").desc())

ranked = course_counts.withColumn("Rank", dense_rank().over(window))
ranked.show()



+-----------------+-----+----+
|       CourseName|count|Rank|
+-----------------+-----+----+
|    Python Basics|    2|   1|
|  ML with PySpark|    1|   2|
|Excel for Finance|    1|   2|
|Digital Marketing|    1|   2|
+-----------------+-----+----+



Window Function – Lead for next course

In [0]:
from pyspark.sql.functions import lead

window = Window.partitionBy("UserID").orderBy("EnrollDate")
next_course = df.withColumn("NextCourse", lead("CourseName").over(window))
next_course.select("UserID", "CourseName", "NextCourse", "EnrollDate").show()

+------+-----------------+---------------+----------+
|UserID|       CourseName|     NextCourse|EnrollDate|
+------+-----------------+---------------+----------+
|  U001|    Python Basics|ML with PySpark|2024-04-01|
|  U001|  ML with PySpark|           NULL|2024-04-03|
|  U002|Excel for Finance|           NULL|2024-04-02|
|  U003|    Python Basics|           NULL|2024-04-04|
|  U004|Digital Marketing|           NULL|2024-04-05|
+------+-----------------+---------------+----------+



 SQL Views for Dashboards

In [0]:
# Create or replace a temp view
df.createOrReplaceTempView("enrollments")

# 1. Daily Enrollments
print(" Daily Enrollments")
spark.sql("""
    SELECT EnrollDate, COUNT(*) AS TotalEnrollments
    FROM enrollments
    GROUP BY EnrollDate
    ORDER BY EnrollDate
""").show()

# 2. Category Performance
print(" Category Performance (Average Rating)")
spark.sql("""
    SELECT Category, AVG(Rating) AS AvgRating
    FROM enrollments
    GROUP BY Category
""").show()

# 3. Top 3 Courses by Enrollment Count
print(" Top 3 Courses")
spark.sql("""
    SELECT CourseName, COUNT(*) AS Total
    FROM enrollments
    GROUP BY CourseName
    ORDER BY Total DESC
    LIMIT 3
""").show()


 Daily Enrollments
+----------+----------------+
|EnrollDate|TotalEnrollments|
+----------+----------------+
|2024-04-01|               1|
|2024-04-02|               1|
|2024-04-03|               1|
|2024-04-04|               1|
|2024-04-05|               1|
+----------+----------------+

 Category Performance (Average Rating)
+------------+---------+
|    Category|AvgRating|
+------------+---------+
|Data Science|      0.0|
| Programming|      4.5|
|   Marketing|      4.0|
|Productivity|      0.0|
+------------+---------+

 Top 3 Courses
+-----------------+-----+
|       CourseName|Total|
+-----------------+-----+
|    Python Basics|    2|
|Excel for Finance|    1|
|  ML with PySpark|    1|
+-----------------+-----+



Time Travel – View Previous Versions

In [0]:
# Replace <version_num> with actual value from DESCRIBE HISTORY
spark.read.format("delta").option("versionAsOf", 0).table("enrollments_delta").show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|             9|       true|          400.0|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   0.0|          NULL|      false|            0.0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   0.0|          NULL|      false|            0.0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

Export Reporting

In [0]:
from pyspark.sql.functions import avg, count

# Export as JSON partitioned by Category
df.write.mode("overwrite") \
    .partitionBy("Category") \
    .json("/Volumes/workspace/default/nithyashree/exported_json")

# Summary dataframe
summary = df.groupBy("CourseName") \
    .agg(
        count("*").alias("TotalEnrollments"),
        avg("Rating").alias("AvgRating"),
        avg("ProgressPercent").alias("AvgProgress")
    )

# Export summary as Parquet
summary.write.mode("overwrite") \
    .parquet("/Volumes/workspace/default/nithyashree/course_summary")
