# **Assessment-2**

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

In [0]:
# Get the active Spark session, or create one, under the application
# name "dbshell-01".
spark = (
    SparkSession.builder
    .appName("dbshell-01")
    .getOrCreate()
)

# **Ingestion & Time Fields**

In [0]:
# 1.1 Read the enrollments CSV: the first row is the header and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/course_enrollments.csv")
)

In [0]:
# 1.2 Cast EnrollDate and CompletionDate to the date type
df = (
    df
    .withColumn("EnrollDate", F.to_date(F.col("EnrollDate")))
    .withColumn("CompletionDate", F.to_date(F.col("CompletionDate")))
)
df.select("EnrollDate", "CompletionDate").show()

+----------+--------------+
|EnrollDate|CompletionDate|
+----------+--------------+
|2024-04-01|    2024-04-10|
|2024-04-02|          null|
|2024-04-03|          null|
|2024-04-04|    2024-04-20|
|2024-04-05|    2024-04-16|
+----------+--------------+



In [0]:
# 1.3 DaysToComplete = CompletionDate - EnrollDate in days, only for rows
# that have a CompletionDate. A `when` without `otherwise` yields null for
# the remaining rows.
df = df.withColumn(
    "DaysToComplete",
    F.when(
        F.col("CompletionDate").isNotNull(),
        F.datediff(F.col("CompletionDate"), F.col("EnrollDate")),
    ),
)
df.select("UserID", "EnrollDate", "CompletionDate", "DaysToComplete").show()

+------+----------+--------------+--------------+
|UserID|EnrollDate|CompletionDate|DaysToComplete|
+------+----------+--------------+--------------+
|  U001|2024-04-01|    2024-04-10|             9|
|  U002|2024-04-02|          null|          null|
|  U001|2024-04-03|          null|          null|
|  U003|2024-04-04|    2024-04-20|            16|
|  U004|2024-04-05|    2024-04-16|            11|
+------+----------+--------------+--------------+



# **User Learning Path Progress**

In [0]:
# 2.1 Per user: number of distinct courses enrolled in
(
    df
    .groupBy("UserID")
    .agg(F.countDistinct("CourseID").alias("CourseCount"))
    .show()
)

+------+-----------+
|UserID|CourseCount|
+------+-----------+
|  U004|          1|
|  U002|          1|
|  U003|          1|
|  U001|          2|
+------+-----------+



In [0]:
# 2.2 Per user: distinct course count plus the mean ProgressPercent over
# all of the user's enrollments, rounded to 2 decimals
(
    df
    .groupBy("UserID")
    .agg(
        F.countDistinct("CourseID").alias("CourseCount"),
        F.round(F.avg("ProgressPercent"), 2).alias("Average%"),
    )
    .show()
)

+------+-----------+--------+
|UserID|CourseCount|Average%|
+------+-----------+--------+
|  U004|          1|   100.0|
|  U002|          1|    45.0|
|  U003|          1|   100.0|
|  U001|          2|    65.0|
+------+-----------+--------+



In [0]:
# 2.3 IsCompleted is 1 when ProgressPercent is exactly 100, otherwise 0
df = df.withColumn(
    "IsCompleted",
    F.when(F.col("ProgressPercent") == 100, 1).otherwise(0),
)
df.select("UserID", "ProgressPercent", "IsCompleted").show()

+------+---------------+-----------+
|UserID|ProgressPercent|IsCompleted|
+------+---------------+-----------+
|  U001|            100|          1|
|  U002|             45|          0|
|  U001|             30|          0|
|  U003|            100|          1|
|  U004|            100|          1|
+------+---------------+-----------+



# **Engagement Scoring**

In [0]:
# 3.1 Engagement score = ProgressPercent * Rating; rows without a Rating
# get a score of 0
df = df.withColumn(
    "Score",
    F.when(
        F.col("Rating").isNotNull(),
        F.col("ProgressPercent") * F.col("Rating"),
    ).otherwise(0),
)
df.select("UserID", "ProgressPercent", "Rating", "Score").show()

+------+---------------+------+-----+
|UserID|ProgressPercent|Rating|Score|
+------+---------------+------+-----+
|  U001|            100|     4|  400|
|  U002|             45|  null|    0|
|  U001|             30|  null|    0|
|  U003|            100|     5|  500|
|  U004|            100|     4|  400|
+------+---------------+------+-----+



In [0]:
# 3.2 Replace missing Rating values with 0 (only the Rating column is touched)
df = df.na.fill(0, subset=["Rating"])
df.select("Rating").show()

+------+
|Rating|
+------+
|     4|
|     0|
|     0|
|     5|
|     4|
+------+



# **Identify Drop-offs**

In [0]:
# 4.1 Drop-offs: enrollments below 50% progress that have no CompletionDate.
# Two chained `where` calls are combined with AND.
dropouts = (
    df
    .where(F.col("ProgressPercent") < 50)
    .where(F.col("CompletionDate").isNull())
    .select("UserID", "ProgressPercent", "CompletionDate")
)

dropouts.show()

+------+---------------+--------------+
|UserID|ProgressPercent|CompletionDate|
+------+---------------+--------------+
|  U002|             45|          null|
|  U001|             30|          null|
+------+---------------+--------------+



In [0]:
# 4.2 Expose the drop-off rows to Spark SQL as the temp view "dropouts"
dropouts.createOrReplaceTempView(name="dropouts")

# **Joins with Metadata**

In [0]:
# 5.1 Build the course catalog as an in-memory DataFrame, one row per
# course (no CSV file is written here)
columns = ["CourseID", "Instructor", "DurationHours", "Level"]
data = [
    ("C001", "Abdullah Khan", 8, "Beginner"),
    ("C002", "Sana Gupta", 5, "Beginner"),
    ("C003", "Ibrahim Khan", 10, "Intermediate"),
    ("C004", "Zoya Sheikh", 6, "Beginner"),
]

course_catalog = spark.createDataFrame(data, schema=columns)

In [0]:
# 5.2 Attach catalog metadata to each enrollment (inner join on CourseID),
# then report the mean ProgressPercent per instructor, rounded to 2 decimals
dfJoined = df.join(course_catalog, on=["CourseID"], how="inner")
(
    dfJoined
    .groupBy("Instructor")
    .agg(F.round(F.avg("ProgressPercent"), 2).alias("AverageProgress"))
    .show()
)

+-------------+---------------+
|   Instructor|AverageProgress|
+-------------+---------------+
|Abdullah Khan|          100.0|
|   Sana Gupta|           45.0|
| Ibrahim Khan|           30.0|
|  Zoya Sheikh|          100.0|
+-------------+---------------+



In [0]:
# 5.3 Show who teaches the most enrolled course.
# Enrollments are counted per course (CourseID), not per instructor:
# grouping by Instructor alone would add up every course an instructor
# teaches and could rank someone first whose individual courses are each
# less popular than another instructor's single course.
# "CourseCount" holds the number of enrollment rows for that course.
(
    dfJoined
    .groupBy("CourseID", "Instructor")
    .agg(F.count("*").alias("CourseCount"))
    .select("Instructor", "CourseCount")
    .sort("CourseCount", ascending=False)
    .show(1)
)

+-------------+-----------+
|   Instructor|CourseCount|
+-------------+-----------+
|Abdullah Khan|          2|
+-------------+-----------+
only showing top 1 row



# **Delta Lake Practice**

In [0]:
# 6.1 Persist the enrollments as the Delta table `enrollments_delta` in the
# `college` database, which is created on first use and made current.
spark.sql("CREATE DATABASE IF NOT EXISTS college")
spark.sql("USE college")

# Overwrite mode replaces any existing table contents.
df.write.saveAsTable("enrollments_delta", format="delta", mode="overwrite")

In [0]:
# 6.2 In the Delta table, set Rating to 5 for every 'Python Basics' row.
# This statement targets the table; the in-memory `df` is not reassigned here.
spark.sql(
    sqlQuery="""
          UPDATE enrollments_delta
          SET Rating = 5
          WHERE CourseName = 'Python Basics'
          """
)

Out[122]: DataFrame[num_affected_rows: bigint]

In [0]:
# 6.3 In the Delta table, remove every row whose ProgressPercent is 0.
# This statement targets the table; the in-memory `df` is not reassigned here.
spark.sql(
    sqlQuery="""
          DELETE FROM enrollments_delta
          WHERE ProgressPercent = 0
          """
)

Out[123]: DataFrame[num_affected_rows: bigint]

In [0]:
# 6.4 List the table's commit history, showing version and operation only
(
    spark.sql("DESCRIBE HISTORY enrollments_delta")
    .select("version", "operation")
    .show()
)

+-------+--------------------+
|version|           operation|
+-------+--------------------+
|      2|              DELETE|
|      1|              UPDATE|
|      0|CREATE OR REPLACE...|
+-------+--------------------+



# **Window Functions**

In [0]:
# 7.1 Rank courses by enrollment count with dense_rank(), so tied courses
# share a rank and no rank numbers are skipped.
# NumEnrollments counts distinct users per course.
win1 = W.orderBy(F.col("NumEnrollments").desc())
(
    df
    .groupBy("CourseName")
    .agg(F.countDistinct("UserID").alias("NumEnrollments"))
    .withColumn("Rank", F.dense_rank().over(win1))
    .show()
)

+-----------------+--------------+----+
|       CourseName|NumEnrollments|Rank|
+-----------------+--------------+----+
|    Python Basics|             2|   1|
|Digital Marketing|             1|   2|
|Excel for Finance|             1|   2|
|  ML with PySpark|             1|   2|
+-----------------+--------------+----+



In [0]:
# 7.2 For each user, ordered by EnrollDate, look one row ahead with lead()
# to find the next course. The last enrollment of each user gets the
# literal string "None".
win2 = W.partitionBy("UserID").orderBy("EnrollDate")
(
    df
    .select(
        "UserID",
        "CourseName",
        F.lead("CourseName", 1, "None").over(win2).alias("NextCourse"),
    )
    .show()
)

+------+-----------------+---------------+
|UserID|       CourseName|     NextCourse|
+------+-----------------+---------------+
|  U001|    Python Basics|ML with PySpark|
|  U001|  ML with PySpark|           None|
|  U002|Excel for Finance|           None|
|  U003|    Python Basics|           None|
|  U004|Digital Marketing|           None|
+------+-----------------+---------------+



# **SQL Logic for Dashboard Views**

In [0]:
# 8.1 Create views:
# daily_enrollments - number of enrollment rows per EnrollDate.
# The DataFrame variable is spelled to match the view name (it was
# previously misspelled `daily_erollments`).
daily_enrollments = df.groupBy("EnrollDate").agg(
    F.count("*").alias("EnrollmentCount")
)
daily_enrollments.createOrReplaceTempView("daily_enrollments")

In [0]:
# 8.2 Create views:
# category_performance (avg rating by category)
# The average is rounded to 2 decimals, like the other averages in this
# notebook; F.round without a scale rounds to a whole number, which hides
# most of the variation on a small rating scale.
# NOTE(review): null ratings were replaced by 0 in step 3.2, so unrated
# enrollments are included here as 0 and lower the average - confirm that
# this is the intended definition.
category_performance = df.groupBy("Category").agg(
    F.round(F.mean("Rating"), 2).alias("AverageRating")
)
category_performance.createOrReplaceTempView("category_performance")

In [0]:
# 8.3 Create views:
# top_3_courses - the three courses with the most enrollment rows
top_3_courses = (
    df
    .groupBy("CourseName")
    .agg(F.count("*").alias("EnrollmentCount"))
    .orderBy(F.col("EnrollmentCount").desc())
    .limit(3)
)

top_3_courses.createOrReplaceTempView("top_3_courses")

# **Time Travel**

In [0]:
# 9.1 List the commit history with timestamps, to locate the version that
# precedes the update/delete
(
    spark.sql("DESCRIBE HISTORY enrollments_delta")
    .select("version", "operation", "timestamp")
    .show()
)

+-------+--------------------+-------------------+
|version|           operation|          timestamp|
+-------+--------------------+-------------------+
|      2|              DELETE|2025-06-19 06:17:10|
|      1|              UPDATE|2025-06-19 06:16:11|
|      0|CREATE OR REPLACE...|2025-06-19 06:15:12|
+-------+--------------------+-------------------+



In [0]:
# 9.2 Use VERSION AS OF and TIMESTAMP AS OF
# Both queries read the same snapshot: version 0 of the table.
snapshot_version = 0

# Read the commit timestamp of that version from the table history instead
# of hard-coding it: a literal copied from one run's output is wrong (or
# raises) for any other table instance or workspace. Formatting is done in
# Spark so the string uses the session time zone and keeps milliseconds.
snapshot_timestamp = (
    spark.sql("DESCRIBE HISTORY enrollments_delta")
    .filter(F.col("version") == snapshot_version)
    .select(F.date_format("timestamp", "yyyy-MM-dd HH:mm:ss.SSS"))
    .first()[0]
)

spark.sql(
    f"SELECT * FROM enrollments_delta VERSION AS OF {snapshot_version}"
).show()

spark.sql(
    f"SELECT * FROM enrollments_delta TIMESTAMP AS OF '{snapshot_timestamp}'"
).show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+-----+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|Score|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+-----+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|          1|  400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          null|             45|     0|          null|          0|    0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          null|             30|     0|          null|          0|    0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|          1|  500|
|    E005|  U004|   

# **Export Reporting**

In [0]:
# 10.1 Write to JSON, partitioned by Category
# Partitioning is requested with DataFrameWriter.partitionBy, which writes
# one Category=<value> sub-directory per category. Passing "partitionby"
# through .option() is not a partitioning instruction, so the previous code
# produced unpartitioned output.
(
    df.write
    .mode("overwrite")
    .partitionBy("Category")
    .json("/FileStore/tables/enrollment_data")
)

In [0]:
# 10.2 Create summary DataFrame: CourseName, TotalEnrollments, AverageRating, AverageProgress
# Save as Parquet
# The rating column alias is spelled "AverageRating" (previously the typo
# "AvergaeRating", which would be persisted into the Parquet schema).
# NOTE(review): null ratings were replaced by 0 in step 3.2, so unrated
# enrollments count as 0 in AverageRating - confirm this is intended.
dfSummary = df.groupBy("CourseName") \
            .agg(
                F.count("*").alias("TotalEnrollments"),
                F.round(F.mean("Rating"), 2).alias("AverageRating"),
                F.round(F.mean("ProgressPercent"), 2).alias("AverageProgress")
            )
dfSummary.write.mode("overwrite").parquet("/FileStore/tables/enrollment_summary")