In [1]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, when, desc, year, month, lead, lag, rank
from pyspark.sql.window import Window

# Obtain the notebook's Spark session (getOrCreate reuses an already-running
# session if there is one, otherwise it starts a new one with this app name).
spark = (
    SparkSession.builder
    .appName("CourseEnrollments")
    .getOrCreate()
)

In [2]:
# Load the enrollments CSV, treating the first line as column names and
# letting Spark infer each column's type from the data.
df_infer = spark.read.csv(
    "course_enrollments.csv",
    header=True,
    inferSchema=True,
)
df_infer.printSchema()

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# Explicit schema for course_enrollments.csv: (column name, Spark type) pairs
# in file order. Every column is declared nullable.
enrollment_fields = [
    ("EnrollmentID", StringType()),
    ("StudentName", StringType()),
    ("CourseName", StringType()),
    ("Category", StringType()),
    ("EnrollDate", DateType()),
    ("ProgressPercent", IntegerType()),
    ("Rating", DoubleType()),
    ("Status", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in enrollment_fields]
)

# Re-read the same file, this time applying the hand-written schema instead of
# inferring types.
df_manual = spark.read.csv("course_enrollments.csv", header=True, schema=schema)
df_manual.printSchema()

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)



**3. Filter records where ProgressPercent < 50**

---



In [10]:
# Enrollments that are less than half complete.
below_half_progress = df_manual["ProgressPercent"] < 50
df_filtered = df_manual.filter(below_half_progress)
df_filtered.show()




+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+



**4. Replace null ratings with average rating**


In [12]:
from pyspark.sql.functions import avg, when, col

# Mean of the Rating column (avg ignores nulls). The aggregate yields a single
# one-column row, which is unpacked directly into avg_rating.
(avg_rating,) = df_manual.select(avg("Rating")).first()

# Substitute the mean wherever Rating is null; keep existing ratings untouched.
rating_or_mean = when(col("Rating").isNull(), avg_rating).otherwise(col("Rating"))
df_rating_filled = df_manual.withColumn("Rating", rating_or_mean)


**5. Add column IsActive: 1 if Status is Active, else 0**

In [13]:
# IsActive is 1 only when Status is exactly "Active"; every other value
# (including null) falls through to 0.
is_active_flag = when(col("Status") == "Active", 1).otherwise(0)
df_flagged = df_rating_filled.withColumn("IsActive", is_active_flag)

**6. Average progress by course**

In [14]:
# Mean ProgressPercent per course, reported in a column named AvgProgress.
mean_progress = avg("ProgressPercent").alias("AvgProgress")
(
    df_flagged
    .groupBy("CourseName")
    .agg(mean_progress)
    .show()
)

+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+



**7. Count of students in each course category**

In [15]:
# Number of enrollment rows in each course category.
(
    df_flagged
    .groupBy("Category")
    .count()
    .show()
)

+-----------+-----+
|   Category|count|
+-----------+-----+
|Programming|    3|
|         AI|    1|
|  Analytics|    2|
+-----------+-----+



**8. Most enrolled course**

In [16]:
from pyspark.sql.functions import desc

# Count enrollments per course and show the single most-enrolled one.
# Sorting on "count" alone leaves the order of tied courses unspecified, so
# limit(1) could report a different course from run to run when several share
# the top count. CourseName is added as a secondary (ascending) sort key so
# that ties are always resolved the same way.
(
    df_flagged
    .groupBy("CourseName")
    .count()
    .orderBy(desc("count"), "CourseName")
    .limit(1)
    .show()
)

+--------------------+-----+
|          CourseName|count|
+--------------------+-----+
|Python for Beginners|    2|
+--------------------+-----+



**9. Create & Load course_details.csv**

In [17]:
# Load the course details CSV with a header row and inferred column types.
df_details = spark.read.csv(
    "course_details.csv",
    header=True,
    inferSchema=True,
)

**10. Join enrollments with course_details**

In [18]:
# Left join on CourseName: every enrollment row is kept, and the course detail
# columns are null for any course missing from df_details.
df_joined = df_flagged.join(df_details, "CourseName", "left")
df_joined.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| I

**11. Rank students in each course by ProgressPercent**

In [20]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# One window per course, highest ProgressPercent first.
window_spec = (
    Window
    .partitionBy("CourseName")
    .orderBy(desc("ProgressPercent"))
)

# rank() gives tied students the same rank and leaves a gap after the tie.
progress_rank = rank().over(window_spec)
df_ranked = df_joined.withColumn("Rank", progress_rank)
df_ranked.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|            6|     Manoj|   1|
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|      

**12. Lead & Lag of EnrollDate by Category**

In [21]:
from pyspark.sql.functions import lead, lag

# Order each Category partition chronologically. EnrollmentID is a secondary
# sort key so that rows sharing the same EnrollDate get a stable order;
# with EnrollDate alone, tied rows could be arranged differently on each run
# and lead/lag would pair them with different neighbours.
# NOTE(review): assumes EnrollmentID is unique per row — confirm against the data.
window_cat = Window.partitionBy("Category").orderBy("EnrollDate", "EnrollmentID")

# LeadDate is the EnrollDate of the next row in the partition and LagDate that
# of the previous row; rows with no such neighbour get null.
df_lead_lag = (
    df_ranked
    .withColumn("LeadDate", lead("EnrollDate").over(window_cat))
    .withColumn("LagDate", lag("EnrollDate").over(window_cat))
)

**13. Pivot: Total enrollments by Category and Status**

In [22]:
# Cross-tabulate enrollments: one row per Category, one column per distinct
# Status value, cells holding the row count (null where a combination is absent).
df_pivot = (
    df_joined
    .groupBy("Category")
    .pivot("Status")
    .count()
)
df_pivot.show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|     NULL|    NULL|
|  Analytics|     1|        1|    NULL|
+-----------+------+---------+--------+



**14. Extract year and month from EnrollDate**

In [24]:
from pyspark.sql.functions import year, month

# Split EnrollDate into separate calendar-year and month-number columns.
enroll_year = year("EnrollDate")
enroll_month = month("EnrollDate")

df_dates = (
    df_joined
    .withColumn("EnrollYear", enroll_year)
    .withColumn("EnrollMonth", enroll_month)
)

df_dates.show()



+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|      2024|          5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|      2024|          5|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|   

**15. Drop rows where Status is null or empty**

In [25]:
# Keep only rows whose Status contains at least one non-whitespace character.
# Null and empty-string statuses are dropped as before; whitespace-only values
# such as " " are now dropped too, since they are effectively empty but would
# pass a plain `!= ""` comparison.
# rlike performs a partial regex match, so r"\S" is true for any string that
# has a non-whitespace character anywhere in it.
df_cleaned = df_dates.filter(col("Status").isNotNull() & col("Status").rlike(r"\S"))


**16. Remove duplicate enrollments**

In [26]:
# Keep a single row per EnrollmentID; rows are compared on this key only.
dedupe_keys = ["EnrollmentID"]
df_deduped = df_cleaned.dropDuplicates(dedupe_keys)

**17. Write final DataFrame**

In [27]:
# Each write below is a separate Spark action. Without caching, every one of
# them re-runs the whole lineage behind df_deduped (file reads, join,
# de-duplication), and because dropDuplicates does not promise which duplicate
# row it keeps, the three outputs are not guaranteed to contain identical rows.
# Caching lets the result be computed once and reused by all three writers.
# (Caching is best-effort: partitions evicted from the cache are recomputed.)
df_deduped.cache()
try:
    # CSV (with a header row)
    df_deduped.write.mode("overwrite").csv("output/final_enrollments_csv", header=True)

    # JSON (one JSON object per line)
    df_deduped.write.mode("overwrite").json("output/final_enrollments_json")

    # Parquet (snappy compression)
    df_deduped.write.mode("overwrite").option("compression", "snappy").parquet("output/final_enrollments_parquet")
finally:
    # Release the cached data whether or not the writes succeeded.
    df_deduped.unpersist()
