In [1]:
from pyspark.sql import SparkSession

# Create the Spark session used by the rest of the notebook, or reuse one
# that is already running in this process.
spark = (
    SparkSession.builder
    .appName("CourseEnrollment")
    .getOrCreate()
)


Data Loading

In [2]:
# Load both CSV inputs. The first line of each file is treated as the header
# and Spark infers the column types from the data.
#   df  -> rows of course_enrollments.csv
#   df2 -> rows of course_details.csv
df = spark.read.csv("course_enrollments.csv", header=True, inferSchema=True)
df2 = spark.read.csv("course_details.csv", header=True, inferSchema=True)


In [3]:
# Print the inferred schema, then preview the enrollments
# (up to 20 rows, long cell values truncated -- the show() defaults).
df.printSchema()
df.show(n=20, truncate=True)

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Ja

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# Hand-written schema for course_enrollments.csv, built from
# (column name, Spark type) pairs. Every column is declared nullable.
manual_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("EnrollmentID", StringType()),
        ("StudentName", StringType()),
        ("CourseName", StringType()),
        ("Category", StringType()),
        ("EnrollDate", DateType()),
        ("ProgressPercent", IntegerType()),
        ("Rating", DoubleType()),
        ("Status", StringType()),
    )
])

# Re-read the enrollments with the explicit schema instead of schema inference,
# then print the schema and preview the rows.
df_manual = spark.read.csv(
    "course_enrollments.csv",
    schema=manual_schema,
    header=True,
)
df_manual.printSchema()
df_manual.show()


root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Ja

Filtering and Transformation

In [7]:
# Import only the column functions this notebook actually uses. A wildcard
# import of pyspark.sql.functions would rebind Python builtins such as sum,
# min, max, abs and round to Spark column functions for the rest of the
# kernel session, which silently breaks later plain-Python code.
from pyspark.sql.functions import avg, col, count, lag, lead, month, rank, when, year

# Enrollments whose ProgressPercent is below 50.
df_filtered = df.filter(col("ProgressPercent") < 50)

In [8]:
# Average of the Rating column over the whole enrollments frame, collected to
# the driver as a plain Python value.
avg_rating = df.agg(avg("Rating")).first()[0]

# Fill missing ratings with that average; rows that already have a rating keep it.
df_filled = df.withColumn(
    "Rating",
    when(col("Rating").isNull(), avg_rating).otherwise(col("Rating")),
)

In [9]:
# Add a 0/1 flag column: 1 when Status equals "Active", 0 for every other row
# (any row that does not satisfy the condition falls through to otherwise()).
df_status = df_filled.withColumn(
    "IsActive",
    when(col("Status") == "Active", 1).otherwise(0),
)

Aggregations & Metrics

In [11]:
# Average ProgressPercent for each course.
(
    df_status
    .groupBy("CourseName")
    .agg(avg("ProgressPercent").alias("AvgProgress"))
    .show()
)

+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+



In [12]:
# Number of enrollment rows in each category, reported as StudentCount.
(
    df_status
    .groupBy("Category")
    .agg(count("*").alias("StudentCount"))
    .show()
)

+-----------+------------+
|   Category|StudentCount|
+-----------+------------+
|Programming|           3|
|         AI|           1|
|  Analytics|           2|
+-----------+------------+



In [14]:
# Course with the most enrollment rows: count per course, sort descending,
# and print only the first row.
(
    df_status
    .groupBy("CourseName")
    .count()
    .withColumnRenamed("count", "Enrollments")
    .orderBy("Enrollments", ascending=False)
    .show(1)
)

+--------------------+-----------+
|          CourseName|Enrollments|
+--------------------+-----------+
|Python for Beginners|          2|
+--------------------+-----------+
only showing top 1 row



Joins

In [15]:
# Attach the course details (df2) to every enrollment by CourseName.
# Left join: enrollments without a matching course are kept, with nulls in
# the columns that come from df2.
df_joined = df_status.join(df2, on=["CourseName"], how="left")

In [16]:
# Preview the joined frame (up to 20 rows, long cell values truncated).
df_joined.show(n=20, truncate=True)

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| I

Window Functions

In [17]:
from pyspark.sql.window import Window
# One window per course, rows ordered from highest to lowest ProgressPercent.
window_spec = (
    Window
    .partitionBy("CourseName")
    .orderBy(col("ProgressPercent").desc())
)

# Rank each enrollment within its course according to that ordering.
df_ranked = df_joined.withColumn(
    "Rank",
    rank().over(window_spec),
)

In [18]:
# One window per category, rows ordered by enrollment date.
window_cat = (
    Window
    .partitionBy("Category")
    .orderBy("EnrollDate")
)

# For each enrollment, add the EnrollDate of the following row (NextDate) and
# of the preceding row (PrevDate) within the same category window.
df_leadlag = (
    df_ranked
    .withColumn("NextDate", lead("EnrollDate").over(window_cat))
    .withColumn("PrevDate", lag("EnrollDate").over(window_cat))
)


Pivoting & Formatting

In [19]:
# Cross-tabulate categories against statuses: one row per Category, one column
# per distinct Status value, each cell holding the number of matching rows.
df_pivot = (
    df_joined
    .groupBy("Category")
    .pivot("Status")
    .count()
)

In [20]:
# Split the enrollment date into separate year and month columns.
df_dates = (
    df_joined
    .withColumn("EnrollYear", year(col("EnrollDate")))
    .withColumn("EnrollMonth", month(col("EnrollDate")))
)

Cleaning and Deduplication

In [21]:
# Keep only rows that have a usable Status: drop rows where it is null,
# then drop rows where it is the empty string.
df_clean = (
    df_dates
    .filter(col("Status").isNotNull())
    .filter(col("Status") != "")
)

In [22]:
# Keep a single row per EnrollmentID; the other columns are not compared.
df_deduped = df_clean.drop_duplicates(subset=["EnrollmentID"])

Export

In [23]:
# Export the cleaned, de-duplicated enrollments as CSV, JSON and Parquet.
#
# Each write below is a separate Spark action. Without persisting, every one
# of them re-runs the whole lineage (both CSV reads, the join, the filters and
# the de-duplication). Persisting lets Spark compute the frame once and reuse
# it for all three outputs.
# NOTE(review): dropDuplicates on a key subset does not specify which row it
# keeps, so recomputing per write could in principle export different rows to
# the three outputs -- reusing the persisted frame avoids relying on that.
df_deduped.persist()
try:
    # CSV with a header row; an existing output directory is replaced.
    df_deduped.write.mode("overwrite").option("header", True).csv("final_output_csv")
    # JSON output; an existing output directory is replaced.
    df_deduped.write.mode("overwrite").json("final_output_json")
    # Snappy-compressed Parquet; an existing output directory is replaced.
    df_deduped.write.mode("overwrite").option("compression", "snappy").parquet("final_output_parquet")
finally:
    # Release the cached data even if one of the writes fails.
    df_deduped.unpersist()