In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CourseData").getOrCreate()

df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_enrollments.csv")
df.printSchema()
df.show()


root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Ja

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True)
])

df = spark.read.option("header", True).schema(schema).csv("file:/Workspace/Shared/course_enrollments.csv")
df.printSchema()
df.show()


root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Ja

In [0]:
# 3. Filter records where ProgressPercent < 50.
from pyspark.sql.functions import col
df.filter(col("ProgressPercent") < 50).show()

# 4. Replace null ratings with average rating.
from pyspark.sql.functions import avg, when, col

avg_rating = df.select(avg("Rating")).first()[0]

df.withColumn("Rating",when(col("Rating").isNull(), avg_rating).otherwise(col("Rating"))).show()

# 5. Add column IsActive → 1 if Status is Active, else 0.
df_status = df.withColumn("IsActive", when(col("Status") == "Active", 1).otherwise(0))
df_status.show()


+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Begin

In [0]:
# Aggregations & Metrics
# 6. Find average progress by course.
from pyspark.sql.functions import round

df.groupBy("CourseName").agg(round(avg("ProgressPercent"), 2).alias("AvgProgress")).show()

# 7. Get count of students in each course category.
df.groupBy("Category").count().withColumnRenamed("count", "StudentCount").show()

# 8. Identify the most enrolled course.
from pyspark.sql.functions import desc

df.groupBy("CourseName").count().orderBy(desc("count")).limit(1).show()


+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+

+-----------+------------+
|   Category|StudentCount|
+-----------+------------+
|Programming|           3|
|         AI|           1|
|  Analytics|           2|
+-----------+------------+

+--------------------+-----+
|          CourseName|count|
+--------------------+-----+
|Python for Beginners|    2|
+--------------------+-----+



In [0]:
# Joins
# 9. Create second CSV: course_details.csv
# CourseName,DurationWeeks,Instructor
# Python for Beginners,4,Rakesh
# Data Analysis with Excel,3,Anjali
# Power BI Essentials,5,Rekha
# Java Basics,6,Manoj
# Machine Learning 101,8,Samir
df_second= spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_details.csv")
df_second.show()
# 10. Join course_enrollments with course_details to include duration and instructor.
df_joined = df.join(df_second, on="CourseName", how="inner")

df_joined.display()


+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+



CourseName,EnrollmentID,StudentName,Category,EnrollDate,ProgressPercent,Rating,Status,DurationWeeks,Instructor
Python for Beginners,ENR001,Aditya,Programming,2024-05-10,80,4.5,Active,4,Rakesh
Data Analysis with Excel,ENR002,Simran,Analytics,2024-05-12,100,4.7,Completed,3,Anjali
Power BI Essentials,ENR003,Aakash,Analytics,2024-05-13,30,3.8,Active,5,Rekha
Java Basics,ENR004,Neha,Programming,2024-05-15,0,,Inactive,6,Manoj
Machine Learning 101,ENR005,Zara,AI,2024-05-17,60,4.2,Active,8,Samir
Python for Beginners,ENR006,Ibrahim,Programming,2024-05-18,90,4.6,Completed,4,Rakesh


In [0]:
# Window Functions
# 11. Rank students in each course based on ProgressPercent .
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

course_window = Window.partitionBy("CourseName").orderBy(col("ProgressPercent").desc())

rank = df.withColumn("ProgressRank", rank().over(course_window))
rank.select("StudentName", "CourseName", "ProgressPercent", "ProgressRank").show()

# 12. Get lead and lag of EnrollDate by Category.
from pyspark.sql.functions import lead, lag

category_window = Window.partitionBy("Category").orderBy("EnrollDate")

df_lead_lag = df.withColumn("NextEnrollDate", lead("EnrollDate", 1).over(category_window)) \
                .withColumn("PrevEnrollDate", lag("EnrollDate", 1).over(category_window))

df_lead_lag.select("StudentName", "Category", "EnrollDate", "PrevEnrollDate", "NextEnrollDate").show()


+-----------+--------------------+---------------+------------+
|StudentName|          CourseName|ProgressPercent|ProgressRank|
+-----------+--------------------+---------------+------------+
|     Simran|Data Analysis wit...|            100|           1|
|       Neha|         Java Basics|              0|           1|
|       Zara|Machine Learning 101|             60|           1|
|     Aakash| Power BI Essentials|             30|           1|
|    Ibrahim|Python for Beginners|             90|           1|
|     Aditya|Python for Beginners|             80|           2|
+-----------+--------------------+---------------+------------+

+-----------+-----------+----------+--------------+--------------+
|StudentName|   Category|EnrollDate|PrevEnrollDate|NextEnrollDate|
+-----------+-----------+----------+--------------+--------------+
|       Zara|         AI|2024-05-17|          NULL|          NULL|
|     Simran|  Analytics|2024-05-12|          NULL|    2024-05-13|
|     Aakash|  Analytics

In [0]:
#Pivoting & Formatting
#13. Pivot data to show total enrollments by Category and Status.
from pyspark.sql.functions import count

df_pivot = df.groupBy("Category").pivot("Status").agg(count("EnrollmentID"))
df_pivot.show()

#14. Extract year and month from EnrollDate .
from pyspark.sql.functions import year, month

df.withColumn("EnrollYear", year("EnrollDate")).withColumn("EnrollMonth", month("EnrollDate")).display()



+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|     NULL|    NULL|
|  Analytics|     1|        1|    NULL|
+-----------+------+---------+--------+



EnrollmentID,StudentName,CourseName,Category,EnrollDate,ProgressPercent,Rating,Status,EnrollYear,EnrollMonth
ENR001,Aditya,Python for Beginners,Programming,2024-05-10,80,4.5,Active,2024,5
ENR002,Simran,Data Analysis with Excel,Analytics,2024-05-12,100,4.7,Completed,2024,5
ENR003,Aakash,Power BI Essentials,Analytics,2024-05-13,30,3.8,Active,2024,5
ENR004,Neha,Java Basics,Programming,2024-05-15,0,,Inactive,2024,5
ENR005,Zara,Machine Learning 101,AI,2024-05-17,60,4.2,Active,2024,5
ENR006,Ibrahim,Python for Beginners,Programming,2024-05-18,90,4.6,Completed,2024,5


In [0]:
# Cleaning and Deduplication
# 15. Drop rows where Status is null or empty.
from pyspark.sql.functions import col, trim

df_cleaned = df.filter((col("Status").isNotNull()) & (trim(col("Status")) != ""))
df_cleaned.show()

# 16. Remove duplicate enrollments using dropDuplicates()

df_cleaned.dropDuplicates(["EnrollmentID"]).show()


+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+--------------------+-----------+-----

In [0]:
# Export
# 17. Write the final cleaned DataFrame to:
# CSV (overwrite mode)
# JSON (overwrite mode)
# Parquet (snappy compression)
df_cleaned.write.mode("overwrite").option("header", True).csv("file:/Workspace/Shared/output/courses_csv")
df_cleaned.write.mode("overwrite").json("file:/Workspace/Shared/output/courses_json")
df_cleaned.write.mode("overwrite").option("compression", "snappy").parquet("file:/Workspace/Shared/output/courses_parquet")
