In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W
from pyspark.sql.types import *

#**Data Loading**

In [0]:
# 1. Load the data with schema inference enabled.
df = spark.read.csv(r"file:/Workspace/Shared/jun-16/course_enrollments.csv", header=True, inferSchema=True)

In [0]:
# 2. Manually define schema and compare both approaches.
schema = StructType([
    StructField("EnrollmentID", StringType(), nullable=False),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", FloatType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Status", StringType(), True),
])

dfManual = spark.read.csv(r"file:/Workspace/Shared/jun-16/course_enrollments.csv", header=True, schema=schema)

In [0]:
df.printSchema()

dfManual.printSchema()

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: float (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Status: string (nullable = true)



#**Filtering and Transformation**

In [0]:
# 3. Filter records where ProgressPercent < 50 .
df.filter(df.ProgressPercent < 50).show()

+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+



In [0]:
# 4. Replace null ratings with average rating.
avgRating = df.select((F.mean("Rating"))).collect()[0][0]
df = df.fillna(avgRating, subset=["Rating"])
df.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|
|      E

In [0]:
# 5. Add column IsActive → 1 if Status is Active, else 0.
df = df.withColumn("IsActive", F.when(F.col("Status") == "Active", 1).otherwise(0))
df.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|
|      E

#**Aggregation & Metrics**

In [0]:
# 6. Find average progress by course.
df.groupBy("CourseName").agg(
  F.round(F.mean("ProgressPercent")).alias("AverageProgress")
).show(truncate=False)

+------------------------+---------------+
|CourseName              |AverageProgress|
+------------------------+---------------+
|Data Analysis with Excel|100.0          |
|Java Basics             |0.0            |
|Machine Learning 101    |60.0           |
|Python for Beginners    |85.0           |
|Power BI Essentials     |30.0           |
+------------------------+---------------+



In [0]:
# 7. Get count of students in each course category.
df.groupBy("Category").agg(
    F.count("EnrollmentID").alias("StudentCount")
).show()

+-----------+------------+
|   Category|StudentCount|
+-----------+------------+
|Programming|           3|
|         AI|           1|
|  Analytics|           2|
+-----------+------------+



In [0]:
# 8. Identify the most enrolled course.
df.groupBy("CourseName").agg(
    F.count("EnrollmentID").alias("StudentCount")
).sort("StudentCount", ascending=False).show(1)

+--------------------+------------+
|          CourseName|StudentCount|
+--------------------+------------+
|Python for Beginners|           2|
+--------------------+------------+
only showing top 1 row


#**Joins**

In [0]:
# 9. Create second CSV: course_details.csv
dfCourse = spark.read.csv(r"file:/Workspace/Users/azuser3563_mml.local@techademy.com/course_details.csv", header=True, inferSchema=True)

In [0]:
# 10. Join course_enrollments with course_details to include duration and instructor.
dfJoined = df.join(dfCourse, on="CourseName", how="inner")
dfJoined.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| I

#**Window Functions**

In [0]:
# 11. Rank students in each course based on ProgressPercent .
rankWindow = W.partitionBy("CourseName").orderBy(F.desc("ProgressPercent"))
df.withColumn("Rank", F.rank().over(rankWindow)).show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|Rank|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----+
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|   1|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|   1|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|   1|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|   1|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|           

In [0]:
# 12. Get lead and lag of EnrollDate by Category.
leadLagWindow = W.partitionBy("Category").orderBy("EnrollDate")

df.withColumn("Lead", F.lead("EnrollDate").over(leadLagWindow)) \
  .withColumn("Lag", F.lag("EnrollDate").over(leadLagWindow)) \
  .show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|      Lead|       Lag|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+----------+
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|      NULL|      NULL|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|2024-05-13|      NULL|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|      NULL|2024-05-12|
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|2

#**Pivoting and Formatting**

In [0]:
# 13. Pivot data to show total enrollments by Category and Status.
df.groupBy("Category").pivot("Status").count().fillna(0).show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|        0|       0|
|  Analytics|     1|        1|       0|
+-----------+------+---------+--------+



In [0]:
# 14. Extract year and month from EnrollDate .
df.withColumn("Year", F.year(df.EnrollDate)) \
    .withColumn("Month", F.month(df.EnrollDate)) \
    .show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----+-----+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|Year|Month|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----+-----+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|2024|    5|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|2024|    5|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|2024|    5|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|2024|    5|
|      ENR005|       Zara|Machine Learning 101|        

#**Cleaning and Deduplication**

In [0]:
# 15. Drop rows where Status is null or empty.
dfCleaned = dfJoined.dropna(subset=["Status"])
dfCleaned.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| I

In [0]:
# 16. Remove duplicate enrollments using dropDuplicates() .
dfCleaned = df.dropDuplicates(subset=["EnrollmentID"])
dfCleaned.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|              4.6|Completed|       0|
|      E

#**Export**

In [0]:
# 17. Write the final cleaned DataFrame to:
dfCleaned.write.mode("overwrite").csv("file:/Workspace/Shared/jun-16/dfCleanedCSV")
dfCleaned.write.mode("overwrite").json("file:/Workspace/Shared/jun-16/dfCleanedJSON")
dfCleanedpd = dfCleaned.toPandas()
dfCleanedpd.to_parquet("file:/Workspace/Shared/jun-16/dfCleanedPARQUET.parquet", compression="snappy")