In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Obtain the Spark session for this notebook (getOrCreate reuses an existing one
# if present) and echo it as the cell's output.
spark = (
    SparkSession.builder
    .appName("Online Course Enrollments")
    .getOrCreate()
)
spark

In [0]:
# Load the enrollments CSV: the first row is the header and Spark infers the
# column types from the data.
df_infer = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/shared_uploads/azuser3559_mml.local@techademy.com/course_enrollments-1.csv")
)
df_infer.display()


EnrollmentID,StudentName,CourseName,Category,EnrollDate,ProgressPercent,Rating,Status
ENR001,Aditya,Python for Beginners,Programming,2024-05-10,80,4.5,Active
ENR002,Simran,Data Analysis with Excel,Analytics,2024-05-12,100,4.7,Completed
ENR003,Aakash,Power BI Essentials,Analytics,2024-05-13,30,3.8,Active
ENR004,Neha,Java Basics,Programming,2024-05-15,0,,Inactive
ENR005,Zara,Machine Learning 101,AI,2024-05-17,60,4.2,Active
ENR006,Ibrahim,Python for Beginners,Programming,2024-05-18,90,4.6,Completed


In [0]:
# Explicit schema for the enrollments file: (column name, Spark type) pairs,
# every column nullable.
manual_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("EnrollmentID", StringType()),
        ("StudentName", StringType()),
        ("CourseName", StringType()),
        ("Category", StringType()),
        ("EnrollDate", DateType()),
        ("ProgressPercent", IntegerType()),
        ("Rating", DoubleType()),
        ("Status", StringType()),
    ]
])

# Read the same CSV again, this time applying the hand-written schema instead
# of inferring the types.
df_manual = (
    spark.read
    .schema(manual_schema)
    .option("header", True)
    .csv("dbfs:/FileStore/shared_uploads/azuser3559_mml.local@techademy.com/course_enrollments-1.csv")
)
df_manual.display()



EnrollmentID,StudentName,CourseName,Category,EnrollDate,ProgressPercent,Rating,Status
ENR001,Aditya,Python for Beginners,Programming,2024-05-10,80,4.5,Active
ENR002,Simran,Data Analysis with Excel,Analytics,2024-05-12,100,4.7,Completed
ENR003,Aakash,Power BI Essentials,Analytics,2024-05-13,30,3.8,Active
ENR004,Neha,Java Basics,Programming,2024-05-15,0,,Inactive
ENR005,Zara,Machine Learning 101,AI,2024-05-17,60,4.2,Active
ENR006,Ibrahim,Python for Beginners,Programming,2024-05-18,90,4.6,Completed


In [0]:
# Enrollments whose progress is strictly below 50 percent.
df_pp = df_infer.where(col("ProgressPercent") < 50)
df_pp.display()

EnrollmentID,StudentName,CourseName,Category,EnrollDate,ProgressPercent,Rating,Status
ENR003,Aakash,Power BI Essentials,Analytics,2024-05-13,30,3.8,Active
ENR004,Neha,Java Basics,Programming,2024-05-15,0,,Inactive


In [0]:
# Show the enrollments that have no rating.
df_null = df_infer.where(col("Rating").isNull())
df_null.display()

# Mean rating over the whole frame, collected to the driver as a plain Python value.
avg_rating = df_infer.agg(avg("Rating")).first()[0]

# Fill missing ratings with that mean; rows that already have a rating keep it.
rating_or_mean = when(col("Rating").isNull(), avg_rating).otherwise(col("Rating"))
df_filled = df_infer.withColumn("Rating", rating_or_mean)
df_filled.display()

EnrollmentID,StudentName,CourseName,Category,EnrollDate,ProgressPercent,Rating,Status
ENR004,Neha,Java Basics,Programming,2024-05-15,0,,Inactive


EnrollmentID,StudentName,CourseName,Category,EnrollDate,ProgressPercent,Rating,Status
ENR001,Aditya,Python for Beginners,Programming,2024-05-10,80,4.5,Active
ENR002,Simran,Data Analysis with Excel,Analytics,2024-05-12,100,4.7,Completed
ENR003,Aakash,Power BI Essentials,Analytics,2024-05-13,30,3.8,Active
ENR004,Neha,Java Basics,Programming,2024-05-15,0,4.359999999999999,Inactive
ENR005,Zara,Machine Learning 101,AI,2024-05-17,60,4.2,Active
ENR006,Ibrahim,Python for Beginners,Programming,2024-05-18,90,4.6,Completed


In [0]:
# Recode Status as a numeric flag: 1 when the value is "Active", 0 for anything else.
is_active_flag = when(col("Status") == "Active", 1).otherwise(0)
df_status = df_infer.withColumn("Status", is_active_flag)
df_status.display()

EnrollmentID,StudentName,CourseName,Category,EnrollDate,ProgressPercent,Rating,Status
ENR001,Aditya,Python for Beginners,Programming,2024-05-10,80,4.5,1
ENR002,Simran,Data Analysis with Excel,Analytics,2024-05-12,100,4.7,0
ENR003,Aakash,Power BI Essentials,Analytics,2024-05-13,30,3.8,1
ENR004,Neha,Java Basics,Programming,2024-05-15,0,,0
ENR005,Zara,Machine Learning 101,AI,2024-05-17,60,4.2,1
ENR006,Ibrahim,Python for Beginners,Programming,2024-05-18,90,4.6,0


In [0]:
# Average progress percentage per course.
df_progress = (
    df_infer
    .groupBy("CourseName")
    .agg(avg(col("ProgressPercent")).alias("AvgProgress"))
)
df_progress.display()

CourseName,AvgProgress
Data Analysis with Excel,100.0
Java Basics,0.0
Machine Learning 101,60.0
Python for Beginners,85.0
Power BI Essentials,30.0


In [0]:
# Number of enrollment rows in each category, reported as "Student Count".
df_count = (
    df_infer
    .groupBy("Category")
    .agg(count("*").alias("Student Count"))
)
df_count.display()

Category,Student Count
Programming,3
AI,1
Analytics,2


In [0]:
# Most-enrolled course: count rows per course and keep the top one.
# CourseName is a secondary sort key so that a tie on the enrollment count
# always resolves to the same row instead of an arbitrary one.
df_most = (
    df_infer
    .groupBy("CourseName")
    .count()
    .orderBy(desc("count"), asc("CourseName"))
    .limit(1)
)
df_most.display()

CourseName,count
Python for Beginners,2


In [0]:
# Load the course-details CSV (header row, inferred types); it is keyed by CourseName.
df_infer1 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/shared_uploads/azuser3559_mml.local@techademy.com/course_details.csv")
)
df_infer1.display()


CourseName,DurationWeeks,Instructor
Python for Beginners,4,Rakesh
Data Analysis with Excel,3,Anjali
Power BI Essentials,5,Rekha
Java Basics,6,Manoj
Machine Learning 101,8,Samir


In [0]:
# Attach course details to each enrollment. A left join keeps every enrollment
# row even if its course has no entry in the details table.
df_join = df_infer.join(df_infer1, ["CourseName"], "left")
df_join.display()

CourseName,EnrollmentID,StudentName,Category,EnrollDate,ProgressPercent,Rating,Status,DurationWeeks,Instructor
Python for Beginners,ENR001,Aditya,Programming,2024-05-10,80,4.5,Active,4,Rakesh
Data Analysis with Excel,ENR002,Simran,Analytics,2024-05-12,100,4.7,Completed,3,Anjali
Power BI Essentials,ENR003,Aakash,Analytics,2024-05-13,30,3.8,Active,5,Rekha
Java Basics,ENR004,Neha,Programming,2024-05-15,0,,Inactive,6,Manoj
Machine Learning 101,ENR005,Zara,AI,2024-05-17,60,4.2,Active,8,Samir
Python for Beginners,ENR006,Ibrahim,Programming,2024-05-18,90,4.6,Completed,4,Rakesh


In [0]:
from pyspark.sql.window import Window

# Rank students within each course by progress, highest progress first.
# NOTE(review): `window` is also the name of a function in pyspark.sql.functions,
# which this notebook star-imports; this assignment rebinds it. Confirm no cell
# needs that function.
window = (
    Window
    .partitionBy("CourseName")
    .orderBy(desc("ProgressPercent"))
)

progress_rank = rank().over(window)
df_rank = df_join.withColumn("Rank", progress_rank)
df_rank.display()


CourseName,EnrollmentID,StudentName,Category,EnrollDate,ProgressPercent,Rating,Status,DurationWeeks,Instructor,Rank
Data Analysis with Excel,ENR002,Simran,Analytics,2024-05-12,100,4.7,Completed,3,Anjali,1
Java Basics,ENR004,Neha,Programming,2024-05-15,0,,Inactive,6,Manoj,1
Machine Learning 101,ENR005,Zara,AI,2024-05-17,60,4.2,Active,8,Samir,1
Power BI Essentials,ENR003,Aakash,Analytics,2024-05-13,30,3.8,Active,5,Rekha,1
Python for Beginners,ENR006,Ibrahim,Programming,2024-05-18,90,4.6,Completed,4,Rakesh,1
Python for Beginners,ENR001,Aditya,Programming,2024-05-10,80,4.5,Active,4,Rakesh,2


In [0]:
# Within each category, order enrollments by date so that each row can see its
# neighbours' enrollment dates.
date_window = (
    Window
    .partitionBy("Category")
    .orderBy("EnrollDate")
)

# lead = the following row's date, lag = the preceding row's date; both are
# null at the edges of a category.
next_enroll_date = lead("EnrollDate").over(date_window)
prev_enroll_date = lag("EnrollDate").over(date_window)

df_dates = (
    df_join
    .withColumn("NextEnrollDate", next_enroll_date)
    .withColumn("PrevEnrollDate", prev_enroll_date)
)
df_dates.display()

CourseName,EnrollmentID,StudentName,Category,EnrollDate,ProgressPercent,Rating,Status,DurationWeeks,Instructor,NextEnrollDate,PrevEnrollDate
Machine Learning 101,ENR005,Zara,AI,2024-05-17,60,4.2,Active,8,Samir,,
Data Analysis with Excel,ENR002,Simran,Analytics,2024-05-12,100,4.7,Completed,3,Anjali,2024-05-13,
Power BI Essentials,ENR003,Aakash,Analytics,2024-05-13,30,3.8,Active,5,Rekha,,2024-05-12
Python for Beginners,ENR001,Aditya,Programming,2024-05-10,80,4.5,Active,4,Rakesh,2024-05-15,
Java Basics,ENR004,Neha,Programming,2024-05-15,0,,Inactive,6,Manoj,2024-05-18,2024-05-10
Python for Beginners,ENR006,Ibrahim,Programming,2024-05-18,90,4.6,Completed,4,Rakesh,,2024-05-15


In [0]:
# Cross-tabulate enrollments: one row per Category, one column per Status value,
# each cell holding the row count (null where a combination never occurs).
df_pivot = (
    df_join
    .groupBy("Category")
    .pivot("Status")
    .count()
)
df_pivot.display()

Category,Active,Completed,Inactive
Programming,1,1.0,1.0
AI,1,,
Analytics,1,1.0,


In [0]:

# Split the enrollment date into separate year and month columns.
enroll_date = col("EnrollDate")
df_ym = (
    df_dates
    .withColumn("EnrollYear", year(enroll_date))
    .withColumn("EnrollMonth", month(enroll_date))
)
df_ym.display()


CourseName,EnrollmentID,StudentName,Category,EnrollDate,ProgressPercent,Rating,Status,DurationWeeks,Instructor,NextEnrollDate,PrevEnrollDate,EnrollYear,EnrollMonth
Machine Learning 101,ENR005,Zara,AI,2024-05-17,60,4.2,Active,8,Samir,,,2024,5
Data Analysis with Excel,ENR002,Simran,Analytics,2024-05-12,100,4.7,Completed,3,Anjali,2024-05-13,,2024,5
Power BI Essentials,ENR003,Aakash,Analytics,2024-05-13,30,3.8,Active,5,Rekha,,2024-05-12,2024,5
Python for Beginners,ENR001,Aditya,Programming,2024-05-10,80,4.5,Active,4,Rakesh,2024-05-15,,2024,5
Java Basics,ENR004,Neha,Programming,2024-05-15,0,,Inactive,6,Manoj,2024-05-18,2024-05-10,2024,5
Python for Beginners,ENR006,Ibrahim,Programming,2024-05-18,90,4.6,Completed,4,Rakesh,,2024-05-15,2024,5


In [0]:
# Keep only rows that carry a Status value: neither null nor the empty string.
status_present = col("Status").isNotNull()
status_not_blank = col("Status") != ""
df_drop = df_join.where(status_present & status_not_blank)


In [0]:
# Keep one row per EnrollmentID. This builds on df_drop (the frame with
# null/empty Status rows removed) rather than df_join; starting from df_join
# discarded that filter, so it never reached the written output.
df_duplicate = df_drop.dropDuplicates(["EnrollmentID"])

In [0]:
# Persist the de-duplicated enrollments in three formats under one base
# directory, replacing any previous output.
output_path = "/FileStore/cleaned_enrollments"

# The DataFrameWriter csv/json/parquet calls return None, so their results are
# not bound to names: a `df_*` variable here would hold None, not a DataFrame.
# CSV gets a header row so the column names survive the round trip.
df_duplicate.write.mode("overwrite").option("header", True).csv(f"{output_path}/csv")

df_duplicate.write.mode("overwrite").json(f"{output_path}/json")

df_duplicate.write.mode("overwrite").parquet(f"{output_path}/parquet")
