In [0]:
from pyspark.sql import SparkSession
# Get the Spark session for this notebook, registered under the
# application name "Course Analytics" (getOrCreate reuses an active
# session when one already exists).
spark = (
    SparkSession.builder
    .appName("Course Analytics")
    .getOrCreate()
)

In [0]:
# Load the four source CSV files from the shared workspace folder.
# Each file is read with its first row treated as the header; no schema
# is supplied or inferred here.
students_df = (
    spark.read.format("csv")
    .option("header", True)
    .load("file:/Workspace/Shared/students.csv")
)
courses_df = (
    spark.read.format("csv")
    .option("header", True)
    .load("file:/Workspace/Shared/courses.csv")
)
enrollments_df = (
    spark.read.format("csv")
    .option("header", True)
    .load("file:/Workspace/Shared/enrollments.csv")
)
progress_df = (
    spark.read.format("csv")
    .option("header", True)
    .load("file:/Workspace/Shared/progress.csv")
)


In [0]:
# Build the combined dataset, starting from enrollments and left-joining:
#   1. students on student_id
#   2. courses  on course_id
#   3. progress on the (student_id, course_id) pair
# Left joins keep every enrollment row even when a lookup has no match.
final_df = (
    enrollments_df
    .join(students_df, on="student_id", how="left")
    .join(courses_df, on="course_id", how="left")
    .join(progress_df, on=["student_id", "course_id"], how="left")
)


In [0]:
from pyspark.sql.functions import to_date,col

# Reduce the joined data to the reporting columns:
#   - enrollment_date is converted to a date value
#   - completion is cast to int and exposed under the name "progress"
final_table = final_df.select(
    col("student_id"),
    col("student_name"),
    col("course_id"),
    col("course_name"),
    to_date(col("enrollment_date")).alias("enrollment_date"),
    col("completion").cast("int").alias("progress"),
)

# Render the result in the notebook.
final_table.display()


student_id,student_name,course_id,course_name,enrollment_date,progress
S92,Chris Allen,C103,Blockchain Basics,2024-01-25,43
S51,Anna Wilson,C107,Big Data Analytics,2024-05-27,23
S39,Emily Brown,C109,Cybersecurity,2024-02-19,84
S71,John Young,C102,Web Development,2024-02-01,52
S15,Sarah Taylor,C108,Mobile App Development,2024-01-31,5
S36,Jane Taylor,C103,Blockchain Basics,2024-02-10,54
S40,Sarah Wilson,C108,Mobile App Development,2024-02-03,82
S17,David Taylor,C106,Web Development,2024-01-06,50
S51,Anna Wilson,C110,Artificial Intelligence,2024-04-19,49
S57,David Taylor,C109,Cybersecurity,2024-01-11,6


In [0]:
# Persist the final table as a Delta dataset, replacing any existing
# output at the target path.
(
    final_table.write
    .mode("overwrite")
    .format("delta")
    .save("/mnt/final_output_course_progress_delta")
)

# Persist the same table as CSV with a header row (for dashboard tools),
# also replacing any existing output at the target path.
(
    final_table.write
    .mode("overwrite")
    .option("header", True)
    .csv("/mnt/final_output_course_progress_csv")
)
