In [2]:
from google.colab import files

# This opens a file upload dialog in Colab
uploaded = files.upload()

Saving progress.csv to progress.csv
Saving enrollments.csv to enrollments.csv


#Load the datasets using PySpark


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CourseAnalysis").getOrCreate()

# Load datasets
df_enroll = spark.read.csv("enrollments.csv", header=True, inferSchema=True)
df_progress = spark.read.csv("progress.csv", header=True, inferSchema=True)

df_enroll.show(5)
df_progress.show(5)


+----------+---------+---------------+
|student_id|course_id|enrollment_date|
+----------+---------+---------------+
|      S008|     C002|     2024-04-28|
|      S017|     C004|     2024-01-03|
|      S009|     C005|     2024-04-07|
|      S018|     C002|     2024-06-03|
|      S002|     C001|     2024-03-28|
+----------+---------+---------------+
only showing top 5 rows

+----------+---------+--------+
|student_id|course_id|progress|
+----------+---------+--------+
|      S008|     C002|   13.36|
|      S017|     C004|   64.28|
|      S009|     C005|   20.01|
|      S018|     C002|   72.63|
|      S002|     C001|   64.31|
+----------+---------+--------+
only showing top 5 rows



# Join the two DataFrames

In [4]:
df_joined = df_enroll.join(df_progress, on=["student_id", "course_id"], how="inner")
df_joined.show(5)


+----------+---------+---------------+--------+
|student_id|course_id|enrollment_date|progress|
+----------+---------+---------------+--------+
|      S008|     C002|     2024-04-28|   70.36|
|      S008|     C002|     2024-04-28|   13.36|
|      S017|     C004|     2024-01-03|    56.8|
|      S017|     C004|     2024-01-03|   64.28|
|      S009|     C005|     2024-04-07|   20.01|
+----------+---------+---------------+--------+
only showing top 5 rows



# Group by course and analyze

In [5]:
from pyspark.sql.functions import col, count, when

df_summary = df_joined.groupBy("course_id").agg(
    count("student_id").alias("Total_Enrolled"),
    count(when(col("progress") >= 100, True)).alias("Total_Completed"),
    count(when(col("progress") < 100, True)).alias("Total_Dropped")
)

df_summary.show()


+---------+--------------+---------------+-------------+
|course_id|Total_Enrolled|Total_Completed|Total_Dropped|
+---------+--------------+---------------+-------------+
|     C003|            28|              0|           28|
|     C004|             8|              0|            8|
|     C005|             9|              0|            9|
|     C001|            12|              0|           12|
|     C002|            19|              0|           19|
+---------+--------------+---------------+-------------+



#Show Top Courses

In [6]:
# Top 5 completed
df_summary.orderBy(col("Total_Completed").desc()).show(5)

# Top 5 dropped
df_summary.orderBy(col("Total_Dropped").desc()).show(5)


+---------+--------------+---------------+-------------+
|course_id|Total_Enrolled|Total_Completed|Total_Dropped|
+---------+--------------+---------------+-------------+
|     C003|            28|              0|           28|
|     C004|             8|              0|            8|
|     C005|             9|              0|            9|
|     C001|            12|              0|           12|
|     C002|            19|              0|           19|
+---------+--------------+---------------+-------------+

+---------+--------------+---------------+-------------+
|course_id|Total_Enrolled|Total_Completed|Total_Dropped|
+---------+--------------+---------------+-------------+
|     C003|            28|              0|           28|
|     C002|            19|              0|           19|
|     C001|            12|              0|           12|
|     C005|             9|              0|            9|
|     C004|             8|              0|            8|
+---------+--------------+----