### Dataset: Inline CSV – student_scores.csv

In [0]:
# Inline sample data: one row per student with subject, numeric score and letter grade.
csv_data = """student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
"""

# Stage the text as a file on DBFS; overwrite=True replaces any file already at that path.
csv_target = "dbfs:/tmp/student_scores.csv"
dbutils.fs.put(csv_target, csv_data, overwrite=True)

Wrote 234 bytes.


True

Step 1: Read into DataFrame

In [0]:
# Load the staged CSV: the first row supplies the column names and Spark infers the column types.
df = spark.read.csv(
    "dbfs:/tmp/student_scores.csv",
    header=True,
    inferSchema=True,
)
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



Step 2: Write to Delta

In [0]:
# Persist the DataFrame in Delta format, replacing whatever is already stored at delta_path.
delta_path = "/tmp/delta/student_scores"
df.write.save(delta_path, format="delta", mode="overwrite")


Delta table created at: /tmp/delta/student_scores


In [0]:
# Report the location used for the Delta table.
created_label = "Delta table created at:"
print(created_label, delta_path)

Delta table created at: /tmp/delta/student_scores


 Basic Tasks

1. Show all students and their scores

In [0]:
# Reload the current contents of the Delta table and render every row.
delta_reader = spark.read.format("delta")
df = delta_reader.load(delta_path)
display(df)



student_id,name,subject,score,grade
3,Rahul,English,83,B
6,Isha,English,93,A
9,Megha,English,65,C
1,Ankit,Math,85,A
2,Divya,Science,92,A
4,Sneha,Math,65,C
5,Aryan,Science,55,D
7,Tanvi,Math,91,A
8,Kunal,Science,72,B


2. Count number of students in each subject

In [0]:
# Number of rows in the Delta table for each subject.
df = spark.read.format("delta").load(delta_path)
rows_per_subject = df.groupBy("subject").count()
rows_per_subject.show()



+-------+-----+
|subject|count|
+-------+-----+
|English|    3|
|   Math|    4|
|Science|    3|
+-------+-----+



3. Find average score per subject

In [0]:
# Mean score per subject.
df = spark.read.format("delta").load(delta_path)
avg_by_subject = df.groupBy("subject").avg("score")
# The aggregate column comes back as "avg(score)"; rename it before display.
avg_by_subject.withColumnRenamed("avg(score)", "avg_score").show()



+-------+-----------------+
|subject|        avg_score|
+-------+-----------------+
|English|75.33333333333333|
|   Math|            70.25|
|Science|             73.0|
+-------+-----------------+



4. List all students who scored more than 80

In [0]:

# Rows whose score is strictly greater than 80.
df = spark.read.format("delta").load(delta_path)
above_80 = df["score"] > 80
df.where(above_80).show()


+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
+----------+-----+-------+-----+-----+



 Advanced Queries

5. Student(s) with highest score in each subject

In [0]:
from pyspark.sql.functions import col, max as spark_max

df = spark.read.format("delta").load(delta_path)

# Highest score reached in each subject.
subject_best = df.groupBy("subject").agg(spark_max("score").alias("max_score"))

# Alias both sides so the "subject" column they share can be referenced unambiguously.
students = df.alias("d")
best = subject_best.alias("m")

# Keep only the rows whose score equals their subject's maximum (tied students are all kept).
same_subject = col("d.subject") == col("m.subject")
is_top_score = col("d.score") == col("m.max_score")
top_students = students.join(best, same_subject & is_top_score)
top_students.select("d.name", "d.subject", "d.score").show()


+-----+-------+-----+
| name|subject|score|
+-----+-------+-----+
|Divya|Science|   92|
| Isha|English|   88|
|Tanvi|   Math|   91|
+-----+-------+-----+



6. Grades with count of students in each

In [0]:
# Number of rows in the Delta table for each letter grade.
df = spark.read.format("delta").load(delta_path)
rows_per_grade = df.groupBy("grade").count()
rows_per_grade.show()


+-----+-----+
|grade|count|
+-----+-----+
|    A|    4|
|    C|    2|
|    B|    2|
|    F|    1|
|    D|    1|
+-----+-----+



7. Names of students who failed (grade F)

In [0]:
# Names of the students whose grade is exactly "F".
df = spark.read.format("delta").load(delta_path)
failed = df.where(df["grade"] == "F")
failed.select("name").show()


+-----+
| name|
+-----+
|Rohan|
+-----+



8. Students with score between 60 and 90

In [0]:
# Rows with 60 <= score <= 90; Column.between includes both bounds.
df = spark.read.format("delta").load(delta_path)
in_range = df["score"].between(60, 90)
df.where(in_range).show()


+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         6| Isha|English|   88|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
+----------+-----+-------+-----+-----+



9. Rank students within each subject

In [0]:
from pyspark.sql.window import Window
# `col` is imported here as well so this cell does not depend on an earlier cell having run.
from pyspark.sql.functions import col, rank

df = spark.read.format("delta").load(delta_path)

# Rank students inside each subject from highest to lowest score.
# rank() assigns the same rank to tied scores and leaves a gap after them.
windowSpec = Window.partitionBy("subject").orderBy(col("score").desc())
df.withColumn("rank", rank().over(windowSpec)).show()


+----------+-----+-------+-----+-----+----+
|student_id| name|subject|score|grade|rank|
+----------+-----+-------+-----+-----+----+
|         6| Isha|English|   88|    A|   1|
|         3|Rahul|English|   78|    B|   2|
|         9|Megha|English|   60|    C|   3|
|         7|Tanvi|   Math|   91|    A|   1|
|         1|Ankit|   Math|   85|    A|   2|
|         4|Sneha|   Math|   65|    C|   3|
|        10|Rohan|   Math|   40|    F|   4|
|         2|Divya|Science|   92|    A|   1|
|         8|Kunal|Science|   72|    B|   2|
|         5|Aryan|Science|   55|    D|   3|
+----------+-----+-------+-----+-----+----+



 Update, Delete & Transformation Tasks

10. Increase score of all English students by 5

In [0]:
from delta.tables import DeltaTable
# `col` is imported here as well so this cell does not depend on an earlier cell having run.
from pyspark.sql.functions import col

# Add 5 to the score of every English row, directly in the Delta table.
# NOTE: the update is applied in place, so re-running this cell adds another 5 each time;
# re-run the "Write to Delta" step first to get back to the original scores.
# Only `score` is modified; the `grade` column is left as it was.
delta_table = DeltaTable.forPath(spark, delta_path)
delta_table.update(
    condition=col("subject") == "English",
    set={"score": col("score") + 5},
)
display(delta_table.toDF())


student_id,name,subject,score,grade
3,Rahul,English,83,B
6,Isha,English,93,A
9,Megha,English,65,C
1,Ankit,Math,85,A
2,Divya,Science,92,A
4,Sneha,Math,65,C
5,Aryan,Science,55,D
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
10,Rohan,Math,40,F


11. Delete all records where score is less than 50

In [0]:
# Imported here so the cell runs on its own instead of relying on the previous cell's imports.
from delta.tables import DeltaTable
from pyspark.sql.functions import col

# Remove every row with a score below 50 from the Delta table itself, then show what remains.
delta_table = DeltaTable.forPath(spark, delta_path)
delta_table.delete(condition=col("score") < 50)
display(delta_table.toDF())


student_id,name,subject,score,grade
3,Rahul,English,83,B
6,Isha,English,93,A
9,Megha,English,65,C
1,Ankit,Math,85,A
2,Divya,Science,92,A
4,Sneha,Math,65,C
5,Aryan,Science,55,D
7,Tanvi,Math,91,A
8,Kunal,Science,72,B


12. Add pass_status column

In [0]:
# `col` is imported alongside `when` so this cell does not depend on an earlier cell having run.
from pyspark.sql.functions import col, when

# Derive pass_status from score: "PASS" for 50 and above, "FAIL" otherwise.
# The column exists only on this DataFrame; nothing is written back to the Delta table here.
df = (
    spark.read.format("delta").load(delta_path)
    .withColumn("pass_status", when(col("score") >= 50, "PASS").otherwise("FAIL"))
)
display(df)


student_id,name,subject,score,grade,pass_status
3,Rahul,English,83,B,PASS
6,Isha,English,93,A,PASS
9,Megha,English,65,C,PASS
1,Ankit,Math,85,A,PASS
2,Divya,Science,92,A,PASS
4,Sneha,Math,65,C,PASS
5,Aryan,Science,55,D,PASS
7,Tanvi,Math,91,A,PASS
8,Kunal,Science,72,B,PASS


Views & Export Tasks

13. Create temporary view & get average scores

In [0]:
# Register the Delta table contents as a temporary view so it can be queried with SQL.
df = spark.read.format("delta").load(delta_path)
df.createOrReplaceTempView("student_scores_view")

# Mean score per subject, computed through the view.
avg_score_sql = "SELECT subject, AVG(score) AS avg_score FROM student_scores_view GROUP BY subject"
spark.sql(avg_score_sql).show()


+-------+-----------------+
|subject|        avg_score|
+-------+-----------------+
|English|80.33333333333333|
|   Math|80.33333333333333|
|Science|             73.0|
+-------+-----------------+



14. Save updated DataFrame as new Delta table

In [0]:
# Reload the source table so the export does not depend on whichever cell last assigned `df`
# (for example, running this right after the pass_status cell would otherwise save an extra column).
# NOTE(review): this saves the table as stored in Delta, without pass_status - confirm that is intended.
df = spark.read.format("delta").load(delta_path)

# Keep the target path in one place so the write and the message cannot drift apart.
delta_path_v2 = "/tmp/delta/student_scores_v2"
df.write.format("delta").mode("overwrite").save(delta_path_v2)
print(" New Delta table saved at", delta_path_v2)


 New Delta table saved at /tmp/delta/student_scores_v2


15. Write final data to Parquet and JSON

In [0]:
# Reload the source table so the export does not depend on whichever cell last assigned `df`.
df = spark.read.format("delta").load(delta_path)

# Write the same rows in two additional formats, replacing any earlier output at each path.
df.write.mode("overwrite").parquet("/tmp/student_scores_parquet")
df.write.mode("overwrite").json("/tmp/student_scores_json")
print("Data saved in Parquet and JSON formats")


Data saved in Parquet and JSON formats
