In [0]:
spark.sql("USE CATALOG hive_metastore")
spark.sql("USE SCHEMA default")

# Imports used throughout
from pyspark.sql.functions import col, when, avg, max as spark_max, count as spark_count, desc, dense_rank, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from datetime import datetime

In [0]:
# Read CSV from DBFS
df = spark.read.option("header", True).option("inferSchema", True) \
    .csv("/FileStore/tables/students_scores.csv")

df.display()


student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F


In [0]:
df.write.format("delta").mode("overwrite").save("/tmp/delta/student_scores")


In [0]:
spark.sql("DROP TABLE IF EXISTS student_scores")
spark.sql("""
    CREATE TABLE student_scores
    USING DELTA
    LOCATION '/tmp/delta/student_scores'
""")


DataFrame[]

**Basic Tasks**
1. Show all students and their scores.
2. Count number of students in each subject.
3. Find average score per subject.
4. List all students who scored more than 80

In [0]:
# DataFrame API
display(df.select("student_id","name","subject","score").orderBy("student_id"))

# SQL
display(spark.sql("SELECT student_id, name, subject, score FROM student_scores ORDER BY student_id"))


student_id,name,subject,score
1,Ankit,Math,85
2,Divya,Science,92
3,Rahul,English,78
4,Sneha,Math,65
5,Aryan,Science,55
6,Isha,English,88
7,Tanvi,Math,91
8,Kunal,Science,72
9,Megha,English,60
10,Rohan,Math,40


student_id,name,subject,score
1,Ankit,Math,85
2,Divya,Science,92
3,Rahul,English,78
4,Sneha,Math,65
5,Aryan,Science,55
6,Isha,English,88
7,Tanvi,Math,91
8,Kunal,Science,72
9,Megha,English,60
10,Rohan,Math,40


In [0]:
# 2. Count number of students in each subject
display(df.groupBy("subject").agg(spark_count("*").alias("student_count")).orderBy("subject"))
display(spark.sql("SELECT subject, COUNT(*) AS student_count FROM student_scores GROUP BY subject"))

# 3. Find average score per subject
display(df.groupBy("subject").agg(avg("score").alias("avg_score")).orderBy("subject"))
display(spark.sql("SELECT subject, AVG(score) AS avg_score FROM student_scores GROUP BY subject"))

# 4. List all students who scored more than 80
display(df.filter(col("score") > 80).select("student_id","name","subject","score").orderBy(desc("score")))
display(spark.sql("SELECT * FROM student_scores WHERE score > 80"))

subject,student_count
English,3
Math,4
Science,3


subject,student_count
English,3
Math,4
Science,3


subject,avg_score
English,75.33333333333333
Math,70.25
Science,73.0


subject,avg_score
English,75.33333333333333
Math,70.25
Science,73.0


student_id,name,subject,score
2,Divya,Science,92
7,Tanvi,Math,91
6,Isha,English,88
1,Ankit,Math,85


student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
6,Isha,English,88,A
7,Tanvi,Math,91,A


**Advanced Queries**

5. Show student(s) with the highest score in each subject.

6. Display grades with count of students in each.

7. Show names of students who failed (grade F).

8. List students with score between 60 and 80.

9. Rank students within each subject based on scores.

In [0]:
# 5. Student(s) with highest score in each subject
spark.sql("""
    SELECT subject, name, score
    FROM student_scores
    WHERE score IN (
        SELECT MAX(score) FROM student_scores GROUP BY subject
    )
""").show()

# 6. Grades with count of students
spark.sql("SELECT grade, COUNT(*) AS count FROM student_scores GROUP BY grade").show()

# 7. Students who failed
spark.sql("SELECT name FROM student_scores WHERE grade = 'F'").show()

# 8. Students with score between 60 and 80
spark.sql("SELECT * FROM student_scores WHERE score BETWEEN 60 AND 80").show()

# 9. Rank students in each subject
spark.sql("""
    SELECT name, subject, score,
           RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rank
    FROM student_scores
""").show()


+-------+-----+-----+
|subject| name|score|
+-------+-----+-----+
|Science|Divya|   92|
|English| Isha|   88|
|   Math|Tanvi|   91|
+-------+-----+-----+

+-----+-----+
|grade|count|
+-----+-----+
|    A|    4|
|    C|    2|
|    B|    2|
|    F|    1|
|    D|    1|
+-----+-----+

+-----+
| name|
+-----+
|Rohan|
+-----+

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
+----------+-----+-------+-----+-----+

+-----+-------+-----+----+
| name|subject|score|rank|
+-----+-------+-----+----+
| Isha|English|   88|   1|
|Rahul|English|   78|   2|
|Megha|English|   60|   3|
|Tanvi|   Math|   91|   1|
|Ankit|   Math|   85|   2|
|Sneha|   Math|   65|   3|
|Rohan|   Math|   40|   4|
|Divya|Science|   92|   1|
|Kunal|Science|   72|   2|
|Aryan|Science|   55|   3|
+-----+------

** Update & Delete Tasks**

10.  Increase score of all English subject students by 5.

11.  Delete all records where score is less than 50.

12.  Add a new column pass_status (PASS if score >= 50 else FAIL)

In [0]:
from delta.tables import DeltaTable

# Load delta table
delta_table = DeltaTable.forPath(spark, "/tmp/delta/student_scores")

# 10. Increase score of English students by 5
delta_table.update(
    condition="subject = 'English'",
    set={"score": "score + 5"}
)
display(delta_table.toDF())

# 11. Delete students with score less than 50
delta_table.delete("score < 50")
display(delta_table.toDF())




student_id,name,subject,score,grade
3,Rahul,English,93,B
6,Isha,English,103,A
9,Megha,English,75,C
1,Ankit,Math,85,A
2,Divya,Science,92,A
4,Sneha,Math,65,C
5,Aryan,Science,55,D
7,Tanvi,Math,91,A
8,Kunal,Science,72,B


student_id,name,subject,score,grade
3,Rahul,English,93,B
6,Isha,English,103,A
9,Megha,English,75,C
1,Ankit,Math,85,A
2,Divya,Science,92,A
4,Sneha,Math,65,C
5,Aryan,Science,55,D
7,Tanvi,Math,91,A
8,Kunal,Science,72,B


In [0]:
from pyspark.sql.functions import when, col

# Load delta table into DataFrame
df_delta = delta_table.toDF()

# Add new column
df_delta = df_delta.withColumn(
    "pass_status",
    when(col("score") >= 50, "PASS").otherwise("FAIL")
)
display(df_delta)


student_id,name,subject,score,grade,pass_status
3,Rahul,English,93,B,PASS
6,Isha,English,103,A,PASS
9,Megha,English,75,C,PASS
1,Ankit,Math,85,A,PASS
2,Divya,Science,92,A,PASS
4,Sneha,Math,65,C,PASS
5,Aryan,Science,55,D,PASS
7,Tanvi,Math,91,A,PASS
8,Kunal,Science,72,B,PASS


**Data Transformation & Views**

13. Create a temporary view and run SQL to get average scores.

14. Convert updated DataFrame into a new Delta table called student_scores_v2 .

15. Write the final data to Parquet and JSON format.

In [0]:
# 13. Temporary view for SQL queries
df.createOrReplaceTempView("student_scores_view")
spark.sql("SELECT subject, AVG(score) AS avg_score FROM student_scores_view GROUP BY subject").show()

# 14. Save updated data as a new Delta table
delta_table.toDF().write.format("delta").mode("overwrite").save("/tmp/delta/student_scores_v2")

# 15. Write final data to Parquet and JSON
delta_table.toDF().write.mode("overwrite").parquet("/tmp/parquet/student_scores")
delta_table.toDF().write.mode("overwrite").json("/tmp/json/student_scores")


+-------+-----------------+
|subject|        avg_score|
+-------+-----------------+
|English|75.33333333333333|
|   Math|            70.25|
|Science|             73.0|
+-------+-----------------+

