In [0]:
csv_data = """student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
"""

dbutils.fs.put("dbfs:/tmp/student_scores.csv", csv_data, overwrite=True)

Wrote 234 bytes.


True

##Step 1: Read into DataFrame

In [0]:
df = spark.read.option("header", True).option("inferSchema", True).csv("dbfs:/tmp/student_scores.csv")
df.show()
     

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



##Step 2: Write to Delta

In [0]:
df.write.mode("overwrite").format("delta").save("/tmp/delta/student_scores")

##Step 3: Register Delta Table

In [0]:
spark.sql("DROP TABLE IF EXISTS student_scores")


DataFrame[]

In [0]:
spark.sql("CREATE TABLE default.student_scores USING DELTA")


DataFrame[]

In [0]:
from delta.tables import DeltaTable
delta_table = DeltaTable.forPath(spark, "/tmp/delta/student_scores")
df = delta_table.toDF()
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



#Basic Tasks

In [0]:
df = spark.read.format("delta").load("/tmp/delta/student_scores")
df.createOrReplaceTempView("student_scores")

In [0]:
#1. Show all students and their scores.
spark.sql("SELECT name, score FROM student_scores").show()

+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
|Rahul|   78|
|Sneha|   65|
|Aryan|   55|
| Isha|   88|
|Tanvi|   91|
|Kunal|   72|
|Megha|   60|
|Rohan|   40|
+-----+-----+



In [0]:
#2. Count number of students in each subject.
spark.sql("""SELECT subject, COUNT(*) AS student_count FROM student_scores GROUP BY subject""").show()
     

+-------+-------------+
|subject|student_count|
+-------+-------------+
|Science|            3|
|   Math|            4|
|English|            3|
+-------+-------------+



In [0]:
#3. Find average score per subject.
spark.sql("""SELECT subject, AVG(score) AS avg_score FROM student_scores GROUP BY subject""").show()
#4. List all students who scored more than 80.
spark.sql("""SELECT name FROM student_scores WHERE score > 80""").show()

+-------+-----------------+
|subject|        avg_score|
+-------+-----------------+
|Science|             73.0|
|   Math|            70.25|
|English|75.33333333333333|
+-------+-----------------+

+-----+
| name|
+-----+
|Ankit|
|Divya|
| Isha|
|Tanvi|
+-----+



##Advanced Queries

In [0]:
#5. Show student(s) with the highest score in each subject.
spark.sql("""SELECT name, subject, score FROM student_scores WHERE score = (SELECT MAX(score) FROM student_scores WHERE subject = student_scores.subject)""").show()
#6. Display grades with count of students in each.
spark.sql("""SELECT grade, COUNT(*) AS student_count FROM student_scores GROUP BY grade""").show()
#7. Show names of students who failed (grade F)
spark.sql("""SELECT name FROM student_scores WHERE grade = 'F'""").show()


+-----+-------+-----+
| name|subject|score|
+-----+-------+-----+
|Divya|Science|   92|
+-----+-------+-----+

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    F|            1|
|    B|            2|
|    D|            1|
|    C|            2|
|    A|            4|
+-----+-------------+

+-----+
| name|
+-----+
|Rohan|
+-----+



In [0]:
#8. List students with score between 60 and 90.
spark.sql("""SELECT name, subject, score, RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rank FROM student_scores""").show()
#9. Rank students within each subject based on scores.
spark.sql("""SELECT name, subject, score, RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rank FROM student_scores""").show()


+-----+-------+-----+----+
| name|subject|score|rank|
+-----+-------+-----+----+
| Isha|English|   88|   1|
|Rahul|English|   78|   2|
|Megha|English|   60|   3|
|Tanvi|   Math|   91|   1|
|Ankit|   Math|   85|   2|
|Sneha|   Math|   65|   3|
|Rohan|   Math|   40|   4|
|Divya|Science|   92|   1|
|Kunal|Science|   72|   2|
|Aryan|Science|   55|   3|
+-----+-------+-----+----+

+-----+-------+-----+----+
| name|subject|score|rank|
+-----+-------+-----+----+
| Isha|English|   88|   1|
|Rahul|English|   78|   2|
|Megha|English|   60|   3|
|Tanvi|   Math|   91|   1|
|Ankit|   Math|   85|   2|
|Sneha|   Math|   65|   3|
|Rohan|   Math|   40|   4|
|Divya|Science|   92|   1|
|Kunal|Science|   72|   2|
|Aryan|Science|   55|   3|
+-----+-------+-----+----+



##Update & Delete Tasks

In [0]:
#10.Increase score of all English subject students by 5.
spark.sql("""UPDATE student_scores SET score = score + 5 WHERE subject = 'English'""")
spark.sql("""SELECT * FROM student_scores""").show()

#11.Delete all records where score is less than 50.
spark.sql("""DELETE FROM student_scores WHERE score < 50""")
spark.sql("""SELECT * FROM student_scores""").show()



+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         3|Rahul|English|   93|    B|
|         6| Isha|English|  103|    A|
|         9|Megha|English|   75|    C|
+----------+-----+-------+-----+-----+

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         3|Rahul|English|   93|    B|
|         6| Isha|English|  103|    A|
|         9|Megha|English|   75|    C|
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
+----------+-----+------

In [0]:
#12.Add a new column pass_status (PASS if score >= 50 else FAIL).
spark.sql("""SELECT *, CASE WHEN score >= 50 THEN 'PASS' ELSE 'FAIL' END AS pass_status FROM student_scores;""").show()
     

+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         3|Rahul|English|   93|    B|       PASS|
|         6| Isha|English|  103|    A|       PASS|
|         9|Megha|English|   75|    C|       PASS|
|         1|Ankit|   Math|   85|    A|       PASS|
|         2|Divya|Science|   92|    A|       PASS|
|         4|Sneha|   Math|   65|    C|       PASS|
|         5|Aryan|Science|   55|    D|       PASS|
|         7|Tanvi|   Math|   91|    A|       PASS|
|         8|Kunal|Science|   72|    B|       PASS|
+----------+-----+-------+-----+-----+-----------+



##Data Transformation & Views

In [0]:
#13. Create a temporary view and run SQL to get average scores.
spark.sql("""SELECT subject, ROUND(AVG(score), 2) AS avg_score FROM student_scores GROUP BY subject""").show()


+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     73.0|
|   Math|    80.33|
|English|    90.33|
+-------+---------+



In [0]:
#14. Convert updated DataFrame into a new Delta table called student_scores_v2 .
spark.sql("""CREATE OR REPLACE TABLE student_scores_v2 USING DELTA AS SELECT * FROM student_scores""")


DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
#15. Write the final data to Parquet and JSON formats.
df_final = spark.table("student_scores")
df_final.write.mode("overwrite").parquet("/tmp/parquet/student_scores")
df_final.write.mode("overwrite").json("/tmp/json/student_scores")
     