In [0]:
# Step 0: Stage the sample data.
# The text below is a header row followed by ten student score records.
csv_data = """student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
"""

# Upload the text to DBFS; overwrite=True replaces a copy left by an earlier run.
dbutils.fs.put(
    "/tmp/student_scores.csv",
    csv_data,
    overwrite=True,
)

Wrote 234 bytes.


True

In [0]:
# Step 1: Load the staged CSV into a Spark DataFrame.
# The first line supplies the column names; Spark infers each column's type.
df = spark.read.csv(
    "/tmp/student_scores.csv",
    header=True,
    inferSchema=True,
)
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [0]:
# Step 2: Persist the DataFrame in Delta format at a DBFS path.
# "overwrite" mode replaces whatever an earlier run stored at that location.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/tmp/delta/student_scores")
)

In [0]:
# Write the DataFrame as Delta files under /tmp/delta/student_scores.
# NOTE(review): this is a path-based save (it does not go through saveAsTable)
# and it repeats the write already done in the "Step 2" cell.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.save("/tmp/delta/student_scores")

In [0]:
# Overwrite the Delta files at the target path with the current DataFrame ...
delta_path = "/tmp/delta/student_scores"
df.write.format("delta").mode("overwrite").save(delta_path)

# ... then load the same location back and print it to confirm the round trip.
df_delta = spark.read.format("delta").load(delta_path)
df_delta.show()


+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [0]:
# List the tables currently registered in the `default` schema.
tables_in_default = spark.sql("SHOW TABLES IN default")
tables_in_default.show()

+--------+------------------+-----------+
|database|         tableName|isTemporary|
+--------+------------------+-----------+
| default|sales_transactions|      false|
+--------+------------------+-----------+



In [0]:
# Register the DataFrame as the Delta table default.student_scores.
#
# overwriteSchema makes the overwrite replace the table's columns as well as
# its rows. A later cell adds a pass_status column with ALTER TABLE; without
# this option a re-run of the notebook would keep that wider schema and the
# ALTER TABLE cell would then fail because the column already exists.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("default.student_scores")
)

# Read the table back through the catalog and render it to confirm the write.
df_delta = spark.read.table("default.student_scores")
display(df_delta)

student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F


BASIC QUERIES



In [0]:
%sql
-- Show every row and column of the student score table.
SELECT
    *
FROM
    default.student_scores;

student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F


In [0]:
%sql
-- Count the rows recorded for each subject.
SELECT
    s.subject,
    COUNT(*) AS student_count
FROM default.student_scores AS s
GROUP BY
    s.subject;

subject,student_count
Science,3
Math,4
English,3


In [0]:
%sql
-- Mean score per subject.
SELECT
    s.subject,
    AVG(s.score) AS average_score
FROM default.student_scores AS s
GROUP BY
    s.subject;

subject,average_score
Science,73.0
Math,70.25
English,75.33333333333333


In [0]:
%sql
-- Full records of students who scored strictly above 80.
SELECT
    *
FROM default.student_scores AS s
WHERE
    s.score > 80;

student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
6,Isha,English,88,A
7,Tanvi,Math,91,A


ADVANCED QUERIES


In [0]:
%sql
-- Top scorer(s) in each subject: first find each subject's best score, then
-- keep only the student rows that match a (subject, best score) pair.
-- Students tied for the best score in a subject are all returned.
WITH subject_best AS (
    SELECT
        subject,
        MAX(score) AS top_score
    FROM default.student_scores
    GROUP BY subject
)
SELECT
    s.subject,
    s.name,
    s.score
FROM default.student_scores AS s
LEFT SEMI JOIN subject_best AS b
    ON  s.subject = b.subject
    AND s.score   = b.top_score;

subject,name,score
Science,Divya,92
English,Isha,88
Math,Tanvi,91


In [0]:
%sql
-- Count the rows recorded for each letter grade.
SELECT
    s.grade,
    COUNT(*) AS student_count
FROM default.student_scores AS s
GROUP BY
    s.grade;

grade,student_count
F,1
B,2
D,1
C,2
A,4


In [0]:
%sql
-- Names of the students whose grade is F.
SELECT
    s.name
FROM default.student_scores AS s
WHERE
    s.grade = 'F';

name
Rohan


In [0]:
%sql
-- Students scoring from 60 to 80, both bounds included.
SELECT
    s.name,
    s.score
FROM default.student_scores AS s
WHERE
    s.score >= 60
    AND s.score <= 80;

name,score
Rahul,78
Sneha,65
Kunal,72
Megha,60


In [0]:
%sql
-- Rank students inside each subject, highest score first.
-- RANK() gives tied scores the same rank and skips the following rank(s).
SELECT
    subject,
    name AS student_name,
    score,
    RANK() OVER subject_by_score AS rank
FROM default.student_scores
WINDOW subject_by_score AS (PARTITION BY subject ORDER BY score DESC);

subject,student_name,score,rank
English,Isha,88,1
English,Rahul,78,2
English,Megha,60,3
Math,Tanvi,91,1
Math,Ankit,85,2
Math,Sneha,65,3
Math,Rohan,40,4
Science,Divya,92,1
Science,Kunal,72,2
Science,Aryan,55,3


Update & Delete Task

In [0]:
%sql
-- Add 5 points to every English score.
-- Not idempotent: each execution adds another 5 points.
UPDATE default.student_scores
   SET score = score + 5
 WHERE subject = 'English';

num_affected_rows
3


In [0]:
%sql
-- Remove every record whose score is below 50.
DELETE
  FROM default.student_scores
 WHERE score < 50;

num_affected_rows
1


In [0]:
%sql
-- Extend the table with a text column for the pass/fail flag.
ALTER TABLE default.student_scores
    ADD COLUMNS (pass_status STRING);

-- Fill the flag for every row: a score of 50 or more is a PASS,
-- anything else is a FAIL.
UPDATE default.student_scores
   SET pass_status = CASE WHEN score >= 50 THEN 'PASS' ELSE 'FAIL' END;

num_affected_rows
9


Data Transformation and Views

In [0]:
%sql
-- Add 5 points to every English score.
-- NOTE(review): the same update already ran in the "Update & Delete Task"
-- section, so running this cell raises English scores a second time (the
-- recorded English average goes from 75.33 to 85.33). Confirm the double
-- bonus is intended.
UPDATE default.student_scores
   SET score = score + 5
 WHERE subject = 'English';

num_affected_rows
3


In [0]:
%sql
-- Remove every record whose score is below 50.
-- The recorded run affected 0 rows: the earlier DELETE had already removed
-- the only score under 50.
DELETE
  FROM default.student_scores
 WHERE score < 50;

num_affected_rows
0


In [0]:
%sql
-- Recompute the pass/fail flag for every row from the current scores:
-- 50 or more is a PASS, anything else is a FAIL.
UPDATE default.student_scores
   SET pass_status = CASE WHEN score >= 50 THEN 'PASS' ELSE 'FAIL' END;

num_affected_rows
9


In [0]:
# Load the current table contents and expose them to SQL as a temp view.
# Note: this rebinds `df` (earlier the CSV-loaded frame) to the table data;
# the cells that follow write out this updated version.
df = spark.read.table("default.student_scores")
df.createOrReplaceTempView("student_scores_view")

# Per-subject mean score, computed through the view.
avg_score_query = """
SELECT subject, AVG(score) AS average_score
FROM student_scores_view
GROUP BY subject
"""
avg_scores = spark.sql(avg_score_query)
display(avg_scores)

subject,average_score
Science,73.0
Math,80.33333333333333
English,85.33333333333333


In [0]:
# Store the updated DataFrame as a second Delta table, default.student_scores_v2.
# "overwrite" mode replaces the table contents if an earlier run created it.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("default.student_scores_v2")
)

In [0]:
# Export the final data in two more formats, replacing any earlier export.

# Parquet copy
(
    df.write
    .mode("overwrite")
    .parquet("/tmp/student_scores_v2.parquet")
)

# JSON copy
(
    df.write
    .mode("overwrite")
    .json("/tmp/student_scores_v2.json")
)