# Parquet vs CSV vs JSON: EXERCISE

Tasks & Questions (15 Total)
1. Read the above data from CSV into a DataFrame and print the schema.
2. Read the same data from JSON and compare with the CSV schema. Any differences?
3. Convert the CSV data into Parquet format and save to disk.
4. Measure the size of CSV vs JSON vs Parquet on disk. Which one is smallest?
5. Add a column total_revenue = price * quantity for each record.
6. Find the top 3 products with the highest total revenue.
7. Filter and display only Furniture products with price > 3000.
8. Create a new column price_band with values:
'High' if price > 10000
'Medium' if 3000 < price <= 10000
'Low' if price <= 3000
9. Group by category and calculate total quantity sold.
10. Calculate average price of products for each category.
11. Count how many products fall in each price_band.
12. Write the filtered Electronics products (price > 5000) into a Parquet file.
13. Write the Stationery products into a JSON file.
14. Load Parquet back and run a query to find which category has the highest total
revenue.
15. BONUS: Create a temporary view from the DataFrame and run Spark SQL to find all
products with quantity > 100 and price < 1000.

In [0]:
# 1. Load the product CSV into a DataFrame and print its schema.
# header=True takes the column names from the first row; inferSchema=True asks
# Spark to derive column types from the data instead of reading everything as string.
df_csv = spark.read.csv(
    "dbfs:/FileStore/tables/Products-1.csv",
    header=True,
    inferSchema=True,
)
df_csv.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)



In [0]:
# 2. Read the same data from JSON and compare with the CSV schema. Any differences?
# The JSON reader infers the schema on its own; "inferSchema" is a CSV reader
# option and is not among the documented JSON options, so it is not passed here.
# Differences visible when the two printSchema() outputs are compared:
#   - JSON columns come back in alphabetical order, not the CSV file order.
#   - Whole numbers are inferred as long from JSON, but as integer from CSV.
df_json = spark.read.json("dbfs:/FileStore/tables/Products-1.json")
df_json.printSchema()

root
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: long (nullable = true)



In [0]:
# 3. Persist the CSV-backed DataFrame in Parquet format, replacing any earlier output.
df_csv.write.parquet("dbfs:/FileStore/tables/products_parquet", mode="overwrite")

# 4. Compare the on-disk size of the CSV, JSON and Parquet copies.
# Each source path is listed and the first entry's size is taken
# (assumes each of these paths is a single file - confirm if the sources change).
csv_size = dbutils.fs.ls("dbfs:/FileStore/tables/Products-1.csv")[0].size
json_size = dbutils.fs.ls("dbfs:/FileStore/tables/Products-1.json")[0].size

# The Parquet output is a directory, so add up every entry listed inside it.
# NOTE(review): this counts all listed entries, not only the data files.
parquet_files = dbutils.fs.ls("dbfs:/FileStore/tables/products_parquet")
parquet_size = sum(entry.size for entry in parquet_files)

# Report the three sizes in bytes.
print(f"CSV Size: {csv_size} bytes")
print(f"JSON Size: {json_size} bytes")
print(f"Parquet Size: {parquet_size} bytes")


# Answer: for this small dataset the CSV file is the smallest of the three.
# For large-scale analytics, Parquet is generally much more efficient in storage,
# read speed and querying (especially with Spark or Hive).

CSV Size: 364 bytes
JSON Size: 955 bytes
Parquet Size: 2833 bytes


In [0]:
# 5. Derive total_revenue = price * quantity for every record.
from pyspark.sql.functions import col, when

# Build on the CSV DataFrame; the result is kept in `df` for the following cells.
df = df_csv.withColumn(
    "total_revenue",
    df_csv["price"] * df_csv["quantity"],
)
display(df)

product_id,product_name,category,price,quantity,total_revenue
101,Laptop,Electronics,55000,10,550000
102,Smartphone,Electronics,30000,25,750000
103,Chair,Furniture,2500,50,125000
104,Book,Stationery,400,200,80000
105,Headphones,Electronics,1500,100,150000
106,Table,Furniture,3200,40,128000
107,Pen,Stationery,20,500,10000
108,Monitor,Electronics,12000,15,180000
109,Notebook,Stationery,60,300,18000
110,Sofa,Furniture,45000,5,225000


In [0]:
# 6. Top 3 products by total revenue.
# Sort from highest to lowest revenue, then keep only the first three rows.
top_3 = df.orderBy("total_revenue", ascending=False).limit(3)
display(top_3)

product_id,product_name,category,price,quantity,total_revenue
102,Smartphone,Electronics,30000,25,750000
101,Laptop,Electronics,55000,10,550000
110,Sofa,Furniture,45000,5,225000


In [0]:
# 7. Show only Furniture products priced above 3000.
# Two chained predicates: a row must satisfy both to be kept.
furniture_products = (
    df.where(col("category") == "Furniture")
    .where(col("price") > 3000)
)
display(furniture_products)

product_id,product_name,category,price,quantity,total_revenue
106,Table,Furniture,3200,40,128000
110,Sofa,Furniture,45000,5,225000


In [0]:
# 8. Add a price_band column:
#   'High'   if price > 10000
#   'Medium' if 3000 < price <= 10000
#   'Low'    if price <= 3000
# Branches are tested in order; any row matching neither condition falls through to 'Low'.
price_band_expr = (
    when(df["price"] > 10000, "High")
    .when((df["price"] > 3000) & (df["price"] <= 10000), "Medium")
    .otherwise("Low")
)
df = df.withColumn("price_band", price_band_expr)
display(df)

product_id,product_name,category,price,quantity,total_revenue,price_band
101,Laptop,Electronics,55000,10,550000,High
102,Smartphone,Electronics,30000,25,750000,High
103,Chair,Furniture,2500,50,125000,Low
104,Book,Stationery,400,200,80000,Low
105,Headphones,Electronics,1500,100,150000,Low
106,Table,Furniture,3200,40,128000,Medium
107,Pen,Stationery,20,500,10000,Low
108,Monitor,Electronics,12000,15,180000,High
109,Notebook,Stationery,60,300,18000,Low
110,Sofa,Furniture,45000,5,225000,High


In [0]:
# 9. Group by category, calculate the total quantity sold, and display the result.
# The aggregate column is produced as "sum(quantity)" and then given a friendlier name.
category_quantity = (
    df.groupBy("category")
    .agg({"quantity": "sum"})
    .withColumnRenamed("sum(quantity)", "total_quantity_sold")
)
display(category_quantity)


category,total_quantity_sold
Stationery,1000
Electronics,150
Furniture,95


In [0]:
# 10. Average product price per category.
# The aggregate column is produced as "avg(price)" and then renamed to avg_price.
category_avg_price = (
    df.groupBy("category")
    .agg({"price": "avg"})
    .withColumnRenamed("avg(price)", "avg_price")
)
display(category_avg_price)

category,avg_price
Stationery,160.0
Electronics,24625.0
Furniture,16900.0


In [0]:
# 11. Count how many products fall in each price_band.
display(df.groupBy("price_band").count())

price_band,count
High,4
Low,5
Medium,1


In [0]:
# 12. Write the Electronics products priced above 5000 to Parquet.
# Any output already at the target path is overwritten.
(
    df.where(col("category") == "Electronics")
    .where(col("price") > 5000)
    .write.parquet("dbfs:/FileStore/tables/electronics_filtered.parquet", mode="overwrite")
)

In [0]:
# 13. Write the Stationery products to JSON.
# Any output already at the target path is overwritten.
(
    df.where(col("category") == "Stationery")
    .write.json("dbfs:/FileStore/tables/stationery_products.json", mode="overwrite")
)

In [0]:
# 14. Load Parquet back and find which category has the highest total revenue.
df_loaded = spark.read.parquet("dbfs:/FileStore/tables/products_parquet")

# The Parquet copy was written from the raw CSV data, so total_revenue is recomputed here.
df_loaded = df_loaded.withColumn("total_revenue", col("price") * col("quantity"))

# Sum revenue per category and sort descending so the top category is the first row.
# The sort key is the renamed column: after withColumnRenamed the DataFrame no longer
# exposes "sum(total_revenue)", so sorting by that old name is fragile.
(
    df_loaded.groupBy("category")
    .sum("total_revenue")
    .withColumnRenamed("sum(total_revenue)", "total_revenue_sum")
    .orderBy("total_revenue_sum", ascending=False)
    .display()
)

category,total_revenue_sum
Electronics,1630000
Furniture,478000
Stationery,108000


In [0]:
# BONUS:
# 15. Register the DataFrame as a temporary view and use Spark SQL to find all
# products with quantity > 100 and price < 1000.
# The view is built from `df`, so the derived columns added earlier are available too.
df.createOrReplaceTempView("products_view")

display(
    spark.sql("""
    SELECT * FROM products_view WHERE quantity > 100 AND price < 1000
""")
)

product_id,product_name,category,price,quantity,total_revenue,price_band
104,Book,Stationery,400,200,80000,Low
107,Pen,Stationery,20,500,10000,Low
109,Notebook,Stationery,60,300,18000,Low
