1. Read the above data from CSV into a DataFrame and print the schema.

In [0]:
# Read the CSV
df_csv = spark.read.option("header", True).option("inferSchema", True).csv("/FileStore/tables/products.csv")
df_csv.printSchema()


root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)



2. Read the same data from JSON and compare with the CSV schema. Any differences?

In [0]:
# Read the JSON
df_json = spark.read.option("inferSchema", True).json("/FileStore/tables/products.json")
df_json.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: long (nullable = true)



3. Convert the CSV data into Parquet format and save to disk.

In [0]:
df_csv.write.mode("overwrite").parquet("dbfs:/FileStore/tables/products_parquet")

df_parquet = spark.read.parquet("dbfs:/FileStore/tables/products_parquet")
df_parquet.show()
df_parquet.printSchema()

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       101|      Laptop|Electronics|55000|      10|
|       102|  Smartphone|Electronics|30000|      25|
|       103|       Chair|  Furniture| 2500|      50|
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
|       108|     Monitor|Electronics|12000|      15|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
+----------+------------+-----------+-----+--------+

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)



4. Measure the size of CSV vs JSON vs Parquet on disk. Which one is smallest?

In [0]:
# Get sizes using dbutils (works in notebooks & jobs)
csv_files = dbutils.fs.ls("dbfs:/FileStore/tables/products.csv")
json_files = dbutils.fs.ls("dbfs:/FileStore/tables/products.json")
parquet_files = dbutils.fs.ls("dbfs:/FileStore/tables/products_parquet")

csv_size = sum(f.size for f in csv_files)
json_size = sum(f.size for f in json_files)
parquet_size = sum(f.size for f in parquet_files)

print("CSV size:", csv_size, "bytes")
print("JSON size:", json_size, "bytes")
print("Parquet size:", parquet_size, "bytes")



CSV size: 364 bytes
JSON size: 990 bytes
Parquet size: 1839 bytes


5. Add a column total_revenue = price * quantity for each record.

In [0]:
from pyspark.sql.functions import col
df_with_revenue = df_csv.withColumn("total_revenue", col("price") * col("quantity"))
df_with_revenue.show()


+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       101|      Laptop|Electronics|55000|      10|       550000|
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       103|       Chair|  Furniture| 2500|      50|       125000|
|       104|        Book| Stationery|  400|     200|        80000|
|       105|  Headphones|Electronics| 1500|     100|       150000|
|       106|       Table|  Furniture| 3200|      40|       128000|
|       107|         Pen| Stationery|   20|     500|        10000|
|       108|     Monitor|Electronics|12000|      15|       180000|
|       109|    Notebook| Stationery|   60|     300|        18000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+



6. Find the top 3 products with the highest total revenue.

In [0]:
top3_df = df_with_revenue.orderBy(col("total_revenue").desc())
top3_df.show(3)

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       101|      Laptop|Electronics|55000|      10|       550000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+
only showing top 3 rows


7. Filter and display only Furniture products with price > 3000.

In [0]:
df_furniture = df_with_revenue.filter((col("category") == "Furniture") &(col("price") > 3000) )
df_furniture.show()

+----------+------------+---------+-----+--------+-------------+
|product_id|product_name| category|price|quantity|total_revenue|
+----------+------------+---------+-----+--------+-------------+
|       106|       Table|Furniture| 3200|      40|       128000|
|       110|        Sofa|Furniture|45000|       5|       225000|
+----------+------------+---------+-----+--------+-------------+



8. Create a new column price_band with values:
'High' if price > 10000
'Medium' if 3000 < price <= 10000
'Low' if price ≤ 3000

In [0]:
from pyspark.sql.functions import when

df_with_band = df_with_revenue.withColumn("price_band", 
                                          when(col("price") > 10000, "High")
                                          .when((col("price") > 3000) & (col("price") <= 10000), "Medium")
                                          .otherwise("Low"))
df_with_band.show()


+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       101|      Laptop|Electronics|55000|      10|       550000|      High|
|       102|  Smartphone|Electronics|30000|      25|       750000|      High|
|       103|       Chair|  Furniture| 2500|      50|       125000|       Low|
|       104|        Book| Stationery|  400|     200|        80000|       Low|
|       105|  Headphones|Electronics| 1500|     100|       150000|       Low|
|       106|       Table|  Furniture| 3200|      40|       128000|    Medium|
|       107|         Pen| Stationery|   20|     500|        10000|       Low|
|       108|     Monitor|Electronics|12000|      15|       180000|      High|
|       109|    Notebook| Stationery|   60|     300|        18000|       Low|
|       110|        Sofa|  Furniture|45000|       5|       22500

9. Group by category and calculate total quantity sold.

In [0]:
df_with_band.groupBy("category").sum("quantity").withColumnRenamed("sum(quantity)", "total_quantity").show()

+-----------+--------------+
|   category|total_quantity|
+-----------+--------------+
|  Furniture|            95|
| Stationery|          1000|
|Electronics|           150|
+-----------+--------------+



10. Calculate average price of products for each category.

In [0]:
df_with_band.groupBy("category").avg("price").withColumnRenamed("avg(price)", "avg_price").show()

+-----------+---------+
|   category|avg_price|
+-----------+---------+
|  Furniture|  16900.0|
| Stationery|    160.0|
|Electronics|  24625.0|
+-----------+---------+



11. Count how many products fall in each price_band .

In [0]:
df_with_band.groupBy("price_band").count().show()

+----------+-----+
|price_band|count|
+----------+-----+
|    Medium|    1|
|      High|    4|
|       Low|    5|
+----------+-----+



12. Write the filtered Electronics products (price > 5000) into a Parquet file.

In [0]:
df_with_band.filter((col("category") == "Electronics") & (col("price") > 5000)) \
    .write.mode("overwrite").parquet("dbfs:/FileStore/tables/products_electronics.parquet")

In [0]:
spark.read.parquet("dbfs:/FileStore/tables/products_electronics.parquet").show()


+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       101|      Laptop|Electronics|55000|      10|       550000|      High|
|       102|  Smartphone|Electronics|30000|      25|       750000|      High|
|       108|     Monitor|Electronics|12000|      15|       180000|      High|
+----------+------------+-----------+-----+--------+-------------+----------+



13. Write the Stationery products into a JSON file.

In [0]:
df_with_band.filter(col("category") == "Stationery") \
    .write.mode("overwrite").json("dbfs:/FileStore/tables/stationery_products.json")


In [0]:
spark.read.json("dbfs:/FileStore/tables/stationery_products.json").show()


+----------+-----+----------+----------+------------+--------+-------------+
|  category|price|price_band|product_id|product_name|quantity|total_revenue|
+----------+-----+----------+----------+------------+--------+-------------+
|Stationery|  400|       Low|       104|        Book|     200|        80000|
|Stationery|   20|       Low|       107|         Pen|     500|        10000|
|Stationery|   60|       Low|       109|    Notebook|     300|        18000|
+----------+-----+----------+----------+------------+--------+-------------+



14. Load Parquet back and run a query to find which category has highest total
revenue.

In [0]:
#Load the parquet file
df_parquet = spark.read.parquet("dbfs:/FileStore/tables/products_parquet")


In [0]:
df_parquet = df_parquet.withColumn("total_revenue", col("price") * col("quantity"))

df_parquet.groupBy("category").sum("total_revenue") \
    .orderBy(col("sum(total_revenue)").desc()).show(1)


+-----------+------------------+
|   category|sum(total_revenue)|
+-----------+------------------+
|Electronics|           1630000|
+-----------+------------------+
only showing top 1 row


15. BONUS: Create a temporary view from the DataFrame and run Spark SQL to find all
products with quantity > 100 and price < 1000.

In [0]:
# Create a temporary view
df_with_band.createOrReplaceTempView("products_view")

# Run Spark SQL query
spark.sql("""
    SELECT * FROM products_view
    WHERE quantity > 100 AND price < 1000
""").show()


+----------+------------+----------+-----+--------+-------------+----------+
|product_id|product_name|  category|price|quantity|total_revenue|price_band|
+----------+------------+----------+-----+--------+-------------+----------+
|       104|        Book|Stationery|  400|     200|        80000|       Low|
|       107|         Pen|Stationery|   20|     500|        10000|       Low|
|       109|    Notebook|Stationery|   60|     300|        18000|       Low|
+----------+------------+----------+-----+--------+-------------+----------+

