In [0]:

df_csv = spark.read.csv("/FileStore/tables/products.csv", header=True, inferSchema=True)
df_csv.printSchema()
df_csv.show()


root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       101|      Laptop|Electronics|55000|      10|
|       102|  Smartphone|Electronics|30000|      25|
|       103|       Chair|  Furniture| 2500|      50|
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
|       108|     Monitor|Electronics|12000|      15|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
+----------+------------+-----------+-----+--------+



In [0]:
#1. Read the above data from CSV into a DataFrame and print the schema.
df_csv = spark.read.csv("/FileStore/tables/products.csv", header=True, inferSchema=True)
df_csv.printSchema()
df_csv.show()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       101|      Laptop|Electronics|55000|      10|
|       102|  Smartphone|Electronics|30000|      25|
|       103|       Chair|  Furniture| 2500|      50|
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
|       108|     Monitor|Electronics|12000|      15|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
+----------+------------+-----------+-----+--------+



In [0]:
#2. Read the same data from JSON and compare with the CSV schema. Any differences?
df_json = spark.read.json("/FileStore/tables/products-2.json")
df_json.printSchema()
df_csv.printSchema()

#differences
'''When we read the product data from a CSV file in Spark, it automatically figures out the type of each column. For example, numbers like product_id, price, and quantity are treated as regular integers, and text columns like product_name and category are treated as strings. But when we save the same data as a JSON file and read it, Spark  treats the number columns as a larger type called Long instead of Integer. This happens because JSON doesn’t clearly say what size a number is, so Spark uses a bigger type just to be safe.'''



root
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: long (nullable = true)

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)



In [0]:
#3. Convert the CSV data into Parquet format and save to disk.
df_csv.write.mode("overwrite").parquet("/FileStore/tables/productsdata.parquet")
df_parquet = spark.read.parquet("/FileStore/tables/productsdata.parquet")
df_parquet.printSchema()



root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)



In [0]:
%fs ls /FileStore/tables/products.csv



path,name,size,modificationTime
dbfs:/FileStore/tables/products.csv,products.csv,364,1754545284000


In [0]:
%fs ls /FileStore/tables/products-2.json/


path,name,size,modificationTime
dbfs:/FileStore/tables/products-2.json,products-2.json,955,1754546339000


In [0]:
%fs ls /FileStore/tables/productsdata.parquet/

path,name,size,modificationTime
dbfs:/FileStore/tables/productsdata.parquet/_SUCCESS,_SUCCESS,0,1754547946000
dbfs:/FileStore/tables/productsdata.parquet/_committed_7035696517194952772,_committed_7035696517194952772,123,1754547946000
dbfs:/FileStore/tables/productsdata.parquet/_started_7035696517194952772,_started_7035696517194952772,0,1754547946000
dbfs:/FileStore/tables/productsdata.parquet/part-00000-tid-7035696517194952772-9426d74a-12d9-4d9d-a012-697c7912370b-96-1.c000.snappy.parquet,part-00000-tid-7035696517194952772-9426d74a-12d9-4d9d-a012-697c7912370b-96-1.c000.snappy.parquet,1712,1754547946000


In [0]:
#5. Add a column total_revenue = price * quantity for each record.
df_csv = df_csv.withColumn("total_revenue", df_csv["price"] * df_csv["quantity"])
df_csv.show()

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       101|      Laptop|Electronics|55000|      10|       550000|
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       103|       Chair|  Furniture| 2500|      50|       125000|
|       104|        Book| Stationery|  400|     200|        80000|
|       105|  Headphones|Electronics| 1500|     100|       150000|
|       106|       Table|  Furniture| 3200|      40|       128000|
|       107|         Pen| Stationery|   20|     500|        10000|
|       108|     Monitor|Electronics|12000|      15|       180000|
|       109|    Notebook| Stationery|   60|     300|        18000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+



In [0]:
#6. Find the top 3 products with the highest total revenue.
df_csv.orderBy(df_csv["total_revenue"].desc()).limit(3).show()

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       101|      Laptop|Electronics|55000|      10|       550000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+



In [0]:
#7. Filter and display only Furniture products with price > 3000.
df_csv.filter((df_csv["category"] == "Furniture") & (df_csv["price"] > 3000)).show()

+----------+------------+---------+-----+--------+-------------+
|product_id|product_name| category|price|quantity|total_revenue|
+----------+------------+---------+-----+--------+-------------+
|       106|       Table|Furniture| 3200|      40|       128000|
|       110|        Sofa|Furniture|45000|       5|       225000|
+----------+------------+---------+-----+--------+-------------+



In [0]:
'''8.Create a new column price_band with values:
'High' if price > 10000
'Medium' if 3000 < price <= 10000
'Low' if price ≤ 3000'''
from pyspark.sql.functions import when
df_csv = df_csv.withColumn("price_band", when(df_csv["price"] > 10000, "High").when((df_csv["price"] > 3000) & (df_csv["price"] <= 10000), "Medium").otherwise("Low"))
df_csv.show()

+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       101|      Laptop|Electronics|55000|      10|       550000|      High|
|       102|  Smartphone|Electronics|30000|      25|       750000|      High|
|       103|       Chair|  Furniture| 2500|      50|       125000|       Low|
|       104|        Book| Stationery|  400|     200|        80000|       Low|
|       105|  Headphones|Electronics| 1500|     100|       150000|       Low|
|       106|       Table|  Furniture| 3200|      40|       128000|    Medium|
|       107|         Pen| Stationery|   20|     500|        10000|       Low|
|       108|     Monitor|Electronics|12000|      15|       180000|      High|
|       109|    Notebook| Stationery|   60|     300|        18000|       Low|
|       110|        Sofa|  Furniture|45000|       5|       22500

In [0]:
#9. Group by category and calculate total quantity sold.
df_csv.groupBy("category").agg({"quantity": "sum"}).show()

+-----------+-------------+
|   category|sum(quantity)|
+-----------+-------------+
| Stationery|         1000|
|Electronics|          150|
|  Furniture|           95|
+-----------+-------------+



In [0]:
#10. Calculate average price of products for each category.
df_csv.groupBy("category").agg({"price": "avg"}).show()

+-----------+----------+
|   category|avg(price)|
+-----------+----------+
| Stationery|     160.0|
|Electronics|   24625.0|
|  Furniture|   16900.0|
+-----------+----------+



In [0]:
#11. Count how many products fall in each price_band .
df_csv.groupBy("price_band").count().show()


+----------+-----+
|price_band|count|
+----------+-----+
|      High|    4|
|       Low|    5|
|    Medium|    1|
+----------+-----+



In [0]:
#12. Write the filtered Electronics products (price > 5000) into a Parquet file.
df_csv.filter(df_csv["category"] == "Electronics").write.mode("overwrite").parquet("/FileStore/tables/productsdata.parquet")
#13. Write the Stationery products into a JSON file.
df_csv.filter(df_csv["category"] == "Stationery").write.mode("overwrite").json("/FileStore/tables/products-2.json")

In [0]:
#14. Load Parquet back and run a query to find which category has highest totalrevenue.
df_csv.write.mode("overwrite").parquet("/FileStore/tables/productswithdata.parquet")

df_parquet = spark.read.parquet("/FileStore/tables/productswithdata.parquet")
from pyspark.sql.functions import sum as _sum, desc
df_parquet.groupBy("category").agg(_sum("total_revenue").alias("total_revenue_sum")).orderBy(desc("total_revenue_sum")).limit(1).show()



+-----------+-----------------+
|   category|total_revenue_sum|
+-----------+-----------------+
|Electronics|          1630000|
+-----------+-----------------+



In [0]:
#15BONUS: Create a temporary view from the DataFrame and run Spark SQL to find all products with quantity > 100 and price <1000.
df_csv.createOrReplaceTempView("products")
spark.sql("SELECT * FROM products WHERE quantity > 100 AND price < 1000").show

<bound method DataFrame.show of DataFrame[product_id: int, product_name: string, category: string, price: int, quantity: int, total_revenue: int, price_band: string]>