In [8]:
# findspark.init() is called before the pyspark import below; keep that order.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new
# one registered under this application name.
spark = (
    SparkSession.builder
    .appName("MyPySparkAssignment")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark


In [9]:
# Step 1: Write the sample sales data to disk as sales.csv

# One header row followed by one row per order. The rows are joined with
# "\n" and no trailing newline is appended after the last order.
data = "\n".join([
    "OrderID,Product,Category,Quantity,UnitPrice,Region",
    "1001,Mobile,Electronics,2,15000,North",
    "1002,Laptop,Electronics,1,55000,South",
    "1003,T-Shirt,Apparel,3,500,East",
    "1004,Jeans,Apparel,2,1200,North",
    "1005,TV,Electronics,1,40000,West",
    "1006,Shoes,Footwear,4,2000,South",
    "1007,Watch,Accessories,2,3000,East",
    "1008,Headphones,Electronics,3,2500,North",
])

# "w" truncates any existing file, so re-running this cell rewrites the
# same content rather than appending to it.
with open("/content/sales.csv", "w") as csv_file:
    csv_file.write(data)



In [10]:
# Step 2: Read sales.csv into a PySpark DataFrame

# header      -> take the column names from the first line of the file
# inferSchema -> let Spark pick column types from the data
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/sales.csv")
)

# Inspect the inferred schema, then preview the first 5 rows.
df.printSchema()
df.show(5)


root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



In [11]:
from pyspark.sql.functions import col, expr

# Revenue for an order is the number of units multiplied by the unit price.
# withColumn replaces a same-named column, so re-running this cell does not
# add a second Revenue column.
revenue_expr = col("Quantity") * col("UnitPrice")
df = df.withColumn("Revenue", revenue_expr)

# Display only the columns involved in the calculation.
df.select(["OrderID", "Product", "Quantity", "UnitPrice", "Revenue"]).show()


+-------+----------+--------+---------+-------+
|OrderID|   Product|Quantity|UnitPrice|Revenue|
+-------+----------+--------+---------+-------+
|   1001|    Mobile|       2|    15000|  30000|
|   1002|    Laptop|       1|    55000|  55000|
|   1003|   T-Shirt|       3|      500|   1500|
|   1004|     Jeans|       2|     1200|   2400|
|   1005|        TV|       1|    40000|  40000|
|   1006|     Shoes|       4|     2000|   8000|
|   1007|     Watch|       2|     3000|   6000|
|   1008|Headphones|       3|     2500|   7500|
+-------+----------+--------+---------+-------+



In [12]:
# Total revenue per product.
# The aggregate column comes back as "sum(Revenue)"; rename it to
# "TotalRevenue" for readability.
product_sales = (
    df.groupBy("Product")
    .agg({"Revenue": "sum"})
    .withColumnRenamed("sum(Revenue)", "TotalRevenue")
)

# Highest-earning product first.
product_sales.orderBy("TotalRevenue", ascending=False).show()


+----------+------------+
|   Product|TotalRevenue|
+----------+------------+
|    Laptop|       55000|
|        TV|       40000|
|    Mobile|       30000|
|     Shoes|        8000|
|Headphones|        7500|
|     Watch|        6000|
|     Jeans|        2400|
|   T-Shirt|        1500|
+----------+------------+



In [13]:
# Total units sold per product.
# The aggregate column comes back as "sum(Quantity)"; rename it to
# "TotalQuantity" for readability.
most_sold = (
    df.groupBy("Product")
    .agg({"Quantity": "sum"})
    .withColumnRenamed("sum(Quantity)", "TotalQuantity")
)

# Sort by units sold, largest first, and print only the top row.
most_sold.orderBy("TotalQuantity", ascending=False).show(1)


+-------+-------------+
|Product|TotalQuantity|
+-------+-------------+
|  Shoes|            4|
+-------+-------------+
only showing top 1 row



In [14]:
# Total revenue per product category, with the aggregate column renamed
# from "sum(Revenue)" to "TotalRevenue".
category_sales = (
    df.groupBy("Category")
    .agg({"Revenue": "sum"})
    .withColumnRenamed("sum(Revenue)", "TotalRevenue")
)

# Highest-earning category first.
category_sales.orderBy("TotalRevenue", ascending=False).show()


+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|Electronics|      132500|
|   Footwear|        8000|
|Accessories|        6000|
|    Apparel|        3900|
+-----------+------------+



In [15]:
# Total revenue per sales region, with the aggregate column renamed
# from "sum(Revenue)" to "TotalRevenue".
region_sales = (
    df.groupBy("Region")
    .agg({"Revenue": "sum"})
    .withColumnRenamed("sum(Revenue)", "TotalRevenue")
)

# Highest-earning region first.
region_sales.orderBy("TotalRevenue", ascending=False).show()


+------+------------+
|Region|TotalRevenue|
+------+------------+
| South|       63000|
|  West|       40000|
| North|       39900|
|  East|        7500|
+------+------------+

