# PySpark Assignment – Product Sales Analysis (Intermediate)

## Part 1: Environment Setup

In [1]:
# Start (or reuse) the Spark session that the rest of the notebook works with
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("ProductSalesAnalysis")
spark = session_builder.getOrCreate()

# Left as the cell's final expression so the notebook displays the Spark version
spark.version

'3.5.1'

## Part 2: Load Sales Data from CSV

In [2]:
# Build a small in-memory sales dataset and persist it to disk as sales.csv
csv_data = """OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North"""

# The text is written exactly as defined above (header line + 8 order rows)
with open("sales.csv", mode="w") as sales_file:
    sales_file.write(csv_data)

In [3]:
# Load sales.csv into a DataFrame: the first line supplies the column names
# and Spark infers each column's type from the data
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("sales.csv")

In [4]:
# Inspect the inferred column types, then preview the first five records
df.printSchema()
df.show(n=5)

root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



## Part 3: Business Questions

In [5]:
# 1. Derive TotalPrice for every order as Quantity × UnitPrice
from pyspark.sql.functions import col

line_total = col("Quantity") * col("UnitPrice")
df = df.withColumn("TotalPrice", line_total)
df.show()

+-------+----------+-----------+--------+---------+------+----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+----------+-----------+--------+---------+------+----------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|
+-------+----------+-----------+--------+---------+------+----------+



In [6]:
# 2. Overall revenue: add up TotalPrice over every order, regardless of region.
# NOTE: this import rebinds the name `sum` from Python's built-in to Spark's
# column aggregate for everything that runs afterwards in this session.
from pyspark.sql.functions import sum

revenue_expr = sum("TotalPrice").alias("TotalRevenue")
total_revenue = df.agg(revenue_expr)
total_revenue.show()

+------------+
|TotalRevenue|
+------------+
|      150400|
+------------+



In [7]:
# 3. Category-wise revenue sorted in descending order.
# The aggregate is reached through the `F` namespace so this cell works on its
# own: a bare `sum(...)` here would only be Spark's aggregate if an earlier
# cell had already replaced Python's built-in `sum`, and would otherwise fail.
from pyspark.sql import functions as F
from pyspark.sql.functions import desc

category_revenue = (
    df.groupBy("Category")
    .agg(F.sum("TotalPrice").alias("TotalRevenue"))  # revenue per category
    .orderBy(desc("TotalRevenue"))                   # highest revenue first
)
category_revenue.show()

+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|Electronics|      132500|
|   Footwear|        8000|
|Accessories|        6000|
|    Apparel|        3900|
+-----------+------------+



In [8]:
# 4. Region that placed the most orders
from pyspark.sql.functions import desc

orders_by_region = df.groupBy("Region").count()
region_order_count = orders_by_region.orderBy(desc("count"))
# Rows are sorted by order count, largest first, so only the first row is shown
region_order_count.show(n=1)

+------+-----+
|Region|count|
+------+-----+
| North|    3|
+------+-----+
only showing top 1 row



In [9]:
# 5. Mean UnitPrice within each product category
from pyspark.sql.functions import avg

mean_unit_price = avg("UnitPrice").alias("AverageUnitPrice")
average_price_per_category = df.groupBy("Category").agg(mean_unit_price)
average_price_per_category.show()

+-----------+----------------+
|   Category|AverageUnitPrice|
+-----------+----------------+
|    Apparel|           850.0|
|Electronics|         28125.0|
|   Footwear|          2000.0|
|Accessories|          3000.0|
+-----------+----------------+



In [10]:
# 6. Orders whose TotalPrice is strictly above 30,000
exceeds_30k = col("TotalPrice") > 30000
filtered_orders = df.where(exceeds_30k)
filtered_orders.show()


+-------+-------+-----------+--------+---------+------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+-------+-----------+--------+---------+------+----------+
|   1002| Laptop|Electronics|       1|    55000| South|     55000|
|   1005|     TV|Electronics|       1|    40000|  West|     40000|
+-------+-------+-----------+--------+---------+------+----------+



## Part 4: Data Transformations

In [11]:
# 1. Flag each order in a new HighValueOrder column: "Yes" when TotalPrice exceeds 20,000, otherwise "No".
from pyspark.sql.functions import when

high_value_flag = when(col("TotalPrice") > 20000, "Yes").otherwise("No")
df = df.withColumn("HighValueOrder", high_value_flag)
df.show()

+-------+----------+-----------+--------+---------+------+----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+----------+-----------+--------+---------+------+----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|            No|
+-------+----------+-----------+--------+---------+------+----------+--------------+

In [12]:
# 2. Filter and display all high-value orders in the North region.
# "High value" is read from the HighValueOrder flag instead of re-typing the
# 20,000 threshold, so the definition of a high-value order lives in one place.
is_high_value = col("HighValueOrder") == "Yes"
is_north = col("Region") == "North"
high_value_orders_north = df.filter(is_high_value & is_north)
high_value_orders_north.show()

+-------+-------+-----------+--------+---------+------+----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+-------+-----------+--------+---------+------+----------+--------------+
|   1001| Mobile|Electronics|       2|    15000| North|     30000|           Yes|
+-------+-------+-----------+--------+---------+------+----------+--------------+



In [13]:
# 3. Number of high-value orders in each region.
# Only rows flagged "Yes" are kept before grouping, so a region with no such
# orders does not appear in the result.
flagged_orders = df.where(col("HighValueOrder") == "Yes")
high_value_orders_per_region = flagged_orders.groupBy("Region").count()
high_value_orders_per_region.show()

+------+-----+
|Region|count|
+------+-----+
| South|    1|
|  West|    1|
| North|    1|
+------+-----+



## Part 5: Save Results

In [14]:
# Export the rows flagged as high-value orders as CSV, including a header row.
high_value_orders = df.where(df["HighValueOrder"] == "Yes")

# Spark writes to an output path named "high_value_orders" (not a file called
# high_value_orders.csv); coalesce(1) reduces the data to one partition first,
# and "overwrite" replaces any output left by a previous run.
single_partition = high_value_orders.coalesce(1)
single_partition.write.csv("high_value_orders", mode="overwrite", header=True)


### Download CSV from Google Colab

In [15]:
import glob
import shutil

# Find the part file created by Spark inside the output directory.
# Sorting makes the choice deterministic, and the explicit check replaces an
# opaque IndexError with a message that says what is missing.
part_files = sorted(glob.glob("high_value_orders/part-*.csv"))
if not part_files:
    raise FileNotFoundError(
        "No part-*.csv file found in 'high_value_orders/'; "
        "run the cell that saves the high-value orders first."
    )
output_file = part_files[0]

# Copy to a more accessible path with a predictable file name
shutil.copy(output_file, "high_value_orders.csv")

# Download the csv file. google.colab can only be imported inside Colab, so
# anywhere else the download step is skipped and the local copy is kept.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; the file was saved locally as high_value_orders.csv")
else:
    files.download("high_value_orders.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>