# PySpark Assignment – Product Sales Analysis


# Part 1: Environment Setup

In [3]:
# Download Spark from the correct URL
!wget https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz



--2025-07-30 15:57:53--  https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400395283 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.0-bin-hadoop3.tgz’


2025-07-30 15:58:09 (24.5 MB/s) - ‘spark-3.5.0-bin-hadoop3.tgz’ saved [400395283/400395283]



In [5]:
# Extract Spark
!tar -xzf spark-3.5.0-bin-hadoop3.tgz

In [6]:
# Install Java & Findspark
!apt-get install openjdk-11-jdk -y
!pip install -q findspark

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-11-jdk is already the newest version (11.0.28+6-1ubuntu1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [7]:
 #Set Environment Variables and Initialize Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ProductSalesAnalysis").getOrCreate()

# Part 2: Load Sales Data from CSV

In [8]:
data = """OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North"""

with open("sales.csv", "w") as f:
    f.write(data)


Task: Read the file into PySpark and print schema

In [9]:
df = spark.read.csv("sales.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)


root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



# Part 3: Business Questions

1. Add a new column TotalPrice

In [10]:
from pyspark.sql.functions import col

df = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
df.show()


+-------+----------+-----------+--------+---------+------+----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+----------+-----------+--------+---------+------+----------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|
+-------+----------+-----------+--------+---------+------+----------+



2. Total revenue generated

In [11]:
df.groupBy("Category").sum("TotalPrice").orderBy("sum(TotalPrice)", ascending=False).show()

+-----------+---------------+
|   Category|sum(TotalPrice)|
+-----------+---------------+
|Electronics|         132500|
|   Footwear|           8000|
|Accessories|           6000|
|    Apparel|           3900|
+-----------+---------------+



3. Category-wise revenue (descending)

In [12]:
df.groupBy("Category").sum("TotalPrice").orderBy("sum(TotalPrice)", ascending=False).show()

+-----------+---------------+
|   Category|sum(TotalPrice)|
+-----------+---------------+
|Electronics|         132500|
|   Footwear|           8000|
|Accessories|           6000|
|    Apparel|           3900|
+-----------+---------------+



4. Region with highest number of orders

In [13]:
df.groupBy("Region").count().orderBy("count", ascending=False).show(1)

+------+-----+
|Region|count|
+------+-----+
| North|    3|
+------+-----+
only showing top 1 row



5. Average Unit Price per Category

In [14]:
df.groupBy("Category").avg("UnitPrice").show()

+-----------+--------------+
|   Category|avg(UnitPrice)|
+-----------+--------------+
|    Apparel|         850.0|
|Electronics|       28125.0|
|   Footwear|        2000.0|
|Accessories|        3000.0|
+-----------+--------------+



6. Orders where TotalPrice > 30000

In [15]:
df.filter(col("TotalPrice") > 30000).show()

+-------+-------+-----------+--------+---------+------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+-------+-----------+--------+---------+------+----------+
|   1002| Laptop|Electronics|       1|    55000| South|     55000|
|   1005|     TV|Electronics|       1|    40000|  West|     40000|
+-------+-------+-----------+--------+---------+------+----------+



# Part 4: Data Transformations

1. Add HighValueOrder column

In [16]:
from pyspark.sql.functions import when

df = df.withColumn("HighValueOrder", when(col("TotalPrice") > 20000, "Yes").otherwise("No"))
df.show()


+-------+----------+-----------+--------+---------+------+----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+----------+-----------+--------+---------+------+----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|            No|
+-------+----------+-----------+--------+---------+------+-------

2. . High-value orders in North region

In [17]:
df.filter((col("HighValueOrder") == "Yes") & (col("Region") == "North")).show()


+-------+-------+-----------+--------+---------+------+----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+-------+-----------+--------+---------+------+----------+--------------+
|   1001| Mobile|Electronics|       2|    15000| North|     30000|           Yes|
+-------+-------+-----------+--------+---------+------+----------+--------------+



3. Count of high-value orders per region

In [18]:
df.filter(col("HighValueOrder") == "Yes").groupBy("Region").count().show()


+------+-----+
|Region|count|
+------+-----+
| South|    1|
|  West|    1|
| North|    1|
+------+-----+



# Part 5: Save Results

Save transformed DataFrame as CSV

In [19]:
df.filter(col("HighValueOrder") == "Yes").write.csv("high_value_orders.csv", header=True, mode="overwrite")

In [20]:
import shutil
shutil.make_archive("high_value_orders", 'zip', "high_value_orders.csv")
from google.colab import files
files.download("high_value_orders.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>