In [4]:
from pyspark.sql import SparkSession
# Create Spark session

spark = SparkSession.builder \
.appName("PySparkBasics") \
.getOrCreate()

# Confirm Spark is running

spark

In [5]:
# Sample data
data = [("Amit", 25), ("Sneha", 30), ("Kabir", 28)]
columns = ["Name", "Age"]

#Create DataFrame
df = spark.createDataFrame(data, columns)

# Show data
df.show()

+-----+---+
| Name|Age|
+-----+---+
| Amit| 25|
|Sneha| 30|
|Kabir| 28|
+-----+---+



In [7]:

csv_data = """Name, Department, Salary
Amit, Sales, 50000
Sneha, Engineering, 80000
Kabir, HR, 45000
Anaya, Marketing, 60000
Ravi, Engineering, 85000"""

with open('employees.csv', 'w') as f:
  f.write(csv_data)

In [8]:
df = spark.read.csv('employees.csv', header = True, inferSchema=True)
df.show()
df.printSchema()

+-----+------------+-------+
| Name|  Department| Salary|
+-----+------------+-------+
| Amit|       Sales|50000.0|
|Sneha| Engineering|80000.0|
|Kabir|          HR|45000.0|
|Anaya|   Marketing|60000.0|
| Ravi| Engineering|85000.0|
+-----+------------+-------+

root
 |-- Name: string (nullable = true)
 |--  Department: string (nullable = true)
 |--  Salary: double (nullable = true)



# ASSIGNMENT

# Part 1: Environment Setup

1. Install Spark + Java in Google Colab.
2. Initialize Spark with app name "ProductSalesAnalysis" .

In [9]:
# Install Java and Spark
!apt-get install openjdk-11-jdk -y
!wget -q https://downloads.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre
  x11-utils
Suggested packages:
  libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk
  openjdk-11-jre x11-utils
0 upgraded, 10 newly installed, 0 to remove and 35 not upgraded.
Need to get 5,367 kB of archives.
After this operation, 15.2 MB of additional disk space will be used.
Get:1 http://security.ubuntu.com/ubuntu jammy-security/main amd64 openjdk-11-jre amd64 11.0.28+6-1ubuntu1~22.04.1 [214 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:3 http://security.

In [10]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ProductSalesAnalysis").getOrCreate()


# Part 2: Load Sales Data from CSV

Read the file into a PySpark DataFrame with header and inferred schema.
Print schema and show top 5 rows.


In [11]:
csv_data = """OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North
"""

with open("sales.csv", "w") as f:
    f.write(csv_data)


In [12]:
df = spark.read.csv("sales.csv", header=True, inferSchema=True)
df.printSchema()
df.show(5)


root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region: string (nullable = true)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



# Part 3: Business Questions

1. Add a new column TotalPrice = Quantity × UnitPrice
2. Total revenue generated across all regions.
3. Category-wise revenue sorted in descending order.
4. Region with the highest number of orders
5. Average Unit Price per Category
6. All orders where TotalPrice is more than
30,000

In [13]:
from pyspark.sql.functions import col

df = df.withColumn("TotalPrice", col("Quantity") * col("UnitPrice"))
df.show()


+-------+----------+-----------+--------+---------+------+----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+----------+-----------+--------+---------+------+----------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|
+-------+----------+-----------+--------+---------+------+----------+



In [14]:
total_revenue = df.agg({"TotalPrice": "sum"}).collect()[0][0]
print(f"Total Revenue: ₹{total_revenue}")


Total Revenue: ₹150400


In [15]:
df.groupBy("Category") \
  .sum("TotalPrice") \
  .withColumnRenamed("sum(TotalPrice)", "TotalRevenue") \
  .orderBy(col("TotalRevenue").desc()) \
  .show()


+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|Electronics|      132500|
|   Footwear|        8000|
|Accessories|        6000|
|    Apparel|        3900|
+-----------+------------+



In [16]:
df.groupBy("Region") \
  .count() \
  .orderBy(col("count").desc()) \
  .show(1)


+------+-----+
|Region|count|
+------+-----+
| North|    3|
+------+-----+
only showing top 1 row



In [18]:
df.filter(col("TotalPrice") > 30000).show()


+-------+-------+-----------+--------+---------+------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|
+-------+-------+-----------+--------+---------+------+----------+
|   1002| Laptop|Electronics|       1|    55000| South|     55000|
|   1005|     TV|Electronics|       1|    40000|  West|     40000|
+-------+-------+-----------+--------+---------+------+----------+



# Part 4: Data Transformations

1. Create a new column HighValueOrder which is "Yes" if TotalPrice > 20,000,
else "No" .
2. Filter and display all high-value orders in the North region.
3. Count how many high-value orders exist per region.

In [19]:
from pyspark.sql.functions import when

df = df.withColumn("HighValueOrder", when(col("TotalPrice") > 20000, "Yes").otherwise("No"))
df.show()


+-------+----------+-----------+--------+---------+------+----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+----------+-----------+--------+---------+------+----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|     30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|     55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|      1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|      2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|     40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|      8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|      6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|      7500|            No|
+-------+----------+-----------+--------+---------+------+-------

In [20]:
df.filter((col("HighValueOrder") == "Yes") & (col("Region") == "North")).show()


+-------+-------+-----------+--------+---------+------+----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|TotalPrice|HighValueOrder|
+-------+-------+-----------+--------+---------+------+----------+--------------+
|   1001| Mobile|Electronics|       2|    15000| North|     30000|           Yes|
+-------+-------+-----------+--------+---------+------+----------+--------------+



In [21]:
df.filter(col("HighValueOrder") == "Yes") \
  .groupBy("Region") \
  .count() \
  .withColumnRenamed("count", "HighValueOrderCount") \
  .show()


+------+-------------------+
|Region|HighValueOrderCount|
+------+-------------------+
| South|                  1|
|  West|                  1|
| North|                  1|
+------+-------------------+



# Part 5: Save Results

Save the transformed DataFrame as a CSV file named high_value_orders.csv with
headers.

In [22]:
df.filter(col("HighValueOrder") == "Yes") \
  .coalesce(1) \
  .write.csv("high_value_orders.csv", header=True, mode="overwrite")
