<a href="https://colab.research.google.com/github/ShravankumarMR/BigData-Spark/blob/main/SaleDataAnalysis_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
pip install pyspark



In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [23]:
# Start (or reuse) the notebook's Spark session and keep a handle on its SparkContext.
spark = (
    SparkSession.builder
    .appName(' Sales Data Analysis ')
    .getOrCreate()
)
sc = spark.sparkContext

In [24]:
import os
#importing os to set environment variable
def install_java():
  !apt-get install -y openjdk-17-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
!java -version
install_java()

openjdk version "17.0.16" 2025-07-15
OpenJDK Runtime Environment (build 17.0.16+8-Ubuntu-0ubuntu122.04.1)
OpenJDK 64-Bit Server VM (build 17.0.16+8-Ubuntu-0ubuntu122.04.1, mixed mode, sharing)


In [25]:
# Refresh the apt package index.
# NOTE(review): this runs after the earlier cell that calls apt-get install;
# consider moving it before that cell so the install sees a fresh index.
!apt-get update

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (91.189.92.24)] [Connected to cloud.r-pro                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:6 https://cli.github.com/packages stable InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcon

In [26]:
# Load the raw sales CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
inp = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/content/Input/US_Sales_Datasets.csv")
)

In [27]:
# Report how many rows were loaded, then echo the first five as Row objects.
total_records = inp.count()
print(" Total records : ", total_records, " \n\n Sample records: \n")

raw_preview = inp.take(5)
for raw_row in raw_preview:
  print(raw_row)

 Total records :  9648  

 Sample records: 

Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice Date='01-01-2020', Region='Northeast', State='New York', City='New York', Product="Men's Street Footwear", Price per Unit=50, Units Sold='1,200', Total Sales='6,00,000', Operating Profit='3,00,000', Operating Margin='50%', Sales Method='In-store')
Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice Date='02-01-2020', Region='Northeast', State='New York', City='New York', Product="Men's Athletic Footwear", Price per Unit=50, Units Sold='1,000', Total Sales='5,00,000', Operating Profit='1,50,000', Operating Margin='30%', Sales Method='In-store')
Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice Date='03-01-2020', Region='Northeast', State='New York', City='New York', Product="Women's Street Footwear", Price per Unit=40, Units Sold='1,000', Total Sales='4,00,000', Operating Profit='1,40,000', Operating Margin='35%', Sales Method='In-store')
Row(Retailer='Foot Locker', Retailer

In [28]:
# Show the column names and the types Spark inferred while reading the CSV.
inp.printSchema()

root
 |-- Retailer: string (nullable = true)
 |-- Retailer ID: integer (nullable = true)
 |-- Invoice Date: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- State: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Price per Unit: integer (nullable = true)
 |-- Units Sold: string (nullable = true)
 |-- Total Sales: string (nullable = true)
 |-- Operating Profit: string (nullable = true)
 |-- Operating Margin: string (nullable = true)
 |-- Sales Method: string (nullable = true)



In [29]:
# Parse the "Invoice Date" strings (day-month-year) into proper date values.
invoice_date_parsed = to_date(col("Invoice Date"), "dd-MM-yyyy")
rawDataDF1 = inp.withColumn("Invoice Date", invoice_date_parsed)

# Preview the first five rows to confirm the dates were parsed.
dated_preview = rawDataDF1.take(5)
for dated_row in dated_preview:
  print(dated_row)

Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice Date=datetime.date(2020, 1, 1), Region='Northeast', State='New York', City='New York', Product="Men's Street Footwear", Price per Unit=50, Units Sold='1,200', Total Sales='6,00,000', Operating Profit='3,00,000', Operating Margin='50%', Sales Method='In-store')
Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice Date=datetime.date(2020, 1, 2), Region='Northeast', State='New York', City='New York', Product="Men's Athletic Footwear", Price per Unit=50, Units Sold='1,000', Total Sales='5,00,000', Operating Profit='1,50,000', Operating Margin='30%', Sales Method='In-store')
Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice Date=datetime.date(2020, 1, 3), Region='Northeast', State='New York', City='New York', Product="Women's Street Footwear", Price per Unit=40, Units Sold='1,000', Total Sales='4,00,000', Operating Profit='1,40,000', Operating Margin='35%', Sales Method='In-store')
Row(Retailer='Foot Locker', Retailer ID=11

In [30]:
# Product values look like "Men's Street Footwear": the text before "'s " becomes
# Gender and the text after it becomes Category.
product_parts = split(col("Product"), "'s ")
rawDataDF2 = (
    rawDataDF1
    .withColumn("Gender", product_parts.getItem(0))
    .withColumn("Category", product_parts.getItem(1))
)

# "Units Sold" is text with comma separators (e.g. "1,200"); strip them, then cast.
units_without_commas = regexp_replace(col("Units Sold"), ",", "")
rawDataDF3 = (
    rawDataDF2
    .withColumn("Units Sold", units_without_commas)
    .withColumn("Units Sold", col("Units Sold").cast("Integer"))
)

# "Operating Margin" is text with a percent sign (e.g. "50%"); strip it, then cast.
margin_without_percent = regexp_replace(col("Operating Margin"), "%", "")
rawDataDF4 = (
    rawDataDF3
    .withColumn("Operating Margin", margin_without_percent)
    .withColumn("Operating Margin", col("Operating Margin").cast("Integer"))
)

# Recompute Total Sales from units and unit price instead of parsing the text column.
sales_amount = col("Units Sold") * col("Price per Unit")
rawDataDF5 = (
    rawDataDF4
    .withColumn("Total Sales", sales_amount)
    .withColumn("Total Sales", col("Total Sales").cast("Double"))
)

# Recompute Operating Profit as sales times the margin percentage.
profit_amount = col("Total Sales") * col("Operating Margin") / 100
rawDataDF6 = (
    rawDataDF5
    .withColumn("Operating Profit", profit_amount)
    .withColumn("Operating Profit", col("Operating Profit").cast("Double"))
)

# Give the columns used in the SQL queries names without spaces.
sql_friendly_names = {
    "Invoice Date": "Invoice_Date",
    "Total Sales": "Total_Sales",
    "Operating Profit": "Operating_Profit",
    "Units Sold": "Units_Sold",
}
rawDataDF7 = rawDataDF6
for spaced_name, underscored_name in sql_friendly_names.items():
  rawDataDF7 = rawDataDF7.withColumnRenamed(spaced_name, underscored_name)

In [31]:
# Preview the first five rows of the cleaned DataFrame as Row objects.
print(" Sample records: \n")
cleaned_preview = rawDataDF7.take(5)
for cleaned_row in cleaned_preview:
  print(cleaned_row)

 Sample records: 

Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice_Date=datetime.date(2020, 1, 1), Region='Northeast', State='New York', City='New York', Product="Men's Street Footwear", Price per Unit=50, Units_Sold=1200, Total_Sales=60000.0, Operating_Profit=30000.0, Operating Margin=50, Sales Method='In-store', Gender='Men', Category='Street Footwear')
Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice_Date=datetime.date(2020, 1, 2), Region='Northeast', State='New York', City='New York', Product="Men's Athletic Footwear", Price per Unit=50, Units_Sold=1000, Total_Sales=50000.0, Operating_Profit=15000.0, Operating Margin=30, Sales Method='In-store', Gender='Men', Category='Athletic Footwear')
Row(Retailer='Foot Locker', Retailer ID=1185732, Invoice_Date=datetime.date(2020, 1, 3), Region='Northeast', State='New York', City='New York', Product="Women's Street Footwear", Price per Unit=40, Units_Sold=1000, Total_Sales=40000.0, Operating_Profit=14000.0, Operating Margin=35

In [32]:
# Confirm the column names and types after the cleaning, casting and renaming steps.
rawDataDF7.printSchema()

root
 |-- Retailer: string (nullable = true)
 |-- Retailer ID: integer (nullable = true)
 |-- Invoice_Date: date (nullable = true)
 |-- Region: string (nullable = true)
 |-- State: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Price per Unit: integer (nullable = true)
 |-- Units_Sold: integer (nullable = true)
 |-- Total_Sales: double (nullable = true)
 |-- Operating_Profit: double (nullable = true)
 |-- Operating Margin: integer (nullable = true)
 |-- Sales Method: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Category: string (nullable = true)



## Total sales by retailer

In [33]:
# Expose the cleaned DataFrame to Spark SQL under the name "sales".
# createOrReplaceTempView replaces registerTempTable (deprecated since Spark 2.0)
# and overwrites any existing view of the same name, so the cell is safe to re-run.
rawDataDF7.createOrReplaceTempView("sales")

# Total sales per retailer, in millions, sorted from highest to lowest.
q7 = spark.sql("""
    SELECT
        Retailer,
        ROUND(SUM(Total_Sales)/1000000, 2) AS Total_Sales_in_millions
    FROM sales
    GROUP BY Retailer
    ORDER BY Total_Sales_in_millions DESC
""")

print(" Total sales by retailer: \n\n ")
q7.show()

 Total sales by retailer: 

 




+-------------+-----------------------+
|     Retailer|Total_Sales_in_millions|
+-------------+-----------------------+
|    West Gear|                  32.41|
|  Foot Locker|                  29.02|
|Sports Direct|                  24.62|
|       Kohl's|                  13.51|
|      Walmart|                  10.51|
|       Amazon|                   10.1|
+-------------+-----------------------+



## Total units sold by product category and by gender (2 separate queries)

In [34]:
# Total units sold per product category, sorted from highest to lowest.
q8a = spark.sql("""
    SELECT
        Category,
        SUM(Units_Sold) AS Total_Units_Sold
    FROM sales
    GROUP BY Category
    ORDER BY Total_Units_Sold DESC
""")

# The query sums Units_Sold, so the heading says units rather than sales.
print(" Total units sold by Category: \n\n ")
q8a.show()

 Total sales by Category: 

 
+-----------------+----------------+
|         Category|Total_Units_Sold|
+-----------------+----------------+
|  Street Footwear|          985589|
|Athletic Footwear|          752762|
|          Apparel|          740510|
+-----------------+----------------+



In [35]:
# Total units sold per gender, sorted from highest to lowest.
q8b = spark.sql("""
    SELECT
        Gender,
        SUM(Units_Sold) AS Total_Units_Sold
    FROM sales
    GROUP BY Gender
    ORDER BY Total_Units_Sold DESC
""")

# The query sums Units_Sold, so the heading says units rather than sales.
print(" Total units sold by Gender: \n\n ")
q8b.show()

 Total sales by Gender: 

 
+------+----------------+
|Gender|Total_Units_Sold|
+------+----------------+
|   Men|         1335529|
| Women|         1143332|
+------+----------------+



## Top performing cities by profit

In [36]:
# Sum operating profit per city and rank the cities from most to least profitable.
city_profit_sql = """
    SELECT
        City,
        ROUND(SUM(Operating_Profit), 2) AS Total_Profit
    FROM sales
    GROUP BY City
    ORDER BY Total_Profit DESC
"""
q9 = spark.sql(city_profit_sql)

# show() prints only the first 20 rows by default.
print(" Top performing cities: \n\n ")
q9.show()

 Top performing cities: 

 
+-------------+------------+
|         City|Total_Profit|
+-------------+------------+
|     New York|  2114664.41|
|   Charleston|  2024086.36|
|San Francisco|  1581993.31|
|        Miami|  1579387.86|
|     Portland|  1575860.62|
|      Houston|  1494772.31|
|  New Orleans|  1424389.74|
|  Los Angeles|  1378158.34|
|   Birmingham|  1368206.39|
|      Orlando|  1342206.56|
|       Dallas|  1337738.29|
|    Knoxville|  1269585.06|
|    Charlotte|  1263674.12|
|        Boise|  1222558.85|
|       Albany|  1220894.66|
|     Richmond|  1174799.43|
|    Las Vegas|  1084651.92|
|      Detroit|  1050351.52|
|      Atlanta|  1049533.36|
|  Albuquerque|  1027672.91|
+-------------+------------+
only showing top 20 rows

