In [4]:
# 1. Install & Import Libraries
!pip install pyspark

import io

import pandas as pd
from google.colab import files
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, round as _round

# Create the Spark session used by every later cell, or reuse the one that is
# already running, under the application name "RetailSalesETL".
spark = (
    SparkSession.builder
    .appName("RetailSalesETL")
    .getOrCreate()
)



In [5]:
# 2. Upload & Load Cleaned Sales Data
# files.upload() returns a dict mapping each uploaded file's name to its raw
# bytes. Parse those bytes directly instead of re-opening a hard-coded path,
# so the load does not depend on what the file was called or on the name it
# ended up being saved under in the working directory.
uploaded = files.upload()
if not uploaded:
    # Nothing was selected / the dialog was cancelled: stop here with a clear
    # message rather than failing later on a missing or stale file.
    raise ValueError("No file was uploaded; expected the cleaned sales CSV.")

# Only one CSV is expected; if several were uploaded the first one is used.
sales_csv_name = next(iter(uploaded))

# Read into pandas
df = pd.read_csv(io.BytesIO(uploaded[sales_csv_name]))
display(df.head())

# Convert to Spark DataFrame
sales_df = spark.createDataFrame(df)
print("Schema of Sales Data:")
sales_df.printSchema()

print("Sample Data:")
sales_df.show(5)

Saving cleaned_sales_data.csv to cleaned_sales_data.csv


Unnamed: 0,SaleID,ProductID,StoreID,QuantitySold,SaleDate,ProductName,Category,Price,Stock,Revenue,Cost,Profit
0,301,1,101,3,2024-07-01,Laptop,Electronics,55000.0,30,165000.0,44000.0,33000.0
1,302,2,102,5,2024-07-02,Mobile Phone,Electronics,25000.0,50,125000.0,20000.0,25000.0
2,303,3,103,2,2024-07-02,Shoes,Footwear,2000.0,100,4000.0,1600.0,800.0
3,304,5,102,6,2024-07-03,Smart Watch,Electronics,5000.0,40,30000.0,4000.0,6000.0
4,305,4,101,4,2024-07-03,T-Shirt,Clothing,800.0,70,3200.0,640.0,640.0


Schema of Sales Data:
root
 |-- SaleID: long (nullable = true)
 |-- ProductID: long (nullable = true)
 |-- StoreID: long (nullable = true)
 |-- QuantitySold: long (nullable = true)
 |-- SaleDate: string (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Stock: long (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Cost: double (nullable = true)
 |-- Profit: double (nullable = true)

Sample Data:
+------+---------+-------+------------+----------+------------+-----------+-------+-----+--------+-------+-------+
|SaleID|ProductID|StoreID|QuantitySold|  SaleDate| ProductName|   Category|  Price|Stock| Revenue|   Cost| Profit|
+------+---------+-------+------------+----------+------------+-----------+-------+-----+--------+-------+-------+
|   301|        1|    101|           3|2024-07-01|      Laptop|Electronics|55000.0|   30|165000.0|44000.0|33000.0|
|   302|        2|    102|           

In [7]:
# 3. Data Cleaning / Casting
# Force the numeric measures used by the later aggregations to double.
# Each column is replaced in place, so column order is unchanged.
for measure_name in ("QuantitySold", "Price", "Revenue", "Profit"):
    sales_df = sales_df.withColumn(measure_name, col(measure_name).cast("double"))

In [8]:
# 4. Transformations – Category KPIs
# Per-category totals. The lowercase column names are kept exactly as written;
# in the recorded run they resolved against the mixed-case schema columns.
revenue_total = _sum("revenue").alias("total_revenue")
profit_total = _sum("profit").alias("total_profit")

# Profit as a percentage of revenue, rounded to two decimal places.
margin_percent = _round(col("total_profit") / col("total_revenue") * 100, 2)

category_kpis = (
    sales_df
    .groupBy("category")
    .agg(revenue_total, profit_total)
    .withColumn("profit_margin_percent", margin_percent)
)

print("Category KPIs:")
category_kpis.show()

Category KPIs:
+-----------+-------------+------------+---------------------+
|   category|total_revenue|total_profit|profit_margin_percent|
+-----------+-------------+------------+---------------------+
|Electronics|     502500.0|    100500.0|                 20.0|
|   Footwear|       4000.0|       800.0|                 20.0|
|   Clothing|       3200.0|       640.0|                 20.0|
+-----------+-------------+------------+---------------------+



In [12]:
# 5. Save Final Metrics
# Name of the exported CSV; written once here and reused for the write,
# the confirmation message and the browser download.
kpi_csv_name = "sales_kpis_final.csv"

# Collect the aggregated KPIs to pandas and write them without the index column.
category_kpis_pd = category_kpis.toPandas()
category_kpis_pd.to_csv(kpi_csv_name, index=False)

print(f"Final category KPIs saved as {kpi_csv_name}")

# Hand the file to the browser via the Colab download helper.
files.download(kpi_csv_name)


Final category KPIs saved as sales_kpis_final.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# 6. SQL Query – Top 3 Best-Selling Products
# Expose the sales rows to Spark SQL under the view name "sales_data".
sales_df.createOrReplaceTempView("sales_data")

# Products are ranked by summed Revenue. ProductID and ProductName act as
# secondary sort keys so that products with equal revenue come back in a fixed
# order and LIMIT 3 keeps the same rows on every run.
top_products = spark.sql("""
    SELECT ProductID, ProductName, SUM(Revenue) AS total_revenue
    FROM sales_data
    GROUP BY ProductID, ProductName
    ORDER BY total_revenue DESC, ProductID ASC, ProductName ASC
    LIMIT 3
""")

print("Top 3 Best-Selling Products:")
top_products.show()

Top 3 Best-Selling Products:
+---------+------------+-------------+
|ProductID| ProductName|total_revenue|
+---------+------------+-------------+
|        2|Mobile Phone|     300000.0|
|        1|      Laptop|     165000.0|
|        5| Smart Watch|      30000.0|
+---------+------------+-------------+

