In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, month, year, sum as _sum, avg
from google.colab import files


In [2]:
# Build the notebook's Spark entry point. getOrCreate() returns an existing
# session when one is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("RetailSalesDashboard")
    .getOrCreate()
)


In [3]:
# Open Colab's file-upload widget so the sales CSV can be provided by hand.
# `files` is already imported in the first cell, so it is not imported again.
# The selected file is saved into the working directory (see the "Saving ..."
# message below); presumably `uploaded` maps each filename to its contents —
# confirm against the Colab docs before relying on it.
uploaded = files.upload()


Saving cleaned_sales_data.csv to cleaned_sales_data.csv


In [5]:
# Load the uploaded CSV into a Spark DataFrame: the first row supplies the
# column names, and Spark infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("cleaned_sales_data.csv")
)
# Preview the first five rows to sanity-check the parsed columns.
df.show(5)


+------+---------+-------+------------+----------+------------+-----------+-------+-----+--------+-------+-------+
|SaleID|ProductID|StoreID|QuantitySold|  SaleDate| ProductName|   Category|  Price|Stock| Revenue|   Cost| Profit|
+------+---------+-------+------------+----------+------------+-----------+-------+-----+--------+-------+-------+
|   301|        1|    101|           3|2024-07-01|      Laptop|Electronics|55000.0|   30|165000.0|44000.0|33000.0|
|   302|        2|    102|           5|2024-07-02|Mobile Phone|Electronics|25000.0|   50|125000.0|20000.0|25000.0|
|   303|        3|    103|           2|2024-07-02|       Shoes|   Footwear| 2000.0|  100|  4000.0| 1600.0|  800.0|
|   304|        5|    102|           6|2024-07-03| Smart Watch|Electronics| 5000.0|   40| 30000.0| 4000.0| 6000.0|
|   305|        4|    101|           4|2024-07-03|     T-Shirt|   Clothing|  800.0|   70|  3200.0|  640.0|  640.0|
+------+---------+-------+------------+----------+------------+-----------+-------+-----+--------+-------+-------+

In [7]:
# Revenue per sale row = unit price ("Price") x units sold ("QuantitySold").
# NOTE(review): the loaded file already has a `Revenue` column (see the preview
# above). With Spark's default case-insensitive column resolution this looks
# like it overwrites that column rather than adding a second one — confirm.
df = df.withColumn(
    "revenue",
    col("Price") * col("QuantitySold"),
)



In [9]:
# Filter underperforming products (low total units sold)
# Total units sold per product, summed over every sale row.
product_sales = (
    df.groupBy("ProductName")
    .agg(_sum(col("QuantitySold")).alias("total_quantity"))
)
# A product counts as underperforming when its total is strictly below 5 units.
underperforming_products = product_sales.where(col("total_quantity") < 5)
underperforming_products.show()

+-----------+--------------+
|ProductName|total_quantity|
+-----------+--------------+
|     Laptop|             3|
|    T-Shirt|             4|
|      Shoes|             2|
+-----------+--------------+



In [11]:
# Derive calendar month and year from each sale's date so revenue can be
# bucketed per month.
# NOTE(review): month()/year() need "SaleDate" to be a date (or date-parsable)
# column; this relies on how the CSV was read — confirm the inferred type.
df = (
    df.withColumn("month", month(col("SaleDate")))
    .withColumn("year", year(col("SaleDate")))
)

# Step 1: total revenue for every (store, year, month) combination.
monthly_revenue = (
    df.groupBy("StoreID", "year", "month")
    .agg(_sum(col("revenue")).alias("monthly_revenue"))
)

# Step 2: average those monthly totals per store.
avg_monthly_revenue = (
    monthly_revenue.groupBy("StoreID")
    .agg(avg(col("monthly_revenue")).alias("avg_monthly_revenue"))
)
avg_monthly_revenue.show()

+-------+-------------------+
|StoreID|avg_monthly_revenue|
+-------+-------------------+
|    101|           168200.0|
|    103|             4000.0|
|    102|           155000.0|
|    105|             7500.0|
|    104|           175000.0|
+-------+-------------------+



In [12]:
# Export both result tables as CSV with a header row, replacing any earlier
# export at the same path. Spark writes each path as a directory of part
# files; coalesce(1) reduces the data to one partition so that directory
# holds a single part file.
(
    underperforming_products
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("underperforming_products.csv")
)
(
    avg_monthly_revenue
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("store_revenue_summary.csv")
)
