In [0]:
#  Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, round

In [0]:
# Create Spark Session
spark = SparkSession.builder.appName("RetailSalesETL_Week4").getOrCreate()

In [0]:
# Load cleaned sales data
sales_path = "/Volumes/workspace/default/nithyashree/cleaned_sales_dataa.csv"
sales_df = spark.read.csv(sales_path, header=True, inferSchema=True)

In [0]:
# Load products data
products_path = "/Volumes/workspace/default/nithyashree/products.csv"
products_df = spark.read.csv(products_path, header=True, inferSchema=True)

In [0]:
# Join sales and products data
joined_df = sales_df.join(products_df, on="product_id", how="inner")
joined_df.show()

+----------+-------+--------+--------+-----+-----+-------+-------+-------------------+----------+---------------+---------------+-----+
|product_id|sale_id|store_id|quantity|price| cost|revenue| profit|   discount_percent| sale_date|           name|       category|price|
+----------+-------+--------+--------+-----+-----+-------+-------+-------------------+----------+---------------+---------------+-----+
|         2|      1|       5|       3| 3000|10000|   9000| -21000|-233.33333333333334|2024-01-01|     Smartphone|    Electronics|50000|
|         3|      2|       1|       4|60000| 4000| 240000| 224000|  93.33333333333333|2024-01-02|     Headphones|    Accessories| 3000|
|         5|      3|       1|       2|30000|70000|  60000| -80000|-133.33333333333331|2024-01-03|Air Conditioner|Home Appliances|40000|
|         3|      4|       4|       5|30000|70000| 150000|-200000|-133.33333333333331|2024-01-04|     Headphones|    Accessories| 3000|
|         5|      5|       2|       5|10000|3000

In [0]:
# Calculate additional metrics
joined_df = joined_df.withColumn("profit_margin", round((col("profit") / col("revenue")) * 100, 2))
joined_df.show()

+----------+-------+--------+--------+-----+-----+-------+-------+-------------------+----------+---------------+---------------+-----+-------------+
|product_id|sale_id|store_id|quantity|price| cost|revenue| profit|   discount_percent| sale_date|           name|       category|price|profit_margin|
+----------+-------+--------+--------+-----+-----+-------+-------+-------------------+----------+---------------+---------------+-----+-------------+
|         2|      1|       5|       3| 3000|10000|   9000| -21000|-233.33333333333334|2024-01-01|     Smartphone|    Electronics|50000|      -233.33|
|         3|      2|       1|       4|60000| 4000| 240000| 224000|  93.33333333333333|2024-01-02|     Headphones|    Accessories| 3000|        93.33|
|         5|      3|       1|       2|30000|70000|  60000| -80000|-133.33333333333331|2024-01-03|Air Conditioner|Home Appliances|40000|      -133.33|
|         3|      4|       4|       5|30000|70000| 150000|-200000|-133.33333333333331|2024-01-04|   

In [0]:
# Aggregate: Profit Margin by Category
category_summary = joined_df.groupBy("category").agg(
    _sum("revenue").alias("total_revenue"),
    _sum("profit").alias("total_profit"),
    round((_sum("profit") / _sum("revenue")) * 100, 2).alias("profit_margin_percent")
)
category_summary.show()

+---------------+-------------+------------+---------------------+
|       category|total_revenue|total_profit|profit_margin_percent|
+---------------+-------------+------------+---------------------+
|    Accessories|     15459000|     3373000|                21.82|
|Home Appliances|      9905000|     3489000|                35.22|
|    Electronics|     33116000|     9152000|                27.64|
+---------------+-------------+------------+---------------------+



In [0]:
# Save as Delta Table
output_delta_path = "/Volumes/workspace/default/nithyashree/category_summary_delta"
category_summary.write.format("delta").mode("overwrite").save(output_delta_path)

In [0]:
# Save as CSV for Dashboard
output_csv_path = "/Volumes/workspace/default/nithyashree/category_summary_csv"
category_summary.write.mode("overwrite").option("header", True).csv(output_csv_path)