In [0]:
from pyspark.sql import SparkSession

# Spark session (Databricks already has one by default)
spark = SparkSession.builder.appName("RetailETL").getOrCreate()

# Load data
sales_df = spark.read.csv("/FileStore/tables/sales_cleaned_week4.csv", header=True, inferSchema=True)
products_df = spark.read.csv("/FileStore/tables/products.csv", header=True, inferSchema=True)

print("✅ Sales Schema:")
sales_df.printSchema()

print("✅ Products Schema:")
products_df.printSchema()


✅ Sales Schema:
root
 |-- sale_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- discount: integer (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- discount_percentage: double (nullable = true)
 |-- cost: double (nullable = true)
 |-- profit: double (nullable = true)

✅ Products Schema:
root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)



In [0]:
from pyspark.sql.functions import col, sum as spark_sum, avg

# Join sales with product info
joined_df = sales_df.join(products_df, on="product_id", how="left")

# Calculate profit margin
# profit_margin = (profit / revenue) * 100
joined_df = joined_df.withColumn("profit_margin", (col("profit") / col("revenue")) * 100)

# Show joined data
joined_df.select("product_id", "product_name", "category", "revenue", "profit", "profit_margin").show(10)


+----------+------------+--------+-------+-------+-------------+
|product_id|product_name|category|revenue| profit|profit_margin|
+----------+------------+--------+-------+-------+-------------+
|         3|        NULL|    NULL|  30000| 9000.0|         30.0|
|         4|        NULL|    NULL|  35000|10500.0|         30.0|
|         2|        NULL|    NULL|  75000|22500.0|         30.0|
|         1|        NULL|    NULL| 300000|90000.0|         30.0|
|         1|        NULL|    NULL| 120000|36000.0|         30.0|
+----------+------------+--------+-------+-------+-------------+



In [0]:
category_metrics = joined_df.groupBy("category").agg(
    spark_sum("revenue").alias("total_revenue"),
    spark_sum("profit").alias("total_profit"),
    avg("profit_margin").alias("avg_profit_margin")
)

print("Profit Margin by Category")
category_metrics.show()


📈 Profit Margin by Category
+--------+-------------+------------+-----------------+
|category|total_revenue|total_profit|avg_profit_margin|
+--------+-------------+------------+-----------------+
|    NULL|       560000|    168000.0|             30.0|
+--------+-------------+------------+-----------------+



In [0]:
# Save as Delta table (recommended in Databricks)
category_metrics.write.mode("overwrite").format("delta").save("/FileStore/tables/category_metrics_delta")

# OR save as CSV if needed
category_metrics.write.mode("overwrite").option("header", "true").csv("/FileStore/tables/category_metrics_csv")

print("✅ Category metrics saved in DBFS")


✅ Category metrics saved in DBFS


In [0]:
joined_df.createOrReplaceTempView("joined_df")

In [0]:
%sql
SELECT product_name, SUM(revenue) AS total_revenue
FROM joined_df
GROUP BY product_name
ORDER BY total_revenue DESC
LIMIT 3;


product_name,total_revenue
,560000
