In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, avg, month, year

# Single Spark session for the whole notebook.
spark = SparkSession.builder.appName("RetailSalesInsights").getOrCreate()

# Both inputs are read with the same settings: the first row holds the column
# names and column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
sales_df = spark.read.csv("sales.csv", **csv_options)
products_df = spark.read.csv("products.csv", **csv_options)

# Print the inferred schema of each input under its own title.
for title, frame in (("Sales Schema", sales_df), ("Products Schema", products_df)):
    print(title)
    frame.printSchema()


Sales Schema
root
 |-- sale_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- discount: integer (nullable = true)
 |-- returns: integer (nullable = true)
 |-- sale_date: date (nullable = true)

Products Schema
root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)



In [2]:
# A product whose total units sold is below this value counts as underperforming.
UNDERPERFORMING_THRESHOLD = 5

# Total units sold per product, for the products that appear in the sales data.
product_sales = sales_df.groupBy("product_id").agg(
    spark_sum("quantity").alias("total_quantity")
)

# Products with no sales rows at all never show up in the aggregate above, yet
# they are the weakest performers. Full-outer join against the product catalogue
# so they are kept with a total of 0, while any product_id that exists only in
# the sales data is still preserved.
# Note: fillna also turns a null total (every quantity for that product was
# null) into 0, so such products are now reported as well.
all_product_totals = (
    products_df.select("product_id")
    .distinct()
    .join(product_sales, on="product_id", how="full_outer")
    .fillna(0, subset=["total_quantity"])
)

# Filter underperforming
underperforming = all_product_totals.filter(
    col("total_quantity") < UNDERPERFORMING_THRESHOLD
)

print("Underperforming Products:")
underperforming.show()


Underperforming Products:
+----------+--------------+
|product_id|total_quantity|
+----------+--------------+
|         3|             2|
|         4|             1|
+----------+--------------+



In [3]:
# Enrich every sale with its revenue (quantity x price) and with the calendar
# month and year taken from the sale date.
# NOTE(review): the sales schema also has `discount` and `returns` columns that
# this revenue figure does not use -- confirm gross revenue is what is intended.
sales_df = (
    sales_df
    .withColumn("revenue", col("quantity") * col("price"))
    .withColumn("month", month("sale_date"))
    .withColumn("year", year("sale_date"))
)

# Step 1: total revenue for each (store, year, month) combination present in
# the sales data.
monthly_revenue = (
    sales_df
    .groupBy("store_id", "year", "month")
    .agg(spark_sum("revenue").alias("monthly_revenue"))
)

# Step 2: average those monthly totals per store. Only months that have at
# least one sales row contribute to a store's average.
avg_monthly_revenue = (
    monthly_revenue
    .groupBy("store_id")
    .agg(avg("monthly_revenue").alias("avg_monthly_revenue"))
)

print("Average Monthly Revenue per Store:")
avg_monthly_revenue.show()


Average Monthly Revenue per Store:
+--------+-------------------+
|store_id|avg_monthly_revenue|
+--------+-------------------+
|       1| 113666.66666666667|
|       2|            92500.0|
+--------+-------------------+



In [4]:
# Persist both result sets as CSV with a header row, replacing whatever was
# previously written at the same path.
# Note: Spark writes each path as a directory of part files, not a single file.
outputs = (
    (underperforming, "underperforming_products.csv"),       # underperforming products
    (avg_monthly_revenue, "store_avg_monthly_revenue.csv"),  # store summary
)
for frame, target in outputs:
    frame.write.mode("overwrite").csv(target, header=True)

print("🎉 Outputs saved: underperforming_products.csv & store_avg_monthly_revenue.csv")


🎉 Outputs saved: underperforming_products.csv & store_avg_monthly_revenue.csv
