# Pareto Principle
- 80% of your sales come from 20% of your products

In [0]:
%sql
-- Point the session at the target catalog.
USE CATALOG workspace;

-- Make sure the practice schema exists (no-op when it is already there).
CREATE SCHEMA IF NOT EXISTS sql_pyspark_practice;

-- Resolve unqualified table names against the practice schema from here on.
USE SCHEMA sql_pyspark_practice;

In [0]:
%sql
-- Pareto analysis: list the best-selling products whose cumulative sales stay
-- within 80% of total sales.
--
-- Reference check for the threshold (value noted when this was written):
--   select sum(sales) * 0.8 from orders   --> 1837760.7771199604

with product_wise_sales as (
  -- Total sales per product.
  select
    product_id,
    sum(sales) as product_sales
  from orders
  group by product_id
),
calc_sales as (
  select
    product_id,
    product_sales,
    -- Running total from the best seller downwards. An explicit ROWS frame is
    -- required: the default frame with ORDER BY is RANGE, which gives every
    -- product tied on product_sales the same cumulative value. product_id is
    -- a tie-breaker so the running total is deterministic.
    sum(product_sales) over (
      order by product_sales desc, product_id
      rows between unbounded preceding and current row
    ) as running_sales,
    -- NOTE: despite its name, this column holds the 80% threshold
    -- (0.8 * grand total), not the grand total itself. The name is kept so
    -- the output columns stay unchanged.
    0.8 * sum(product_sales) over () as total_sales
  from product_wise_sales
)

select
  product_id,
  product_sales,
  running_sales,
  total_sales
from calc_sales
-- Keep products while the running total is still within the threshold; the
-- product whose sales push the running total past 80% is not included.
where running_sales <= total_sales
order by product_sales desc, product_id;

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Pareto analysis in PySpark: find the best-selling products whose cumulative
# sales stay within PARETO_SHARE of total sales.

# Share of total sales that defines the Pareto cut-off.
PARETO_SHARE = 0.8

orders_df = spark.table("orders")

# Step 1: Compute total sales per product
product_wise_sales = (
    orders_df.groupBy("product_id")
             .agg(F.sum("sales").alias("product_sales"))
)

# Step 2: Window for running cumulative sales (sorted by sales desc).
# product_id is a tie-breaker: with a ROWS frame, products that have equal
# product_sales would otherwise be accumulated in an arbitrary order, making
# running_sales (and therefore the filter below) non-deterministic.
# NOTE: the window has no partitionBy, so Spark evaluates it in a single
# partition; this runs on the per-product aggregate, not on raw orders.
running_window = (
    Window.orderBy(F.desc("product_sales"), F.col("product_id"))
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Step 3: Window for total sales (all rows)
total_window = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

calc_sales = (
    product_wise_sales
        .withColumn("running_sales", F.sum("product_sales").over(running_window))
        # NOTE: despite its name, "total_sales" holds the threshold
        # (PARETO_SHARE * grand total), not the grand total itself. The name
        # is kept so the output columns stay unchanged.
        .withColumn("total_sales", PARETO_SHARE * F.sum("product_sales").over(total_window))
)

# Step 4: Filter products contributing to 80% of sales.
# The product whose sales push the running total past the threshold is not
# included. Rows are returned from best seller downwards.
result = (
    calc_sales
        .filter(F.col("running_sales") <= F.col("total_sales"))
        .orderBy(F.desc("product_sales"), F.col("product_id"))
)

display(result)
