In [2]:
import pandas as pd
import plotly.express as px
# Load the customer analysis table from the project's data directory (path is
# relative to the notebook's working directory).
# NOTE(review): presumably one row per customer with a "total_gross_profit"
# column, which the following cells sort and accumulate on — confirm against the CSV.
df = pd.read_csv("../data/03_PK_customer_analysis.csv")

In [3]:
# Order rows from highest to lowest gross profit, then derive the columns that
# drive the cumulative-contribution (Pareto) curve in a single chained pass.
df = (
    df.sort_values(by="total_gross_profit", ascending=False)
    .reset_index(drop=True)
    .assign(
        # 1-based position of each row in the profit ranking
        customer_rank=lambda ranked: ranked.index + 1,
        # fraction of all rows covered up to and including this one
        customer_pct=lambda ranked: ranked["customer_rank"] / len(ranked),
        # running fraction of total gross profit captured so far
        cumulative_profit_pct=lambda ranked: (
            ranked["total_gross_profit"].cumsum()
            / ranked["total_gross_profit"].sum()
        ),
    )
)

In [4]:
# Cumulative-contribution curve: fraction of customers included (x, ranked from
# most to least profitable) against the fraction of total profit they account for (y).
fig = px.line(
    df,
    x="customer_pct",
    y="cumulative_profit_pct",
    title="Cumulative Profit Contribution by Customers",
    labels={
        "customer_pct": "Cumulative % of Customers",
        "cumulative_profit_pct": "Cumulative % of Total Profit"
    }
)

# Dashed reference lines at 80% of profit and 20% of customers, so the curve can
# be compared against the classic 80/20 split.
fig.add_hline(y=0.8, line_dash="dash")
fig.add_vline(x=0.2, line_dash="dash")

fig.update_layout(
    width=800,
    height=500,
    # Both plotted columns are fractions of a total, while the axis labels say
    # "%"; format the ticks as percentages so the scale matches the labels.
    xaxis_tickformat=".0%",
    yaxis_tickformat=".0%",
)

fig.show()


- Customers are ranked from left to right based on how much profit they generate. 
- The solid curve shows how total profit accumulates as we include more customers; the dashed lines mark the 80%-of-profit and 20%-of-customers reference levels.
- Profit accumulates very quickly at the beginning, meaning a small group of customers contributes a large share of total profit: roughly 20% of the customers generate up to 50% of the profit.
- This supports the hypothesis that a small subset of customers contributes a disproportionately large share of total profit