In [1]:
!pip install --upgrade pip
!pip install jupyter pandas matplotlib openpyxl ipykernel




In [2]:
# I create a small Australian dataset so I have clear evidence today.
from pathlib import Path

import pandas as pd

states = ["NSW","VIC","QLD","WA","SA","TAS","ACT","NT"]

data = pd.DataFrame({
    "State": states,
    # Every row is the same treatment; sizing this from the state list keeps
    # the columns the same length if I add or remove a state.
    "Treatment": ["Hip Replacement"] * len(states),
    "Average_Cost_AUD": [24000,23500,22000,23000,22500,21000,24500,20500],
    "Outcome_Score": [0.90,0.88,0.84,0.86,0.83,0.80,0.92,0.78]
})

# I save my raw-like input so my repo shows a data file I produced.
# to_csv does not create missing folders, so I make sure ../data exists first;
# otherwise this cell fails on a fresh clone that has no data directory yet.
raw_path = Path("../data/treatment_efficiency_raw_au.csv")
raw_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(raw_path, index=False)

data.head()


Unnamed: 0,State,Treatment,Average_Cost_AUD,Outcome_Score
0,NSW,Hip Replacement,24000,0.9
1,VIC,Hip Replacement,23500,0.88
2,QLD,Hip Replacement,22000,0.84
3,WA,Hip Replacement,23000,0.86
4,SA,Hip Replacement,22500,0.83


In [3]:
# I give the columns analysis-friendly names; "State" keeps its original name.
column_renames = {
    "Treatment": "treatment_name",
    "Average_Cost_AUD": "cost",
    "Outcome_Score": "outcome_score",
}
df = data.rename(columns=column_renames).copy()

# One row per (treatment, state): mean cost, mean outcome, and the number of
# outcome records behind those means. Then I derive cost per unit of outcome
# and order the table so the lowest cost-per-outcome rows come first.
per_state = df.groupby(["treatment_name", "State"], as_index=False)
summary = (
    per_state.agg(
        avg_cost=("cost", "mean"),
        avg_outcome=("outcome_score", "mean"),
        cases=("outcome_score", "count"),
    )
    .assign(cost_per_outcome=lambda t: t["avg_cost"] / t["avg_outcome"])
    .sort_values("cost_per_outcome")
)

# I save the analysis table next to the raw file and preview the top rows.
summary.to_csv("../data/treatment_efficiency.csv", index=False)
summary.head(10)


Unnamed: 0,treatment_name,State,avg_cost,avg_outcome,cases,cost_per_outcome
3,Hip Replacement,QLD,22000.0,0.84,1,26190.47619
5,Hip Replacement,TAS,21000.0,0.8,1,26250.0
2,Hip Replacement,NT,20500.0,0.78,1,26282.051282
0,Hip Replacement,ACT,24500.0,0.92,1,26630.434783
1,Hip Replacement,NSW,24000.0,0.9,1,26666.666667
6,Hip Replacement,VIC,23500.0,0.88,1,26704.545455
7,Hip Replacement,WA,23000.0,0.86,1,26744.186047
4,Hip Replacement,SA,22500.0,0.83,1,27108.433735
