In [1]:
import pandas as pd
# Load the raw opportunity export and preview its first five rows.
df = pd.read_excel(io="Opportunity Details.xlsx", engine="openpyxl")
df.head(n=5)

Unnamed: 0,Index,-SUM([Annl Agg Savings - treated]),Customer,Super Payer Short,Payer Short,Solution,LOB,Product Type,Medical Policy,Topic,...,Dp Desc,DP Age in Years,Disposition,Disposition Date,Cpw Reason,Decision Status,Decision Date,Model Recommendation,Annl Edits,Annl Agg Savings
0,Grand Total,Total,Total,Total,Total,Total,Total,Total,Total,Total,...,Total,Total,Total,Total,Total,Total,Total,Total,26492900.0,1323582000.0
1,1,-103495450.318931,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Duplicate Services Policy,Duplicate Claim Logic for Claims Submitted on ...,...,Deny duplicate claim lines using the 11 basic ...,13,Invalid,Jul-2025,"[""Mutually Exclusive DP (e.g. CMS vs CMS + Cot...",Reject,Oct-2018,More Likely,274529.0,103495500.0
2,2,-97478904.4027787,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Bundled Services Policy,Packaged Services for the Outpatient Hospital,...,Deny packaged HCPCS codes (Status indicator N)...,13,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",No Decision,Apr-2020,Less Likely,561179.4,97478900.0
3,3,-59890843.9398087,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICM,Duplicate Services Policy,Duplicate Claims From Any Provider ID Under Sa...,...,Deny duplicate claim lines using the nine basi...,13,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",Reject,Aug-2018,More Likely,890145.8,59890840.0
4,4,-51834723.6435765,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Bundled Services Policy,Packaged Revenue Codes for the Outpatient Hosp...,...,Deny packaged revenue codes when billed withou...,13,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",,Never Presented,Less Likely,188976.0,51834720.0


In [2]:
# STEP 2: remove the "Grand Total" row, clean columns, and join PolicyCollection + PolicyGroup from mapping file
#
# This cell reassigns `df`, so every step is written to be safe to re-run:
# a second execution must not discard another data row or duplicate the
# joined columns.

# 1) Drop the total row. It is recognised by its "Grand Total" label in the
#    "Index" column instead of by position; once "Index" has been dropped
#    (i.e. on a re-run of this cell) nothing more is removed.
if "Index" in df.columns:
    is_total_row = df["Index"].astype(str).str.strip().eq("Grand Total")
    df = df.loc[~is_total_row].reset_index(drop=True)

# 2) Drop unwanted columns
df = df.drop(columns=["Index", "-SUM([Annl Agg Savings - treated])"], errors="ignore")

# 3) Join PolicyCollection + PolicyGroup from Policy Collection Mapping (by DP Key)
df_map = pd.read_excel(
    "Policy Collection Mapping.xlsx",
    engine="openpyxl",
    usecols=["DPKey", "PolicyCollection", "PolicyGroup"]
)

# Coerce both keys to numeric so the join does not depend on how Excel typed them.
df_map["DPKey"] = pd.to_numeric(df_map["DPKey"], errors="coerce")
df["Dp Key"] = pd.to_numeric(df["Dp Key"], errors="coerce")

# One mapping row per key. Keys that failed to parse are NaN and are removed:
# merge() matches NaN to NaN, so keeping them would attach an arbitrary mapping
# row to every opportunity whose own Dp Key is missing.
df_map_unique = df_map.dropna(subset=["DPKey"]).drop_duplicates("DPKey")

df = (
    # Remove the joined columns left behind by an earlier run of this cell,
    # otherwise the rename below would produce duplicate column names.
    df.drop(columns=["Policy Collection", "Policy Group"], errors="ignore")
      # validate guards against the join ever multiplying opportunity rows.
      .merge(df_map_unique, how="left", left_on="Dp Key", right_on="DPKey", validate="many_to_one")
      .drop(columns=["DPKey"])
      .rename(columns={"PolicyCollection": "Policy Collection", "PolicyGroup": "Policy Group"})
)


In [3]:
# Sanity check after the join: frame dimensions, the full column list, then a preview.
print(df.shape, df.columns.tolist(), sep="\n")
df.head()

(14108, 21)
['Customer', 'Super Payer Short', 'Payer Short', 'Solution', 'LOB', 'Product Type', 'Medical Policy', 'Topic', 'Dp Key', 'Dp Desc', 'DP Age in Years', 'Disposition', 'Disposition Date  ', 'Cpw Reason', 'Decision Status', 'Decision Date', 'Model Recommendation', 'Annl Edits', 'Annl Agg Savings', 'Policy Collection', 'Policy Group']


Unnamed: 0,Customer,Super Payer Short,Payer Short,Solution,LOB,Product Type,Medical Policy,Topic,Dp Key,Dp Desc,...,Disposition,Disposition Date,Cpw Reason,Decision Status,Decision Date,Model Recommendation,Annl Edits,Annl Agg Savings,Policy Collection,Policy Group
0,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Duplicate Services Policy,Duplicate Claim Logic for Claims Submitted on ...,3324,Deny duplicate claim lines using the 11 basic ...,...,Invalid,Jul-2025,"[""Mutually Exclusive DP (e.g. CMS vs CMS + Cot...",Reject,Oct-2018,More Likely,274529.001557,103495500.0,,
1,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Bundled Services Policy,Packaged Services for the Outpatient Hospital,11369,Deny packaged HCPCS codes (Status indicator N)...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",No Decision,Apr-2020,Less Likely,561179.364133,97478900.0,,
2,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICM,Duplicate Services Policy,Duplicate Claims From Any Provider ID Under Sa...,71,Deny duplicate claim lines using the nine basi...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",Reject,Aug-2018,More Likely,890145.804387,59890840.0,,
3,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Bundled Services Policy,Packaged Revenue Codes for the Outpatient Hosp...,3518,Deny packaged revenue codes when billed withou...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",,Never Presented,Less Likely,188976.00304,51834720.0,,
4,Highmark,HIGAL,HIGPA,PPM,Commercial,ICMO,Bundled Services Policy,Packaged Services for the Outpatient Hospital,11369,Deny packaged HCPCS codes (Status indicator N)...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",No Decision,Apr-2020,Less Likely,336132.710468,49373890.0,,


In [4]:
# STEP 3: Format Annl Edits + Annl Agg Savings as whole numbers and display

# Both measures get the same treatment: coerce to numeric, round to zero
# decimals, and store as nullable integers so missing values survive.
for whole_number_col in ("Annl Edits", "Annl Agg Savings"):
    if whole_number_col not in df.columns:
        continue
    as_number = pd.to_numeric(df[whole_number_col], errors="coerce")
    df[whole_number_col] = as_number.round(0).astype("Int64")

# Display formatting for float columns (no scientific notation, no decimals)
pd.set_option("display.float_format", lambda value: f"{value:,.0f}")

df.head(10)

Unnamed: 0,Customer,Super Payer Short,Payer Short,Solution,LOB,Product Type,Medical Policy,Topic,Dp Key,Dp Desc,...,Disposition,Disposition Date,Cpw Reason,Decision Status,Decision Date,Model Recommendation,Annl Edits,Annl Agg Savings,Policy Collection,Policy Group
0,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Duplicate Services Policy,Duplicate Claim Logic for Claims Submitted on ...,3324,Deny duplicate claim lines using the 11 basic ...,...,Invalid,Jul-2025,"[""Mutually Exclusive DP (e.g. CMS vs CMS + Cot...",Reject,Oct-2018,More Likely,274529,103495450,,
1,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Bundled Services Policy,Packaged Services for the Outpatient Hospital,11369,Deny packaged HCPCS codes (Status indicator N)...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",No Decision,Apr-2020,Less Likely,561179,97478904,,
2,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICM,Duplicate Services Policy,Duplicate Claims From Any Provider ID Under Sa...,71,Deny duplicate claim lines using the nine basi...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",Reject,Aug-2018,More Likely,890146,59890844,,
3,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Bundled Services Policy,Packaged Revenue Codes for the Outpatient Hosp...,3518,Deny packaged revenue codes when billed withou...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",,Never Presented,Less Likely,188976,51834724,,
4,Highmark,HIGAL,HIGPA,PPM,Commercial,ICMO,Bundled Services Policy,Packaged Services for the Outpatient Hospital,11369,Deny packaged HCPCS codes (Status indicator N)...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",No Decision,Apr-2020,Less Likely,336133,49373893,,
5,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICMO,Bundled Services Policy,Packaged Services for the Outpatient Hospital,9065,Deny conditionally packaged laboratory service...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",No Decision,Apr-2020,Less Likely,512080,23659795,,
6,Highmark,HIGAL,HIGAC,PPM,Commercial,ICMO,Bundled Services Policy,Packaged Services for the Outpatient Hospital,11369,Deny packaged HCPCS codes (Status indicator N)...,...,Invalid,Jul-2025,"[""Claims System Limitation (e.g. Client does n...",No Decision,Apr-2020,Likely,129219,21551929,,
7,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICM,Durable Medical Equipment and Supplies Policy,DME Quality of Care,4651,Deny any code billed by a DME provider when th...,...,Invalid,Jul-2025,"[""Primary Reference Not Accepted by the Client""]",No Decision,Jan-2021,Less Likely,54557,20846850,,
8,Highmark,HIGAL,HIGPB,PPM,BlueCard,ICM,Drug and Biological Policy Processing and Poli...,Drug Wastage Modifiers JW and JZ,17227,Deny a drug supplied in only single-dose formu...,...,Invalid,Aug-2025,"[""Inappropriate for Line of Business""]",Reject,Oct-2024,Not Recommended,30301,19178174,Drug wastage,Drug Wastage
9,Highmark,HIGAL,HIGPA,PPM,Commercial,ICM,Durable Medical Equipment and Supplies Policy,DME Quality of Care,4651,Deny any code billed by a DME provider when th...,...,Invalid,Jul-2025,"[""Primary Reference Not Accepted by the Client""]",Reject,Jan-2021,Less Likely,73834,17511964,,


In [5]:
# STEP 4: Define filter dropdown options + apply filters + create pivot (based on selections)

FILTER_COLS = ["Recently presented", "LOB", "Decision Status", "Decision Date"]

# Dropdown options: for every filter column that exists in df, the sorted
# distinct values after trimming whitespace and discarding blanks.
filter_options = {}
for filter_col in FILTER_COLS:
    if filter_col not in df.columns:
        continue
    labels = df[filter_col].dropna().astype(str).str.strip()
    filter_options[filter_col] = sorted(labels[labels.ne("")].unique().tolist())

# Choose filters here (single value OR list of values). Use "All" or None to skip a filter.
selected_filters = {
    "Recently presented": "All",
    "LOB": ["Medicare", "Commercial"],          # example multi-select
    "Decision Status": ["Reject", "No Decision", "Suppress"],
    "Decision Date": "All"
}


In [6]:
# Apply the selected filters to a copy of the base frame.
# A filter is skipped when its column is absent, its value is None, or its
# value is the string "All" (case-insensitive). Comparison is done on the
# trimmed string form of both the column values and the selections.
df_f = df.copy()
for c, v in selected_filters.items():
    if c not in df_f.columns or v is None or (isinstance(v, str) and v.strip().lower() == "all"):
        continue
    s = df_f[c].astype(str).str.strip()
    if isinstance(v, (list, tuple, set)):
        df_f = df_f[s.isin([str(x).strip() for x in v])]
    else:
        df_f = df_f[s == str(v).strip()]

# Create the pivoted frame from the filtered data.
# dropna=False is essential: "Policy Collection" is empty for every DP that is
# missing from the mapping file, and groupby discards rows with a missing group
# key by default, which would silently remove those rows' edits and savings
# from the sums.
df_pivot = (
    df_f.groupby(
            ["Topic", "Policy Collection", "Dp Key", "Payer Short"],
            as_index=False,
            dropna=False,
        )[["Annl Edits", "Annl Agg Savings"]]
        .sum()
        .rename(columns={"Annl Edits": "Sum of Annl Edits", "Annl Agg Savings": "Sum of Annl Agg Savings"})
)

df_pivot.head(10)


Unnamed: 0,Topic,Policy Collection,Dp Key,Payer Short,Sum of Annl Edits,Sum of Annl Agg Savings
0,Ado-Trastuzumab Emtansine (J9354),Laboratory,8489,HIGAC,1,17521
1,"Aflibercept (J0178, Q5147, Q5149, Q5150, Q5153...",All indications,8075,HIGNY,3,1966
2,"Alpha 1-Proteinase Inhibitor (J0256, J0257)",All indications,6506,HIGAC,3,2608
3,"Alpha 1-Proteinase Inhibitor (J0256, J0257)",All indications,6506,HIGPA,201,252405
4,"Alpha 1-Proteinase Inhibitor (J0256, J0257)",Dual diagnosis,11674,HIGAC,194,189607
5,"Alpha 1-Proteinase Inhibitor (J0256, J0257)",Dual diagnosis,11674,HIGDE,49,34614
6,"Alpha 1-Proteinase Inhibitor (J0256, J0257)",Dual diagnosis,11674,HIGPA,1184,716739
7,"Alpha 1-Proteinase Inhibitor (J0256, J0257)",Dual diagnosis,11674,HIGWV,50,72941
8,Aripiprazole (Abilify Maintena) (J0401),All indications,11726,HIGNY,9,11723
9,Aripiprazole (Abilify Maintena) (J0401),All indications,11726,HIGPA,30,46620


In [7]:

# ---------- Build a payer-agnostic pivot (Topic + DP Key) ----------
# NOTE(review): the same pivot and Topic_Dp_Count load are repeated in the
# cell that builds df_to_present.
# Work with either the renamed "Sum of ..." measures or the raw measure names.
if "Sum of Annl Edits" in df_pivot.columns:
    edits_col = "Sum of Annl Edits"
else:
    edits_col = "Annl Edits"

if "Sum of Annl Agg Savings" in df_pivot.columns:
    sav_col = "Sum of Annl Agg Savings"
else:
    sav_col = "Annl Agg Savings"

payer_agnostic_sums = df_pivot.groupby(["Topic", "Dp Key"], as_index=False)[[edits_col, sav_col]].sum()
df_piv2 = payer_agnostic_sums.rename(columns={edits_col: "Annl Edits", sav_col: "Annl Agg Savings"})

# ---------- Topic_Dp_Count (drives <=10 vs >10 logic) ----------
tdc = pd.read_excel("Topic_Dp_Count.xlsx", engine="openpyxl")
tdc.columns = [header.strip() for header in tdc.columns]  # safety
tdc = tdc.rename(columns={"Dp key": "Dp Key"})
for numeric_col in ("Dp Key", "Topic_Dp_Count"):
    tdc[numeric_col] = pd.to_numeric(tdc[numeric_col], errors="coerce")

# One count per topic (the maximum seen), then split topics at the 10-DP line.
topic_counts = tdc.groupby("Topic", as_index=False)["Topic_Dp_Count"].max()
at_most_ten = topic_counts["Topic_Dp_Count"] <= 10
more_than_ten = topic_counts["Topic_Dp_Count"] > 10
small_topics = set(topic_counts.loc[at_most_ten, "Topic"])
large_topics = set(topic_counts.loc[more_than_ten, "Topic"])
# ---------- Decision Date lookup for comments (from filtered base df_f) ----------
def _pick_decision_date(s: pd.Series) -> str:
    vals = s.dropna().astype(str).str.strip()
    vals = vals[vals.ne("")]
    if vals.empty:
        return "Never presented"
    if (vals.str.lower().str.contains("never")).any():
        return "Never presented"
    dt = pd.to_datetime(vals, format="%b-%Y", errors="coerce")
    if dt.notna().any():
        return dt.max().strftime("%b-%Y")
    return vals.iloc[0]

# One comment per (Topic, Dp Key), derived from the Decision Date values of
# the filtered rows; Dp Key is coerced to numeric before grouping.
dp_key_numeric = pd.to_numeric(df_f["Dp Key"], errors="coerce")
decision_dates_by_dp = (
    df_f.assign(**{"Dp Key": dp_key_numeric})
        .groupby(["Topic", "Dp Key"], as_index=False)["Decision Date"]
        .agg(_pick_decision_date)
)
dec_map = decision_dates_by_dp.rename(columns={"Decision Date": "Comments"})


In [8]:
# ---- Building df_to_present

# 0) Topic+DP pivot (payer-agnostic) from df_pivot
# Work with either the renamed "Sum of ..." measures or the raw measure names.
if "Sum of Annl Edits" in df_pivot.columns:
    edits_col = "Sum of Annl Edits"
else:
    edits_col = "Annl Edits"

if "Sum of Annl Agg Savings" in df_pivot.columns:
    sav_col = "Sum of Annl Agg Savings"
else:
    sav_col = "Annl Agg Savings"

df_piv2 = (
    df_pivot.groupby(["Topic", "Dp Key"], as_index=False)[[edits_col, sav_col]]
            .sum()
            .rename(columns={edits_col: "Annl Edits", sav_col: "Annl Agg Savings"})
)
# Treat missing measures as zero so the "> 0" activity tests below are well defined.
for measure in ("Annl Edits", "Annl Agg Savings"):
    df_piv2[measure] = pd.to_numeric(df_piv2[measure], errors="coerce").fillna(0)

# 1) Read Topic_Dp_Count
tdc = pd.read_excel("Topic_Dp_Count.xlsx", engine="openpyxl")
tdc.columns = [header.strip() for header in tdc.columns]
tdc = tdc.rename(columns={"Dp key": "Dp Key"})
for numeric_col in ("Dp Key", "Topic_Dp_Count"):
    tdc[numeric_col] = pd.to_numeric(tdc[numeric_col], errors="coerce")

# One count per topic (the maximum seen), also exposed as a plain dict lookup.
topic_cnt = tdc.groupby("Topic", as_index=False)["Topic_Dp_Count"].max()
topic_cnt_map = dict(zip(topic_cnt["Topic"], topic_cnt["Topic_Dp_Count"]))

# 2) Decide which <=10 topics qualify (must have >=1 active DP with edits OR savings > 0)
has_activity = df_piv2["Annl Edits"].gt(0) | df_piv2["Annl Agg Savings"].gt(0)
active_topic = set(df_piv2.loc[has_activity, "Topic"].unique())

qualifies_as_small = (topic_cnt["Topic_Dp_Count"] <= 10) & topic_cnt["Topic"].isin(active_topic)
small_topics_ok = set(topic_cnt.loc[qualifies_as_small, "Topic"])
large_topics = set(topic_cnt.loc[topic_cnt["Topic_Dp_Count"] > 10, "Topic"])

# 3) Latest Decision Date + matching Decision Status per (Topic, Dp Key) from df_f
df_f2 = df_f.copy()
df_f2["Dp Key"] = pd.to_numeric(df_f2["Dp Key"], errors="coerce")
decision_text = df_f2["Decision Date"].astype(str).str.strip()
df_f2["_dd_dt"] = pd.to_datetime(decision_text, format="%b-%Y", errors="coerce")

# Sort by date within each (Topic, Dp Key) and keep the index of the last row.
# NOTE(review): sort_values places missing dates last by default, so a group
# with any unparseable Decision Date ends up with NaT here (and is later
# labelled "Never presented") - confirm that is the intended rule.
ordered_by_date = df_f2.sort_values(["Topic", "Dp Key", "_dd_dt"])
latest_idx = ordered_by_date.groupby(["Topic", "Dp Key"], as_index=False).tail(1).index

dec_latest = (
    df_f2.loc[latest_idx, ["Topic", "Dp Key", "Decision Status", "_dd_dt"]]
         .rename(columns={"_dd_dt": "DecisionDate_dt"})
         .assign(DecisionYear=lambda frame: frame["DecisionDate_dt"].dt.year)
)

def _comment_from_status(status, year, dt):
    s = "" if pd.isna(status) else str(status).strip()
    if pd.isna(dt):
        return "Never presented"
    if s.lower() == "no decision":
        return f"No Decision in ({int(year)})" if pd.notna(year) else "No Decision"
    if s.lower() == "reject":
        return f"Previously Rejected in ({int(year)})" if pd.notna(year) else "Previously Rejected"
    if s.lower() == "suppress":
        return f"Previously Suppressed in ({int(year)})" if pd.notna(year) else "Previously Suppressed"
    # else: keep date as the comment
    return dt.strftime("%b-%Y")

# Comment text for each (Topic, Dp Key), derived from its latest decision row.
dec_latest["Comments"] = dec_latest.apply(
    lambda row: _comment_from_status(row["Decision Status"], row["DecisionYear"], row["DecisionDate_dt"]),
    axis=1
)

# 4A) <=10 topics (ONLY those qualifying): include ALL DP keys -> "To complete topic"
small_topic_keys = (
    tdc.loc[tdc["Topic"].isin(small_topics_ok), ["Topic", "Dp Key"]]
       .dropna(subset=["Dp Key"])
       .drop_duplicates()
)
df_small = small_topic_keys.merge(df_piv2, on=["Topic", "Dp Key"], how="left")
# DPs with no pivot row have no measures after the left join: count them as 0.
for measure in ("Annl Edits", "Annl Agg Savings"):
    df_small[measure] = pd.to_numeric(df_small[measure], errors="coerce").fillna(0).astype("int64")
df_small["Comments"] = "To complete topic"
df_small["DecisionDate_dt"] = pd.NaT  # for Present logic

# 4B) >10 topics: include ONLY active DPs -> Comments from Decision Status/Date logic above
in_large_topic = df_piv2["Topic"].isin(large_topics)
is_active = (df_piv2["Annl Edits"] > 0) | (df_piv2["Annl Agg Savings"] > 0)
df_large = df_piv2.loc[in_large_topic & is_active].merge(
    dec_latest[["Topic", "Dp Key", "Comments", "DecisionDate_dt"]],
    on=["Topic", "Dp Key"],
    how="left",
)
df_large = df_large.astype({"Annl Edits": "int64", "Annl Agg Savings": "int64"})
# DPs without any decision row get the default comment.
df_large["Comments"] = df_large["Comments"].fillna("Never presented")

df_to_present = pd.concat([df_small, df_large], ignore_index=True)

# 5) Add Policy Collection (join on DP Key)
pc_map = pd.read_excel(
    "Policy Collection Mapping.xlsx",
    engine="openpyxl",
    usecols=["DPKey", "PolicyCollection"]   # adjust names if needed
)
# Coerce the key BEFORE de-duplicating: keys that differ only in how Excel
# typed them (text "123" vs number 123) become equal after coercion, and
# de-duplicating first would leave both rows and multiply rows in the join.
pc_map["DPKey"] = pd.to_numeric(pc_map["DPKey"], errors="coerce")
# Unparseable keys are NaN; drop them because merge() matches NaN to NaN.
pc_map = pc_map.dropna(subset=["DPKey"]).drop_duplicates("DPKey")

df_to_present = (
    # validate guards against the lookup ever multiplying df_to_present rows.
    df_to_present.merge(pc_map, how="left", left_on="Dp Key", right_on="DPKey", validate="many_to_one")
                 .drop(columns=["DPKey"])
                 .rename(columns={"PolicyCollection": "Policy Collection"})
)

# 6) Completing Topic (YES only for "To complete topic"; else bucket by Topic_Dp_Count in steps of 5)
def completing_topic_label(topic: str, comment: str, counts=None) -> str:
    """Return the "Completing Topic" label for one row.

    Parameters
    ----------
    topic   : topic name used to look up the topic's DP count.
    comment : the row's Comments value.
    counts  : optional mapping of topic -> DP count. When omitted, the
              notebook-level ``topic_cnt_map`` is used.

    Returns
    -------
    "YES" when the comment is "To complete topic" (case-insensitive, trimmed);
    "NO" when the topic has no usable count or a count of 10 or fewer;
    otherwise "NO- Too many DPs (N+)" where N is the count bucketed in steps
    of 5.
    """
    if str(comment).strip().lower() == "to complete topic":
        return "YES"
    lookup = topic_cnt_map if counts is None else counts
    n = lookup.get(topic, None)
    # A count that failed numeric coercion is NaN: it compares False against
    # 10 and would then crash in int(), so treat it like a missing count.
    if n is None or pd.isna(n) or n <= 10:
        return "NO"
    threshold = ((int(n) - 1) // 5) * 5  # 11-15->10, 16-20->15, 21-25->20, ...
    return f"NO- Too many DPs ({threshold}+)"

# Label every row via completing_topic_label (Topic + Comments).
df_to_present["Completing Topic"] = df_to_present.apply(
    lambda row: completing_topic_label(row["Topic"], row["Comments"]),
    axis=1
)

# 7) Present column
# A row is presented when it was never presented / completes a topic, or when
# its latest decision is at least 24 months old relative to today.
today = pd.Timestamp.today().normalize()
cutoff = today - pd.DateOffset(months=24)

c_lower = df_to_present["Comments"].astype(str).str.strip().str.lower()
present_by_comment = c_lower.isin(["never presented", "to complete topic"])
decided_on = df_to_present["DecisionDate_dt"]
present_by_age = decided_on.notna() & (decided_on <= cutoff)
df_to_present["Present"] = (present_by_comment | present_by_age).map({True: "YES", False: "NO"})

# Final column order
df_to_present = df_to_present[
    ["Topic", "Policy Collection", "Dp Key", "Annl Edits", "Annl Agg Savings", "Present", "Comments", "Completing Topic"]
]

df_to_present.head(20)


Unnamed: 0,Topic,Policy Collection,Dp Key,Annl Edits,Annl Agg Savings,Present,Comments,Completing Topic
0,"Autologous Cultured Chondrocytes, Implant (J7330)",Age,7366,0,0,YES,To complete topic,YES
1,"Autologous Cultured Chondrocytes, Implant (J7330)",Drug requires procedure,7364,0,0,YES,To complete topic,YES
2,"Autologous Cultured Chondrocytes, Implant (J7330)",Drug requires procedure,7367,0,0,YES,To complete topic,YES
3,"Autologous Cultured Chondrocytes, Implant (J7330)",Daily maximum units,7363,0,0,YES,To complete topic,YES
4,"Autologous Cultured Chondrocytes, Implant (J7330)",Procedure requires drug,7129,36,5258,YES,To complete topic,YES
5,"Autologous Cultured Chondrocytes, Implant (J7330)",Age,7368,0,0,YES,To complete topic,YES
6,"Autologous Cultured Chondrocytes, Implant (J7330)",Modifier required,14437,0,0,YES,To complete topic,YES
7,"Autologous Cultured Chondrocytes, Implant (J7330)",All indications,7365,0,0,YES,To complete topic,YES
8,BCG (Intravesical) (J9030),Daily maximum units,11533,0,0,YES,To complete topic,YES
9,BCG (Intravesical) (J9030),All indications,9966,0,0,YES,To complete topic,YES


In [39]:
import pandas as pd
import numpy as np

# --- Read + clean Benchmark ---
df_benchmark = pd.read_excel("Benchmark.xlsx", engine="openpyxl")

# CRITICAL: remove leading/trailing spaces from ALL benchmark column names.
# This runs BEFORE the drop below: the workbook's headers carry stray
# whitespace, and drop(..., errors="ignore") would silently keep a helper
# column whose header is padded.
df_benchmark.columns = df_benchmark.columns.astype(str).str.strip()
df_benchmark = df_benchmark.drop(columns=["Hide rows", "Level 1", "Level 2", "z_internal_Index"], errors="ignore")

# Exact benchmark column names AFTER strip()
bench_dp_col    = "DP Key"
bench_adopt_col = "Payer Adoption Rate"
bench_gpv_col   = "GPV %"
bench_apv_col   = "APV%"
bench_npv_col   = "NPV %"

# Fail fast with a clear message if the workbook layout has changed.
need = [bench_dp_col, bench_adopt_col, bench_gpv_col, bench_apv_col, bench_npv_col]
missing = [c for c in need if c not in df_benchmark.columns]
if missing:
    raise KeyError("Missing columns in Benchmark.xlsx after stripping spaces: {}".format(missing))

def to_num(s):
    """Convert a Series of number-like text to numeric values.

    Each value is converted to text, "%" and "," characters are removed,
    surrounding whitespace is trimmed, and the result is parsed; anything
    that still does not parse becomes NaN.
    """
    text = s.astype(str)
    for unwanted in ("%", ","):
        text = text.str.replace(unwanted, "", regex=False)
    return pd.to_numeric(text.str.strip(), errors="coerce")

# Minimal lookup: 1 row per DP Key (numeric key, first occurrence wins)
bench_lookup = (
    df_benchmark[need]
    .assign(**{bench_dp_col: pd.to_numeric(df_benchmark[bench_dp_col], errors="coerce")})
    .dropna(subset=[bench_dp_col])
    .drop_duplicates(subset=[bench_dp_col], keep="first")
)

# Parse the four metric columns to numbers.
for metric_col in (bench_adopt_col, bench_gpv_col, bench_apv_col, bench_npv_col):
    bench_lookup[metric_col] = to_num(bench_lookup[metric_col])

# Rename to the join key used by df_to_present plus temporary "_raw" metric names.
raw_names = {
    bench_dp_col: "Dp Key",
    bench_adopt_col: "Payer Adoption Rate_raw",
    bench_gpv_col:   "GPV_raw",
    bench_apv_col:   "APV_raw",
    bench_npv_col:   "NPV_raw"
}
bench_lookup = bench_lookup.rename(columns=raw_names)

# --- Ensure Dp Key numeric in df_to_present ---
df_to_present["Dp Key"] = pd.to_numeric(df_to_present["Dp Key"], errors="coerce")

# Mark Already in Prod (DP exists in benchmark).
# Missing keys are mapped to -1 so they can be cast to int for the lookup.
bench_dp_set = set(bench_lookup["Dp Key"].dropna().astype(int).unique())
present_keys_as_int = df_to_present["Dp Key"].fillna(-1).astype(int)
mask_in_prod = present_keys_as_int.isin(bench_dp_set)
df_to_present.loc[mask_in_prod, "Comments"] = "Already in Prod"

# Remove existing output cols if they already exist (prevents duplicates on re-run)
stale_outputs = ["Payer Adoption Rate", "GPV %", "APV %", "NPV %"]
df_to_present = df_to_present.drop(columns=stale_outputs, errors="ignore")

# Merge raw metrics in (stable alignment)
df_to_present = df_to_present.merge(bench_lookup, on="Dp Key", how="left")

# Format only when Comments == "Already in Prod"
mask_prod = df_to_present["Comments"].astype(str).str.strip().eq("Already in Prod")

def fmt_percent(series, decimals):
    """Format a numeric-like Series as percentage strings.

    Values are coerced to numbers. When the largest non-missing value is <= 1
    the whole Series is treated as fractions and multiplied by 100; otherwise
    it is taken to be in percent already. Missing values become "". When
    nothing parses, a Series of "" with the input's index is returned.

    NOTE(review): the fraction test looks only at the maximum, so a column of
    already-percent values that are all <= 1 (e.g. all negative) is still
    multiplied by 100 - confirm the inputs cannot look like that.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    observed = numeric.dropna()
    if observed.empty:
        return pd.Series([""] * len(series), index=series.index, dtype=object)

    if observed.max() <= 1:
        factor = 100  # fractions -> percent
    else:
        factor = 1    # already percent
    numeric = numeric * factor

    template = "{:." + str(decimals) + "f}%"
    return numeric.map(lambda value: template.format(value) if pd.notna(value) else "")

# Output column -> (raw metric column, decimals shown)
output_spec = [
    ("Payer Adoption Rate", "Payer Adoption Rate_raw", 2),
    ("GPV %", "GPV_raw", 3),
    ("APV %", "APV_raw", 3),
    ("NPV %", "NPV_raw", 3),
]

# Create final 4 columns (blank by default)
for out_col, _raw_col, _decimals in output_spec:
    df_to_present[out_col] = ""

# Fill them only for the "Already in Prod" rows.
for out_col, raw_col, decimals in output_spec:
    df_to_present.loc[mask_prod, out_col] = fmt_percent(df_to_present.loc[mask_prod, raw_col], decimals)

# Drop raw columns (no extras)
df_to_present = df_to_present.drop(
    columns=[raw_col for _out_col, raw_col, _decimals in output_spec],
    errors="ignore",
)

# FINAL: only your required output columns
final_cols = [
    "Topic", "Policy Collection", "Dp Key", "Annl Edits", "Annl Agg Savings",
    "Present", "Comments", "Completing Topic",
    "Payer Adoption Rate", "GPV %", "APV %", "NPV %"
]
df_to_present = df_to_present[final_cols]

df_to_present.head(20)


Unnamed: 0,Topic,Policy Collection,Dp Key,Annl Edits,Annl Agg Savings,Present,Comments,Completing Topic,Payer Adoption Rate,GPV %,APV %,NPV %
0,"Autologous Cultured Chondrocytes, Implant (J7330)",Age,7366,0,0,YES,Already in Prod,YES,3.11%,0.000%,,0.000%
1,"Autologous Cultured Chondrocytes, Implant (J7330)",Drug requires procedure,7364,0,0,YES,Already in Prod,YES,2.80%,0.000%,,0.000%
2,"Autologous Cultured Chondrocytes, Implant (J7330)",Drug requires procedure,7367,0,0,YES,Already in Prod,YES,2.80%,0.001%,0.000%,0.001%
3,"Autologous Cultured Chondrocytes, Implant (J7330)",Daily maximum units,7363,0,0,YES,Already in Prod,YES,3.11%,0.000%,,0.000%
4,"Autologous Cultured Chondrocytes, Implant (J7330)",Procedure requires drug,7129,36,5258,YES,Already in Prod,YES,3.11%,0.000%,-5.126%,0.000%
5,"Autologous Cultured Chondrocytes, Implant (J7330)",Age,7368,0,0,YES,Already in Prod,YES,2.80%,0.000%,,0.000%
6,"Autologous Cultured Chondrocytes, Implant (J7330)",Modifier required,14437,0,0,YES,Already in Prod,YES,1.24%,0.000%,,0.000%
7,"Autologous Cultured Chondrocytes, Implant (J7330)",All indications,7365,0,0,YES,Already in Prod,YES,3.11%,0.000%,,0.000%
8,BCG (Intravesical) (J9030),Daily maximum units,11533,0,0,YES,Already in Prod,YES,2.80%,0.000%,,0.000%
9,BCG (Intravesical) (J9030),All indications,9966,0,0,YES,Already in Prod,YES,3.11%,0.000%,-36.938%,0.000%


In [40]:
# Write the final table to an Excel workbook without the row index.
df_to_present.to_excel(excel_writer="df_to_present.xlsx", index=False)

In [34]:
# Inspect the benchmark column names as a plain list.
list(df_benchmark.columns)

['DP Key',
 'DP Desc',
 'Customer Count',
 'Customer Adoption Rate   ',
 'Super Payer Count',
 ' Super Payer Adoption Rate   ',
 'Payer Count',
 ' Payer Adoption Rate   ',
 'GPV %',
 'APV%',
 'Adjusted lines %',
 'NPV %',
 'Unapplied%',
 'GPV',
 'Adjustments',
 'NPV',
 'Unapplied',
 'Edits per 1000',
 'Orig Paid per Edit',
 'Adjusted Lines']