In [1]:
import difflib
import json
import os
import re
import sys

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
sys.path.append("../..")

from src.utils.data.claims_denials import (
    get_overturn_rate,
)

INPUT_DATA_DIR = "./input_data"
OUTPUT_IMAGE_DIR = "./images/ma"
OUTPUT_DATA_DIR = "./output_data/ma"  # Set to None if you don't want to save non-required data for external use
NO_PLOT_TITLES = False  # If True, plots have no titles. Useful for cases where one wants titles rendered separately from underlying plots.

In [2]:
# TODO: download data via script to local, if necessary
df = pd.read_csv(os.path.join(INPUT_DATA_DIR, "medicare_qic_partc.csv"))

In [3]:
df.head()

Unnamed: 0,decision_number,part,decision_date,decision,appeal_type,_condition,item_service,decision_rationale,coverage_rules,related_reference_id
0,QIC20-000006,Part C,01/02/2020,Unfavorable,Imaging,,Mammography,We decided that the Plan does not have to pre-...,The rules say that plans must pay for a medica...,
1,QIC20-000016,Part C,01/02/2020,Unfavorable,Inpatient Hospital,,Acute Inpatient,We decided that the Plan does not have to pay ...,The rules say that plans must pay for a medica...,
2,QIC20-000015,Part C,01/02/2020,Unfavorable,Surgery,,Other - Surgery,We decided that the Plan does not have to pre-...,The rules say that plans must pay for a medica...,
3,QIC20-000014,Part C,01/02/2020,Unfavorable,DME/ Orthotics,,Pneumatic Compression Device,We decided that the Plan does not have to prea...,The rules say that plans must pay for a medica...,
4,QIC20-000012,Part C,01/02/2020,Favorable,Practitioner Services,,Injections,We decided that the Plan has to pre-approve sa...,The rules say that plans must pay for a medica...,


In [4]:
num_appeals = len(df)
complete_overturn_rate = get_overturn_rate(df,"decision", "Favorable") # TODO: Verify favorable means favorable to insured. Ambiguous descriptor.
overturn_rate = (df["decision"].value_counts()['Favorable'] + df["decision"].value_counts()['Partially Favorable']) / df["decision"].value_counts()['Unfavorable']
print(f"Number of appeals: {num_appeals}")
print(f"Overturn rate (complete overturn): {complete_overturn_rate}")
print(f"Overturn rate (complete + partial): {overturn_rate}")

Number of appeals: 499286
Overturn rate (complete overturn): 0.04534675516637759
Overturn rate (complete + partial): 0.05273533864390627


In [5]:
# Split dates
df["decision_year"] = df["decision_date"].apply(lambda x: int(x.split("/")[-1]))
df["decision_month"] = df["decision_date"].apply(lambda x: int(x.split("/")[0]))
df["decision_day"] = df["decision_date"].apply(lambda x: int(x.split("/")[1]))

def generate_summary_df(df, group_col):
    """Get df of counts and stats by groups specified by group_col.
    """
    # Group by 'date' and calculate the required statistics
    summary_df = df.groupby(group_col).agg(
        num_appeals=('decision_number', 'count'),
        num_favorable=('decision', lambda x: (x == 'Favorable').sum()),
        num_partially_favorable=('decision', lambda x: (x == 'Partially Favorable').sum()),
        num_unfavorable=('decision', lambda x: (x == 'Unfavorable').sum()),
        num_withdrawn=('decision', lambda x: (x == 'Withdrawn').sum()),
        overturn_rate=('decision', lambda x: (x == 'Favorable').sum() / len(x)),
        partial_overturn_rate=('decision', lambda x: (x == 'Partially Favorable').sum() / len(x)),
    ).reset_index()
    return summary_df


def generate_json_summary(df):
    """Get dict of counts and stats by groups specified by group_col.
    """

    # Convert the summary DataFrame to JSON
    json_output = df.to_json(orient='records', lines=True)

    # Load JSON string to a list of dictionaries
    json_list = [json.loads(line) for line in json_output.split('\n') if line]

    return json_list

In [6]:
# Yearly summary
json_summary = generate_json_summary(generate_summary_df(df, "decision_year"))
json_summary

[{'decision_year': 2018,
  'num_appeals': 1,
  'num_favorable': 1,
  'num_partially_favorable': 0,
  'num_unfavorable': 0,
  'num_withdrawn': 0,
  'overturn_rate': 1.0,
  'partial_overturn_rate': 0.0},
 {'decision_year': 2020,
  'num_appeals': 94882,
  'num_favorable': 4797,
  'num_partially_favorable': 449,
  'num_unfavorable': 89636,
  'num_withdrawn': 0,
  'overturn_rate': 0.0505575346,
  'partial_overturn_rate': 0.0047321937},
 {'decision_year': 2021,
  'num_appeals': 117465,
  'num_favorable': 4734,
  'num_partially_favorable': 487,
  'num_unfavorable': 112243,
  'num_withdrawn': 1,
  'overturn_rate': 0.0403013664,
  'partial_overturn_rate': 0.0041459158},
 {'decision_year': 2022,
  'num_appeals': 150261,
  'num_favorable': 6424,
  'num_partially_favorable': 618,
  'num_unfavorable': 143219,
  'num_withdrawn': 0,
  'overturn_rate': 0.0427522777,
  'partial_overturn_rate': 0.0041128437},
 {'decision_year': 2023,
  'num_appeals': 136677,
  'num_favorable': 6685,
  'num_partially_fav

In [7]:
def filter_df(df: pd.DataFrame, search_term: str, cols: list[str]):
    """Filter for sub_df matching certain pattern in any of cols.
    """
    # pattern = r"\b(?:" + re.escape(search_term)+ ")\b"
    pattern = r"(?:" + re.escape(search_term) + ")"
    mask = np.column_stack(
        [
            df[col].str.contains(pattern, na=False, flags=re.IGNORECASE)
            for col in cols
        ]
    )
    sub_df = df.loc[mask.any(axis=1)]
    return sub_df

In [10]:
# Attempt to dedupe, a bit (this is complicated in this context, only merge if a lot of overlap, print all merges)
def merge_similar_objects(data, threshold=0.8):
    merged_data = []
    merges = []

    for item in data:
        matched = False

        for merged_item in merged_data:
            similarity_ratio = difflib.SequenceMatcher(None, item['name'], merged_item['name']).ratio()

            if similarity_ratio > threshold:
                # Merge the items
                merged_item['num_appeals'] += item['num_appeals']
                merged_item['num_favorable'] += item['num_favorable']
                merged_item['num_partially_favorable'] += item['num_partially_favorable']
                merged_item['num_withdrawn'] += item['num_withdrawn']
                merged_item['num_unfavorable'] += item['num_unfavorable']
                merged_item['overturn_rate'] = merged_item["num_favorable"] / merged_item["num_appeals"]
                merged_item['partial_overturn_rate'] = merged_item["num_partially_favorable"] / merged_item["num_appeals"]
                matched = True
                merges.append([item["name"], merged_item["name"]])
                
                break

        if not matched:
            # If no match is found, add the item as is
            merged_data.append(item)

    return merged_data, merges

In [8]:
condition_df = generate_summary_df(df, group_col="_condition")

In [11]:
# Restrict to conditions appealed more than Threshold times
APPEAL_THRESHOLD = 50
sub_df = condition_df[condition_df["num_appeals"] > APPEAL_THRESHOLD]

In [12]:
# Sort DF by overturn rate.
sub_df = sub_df.sort_values(by="overturn_rate", ascending=False)

# Enforce consistent capitalization
sub_df["name"] = sub_df["_condition"].str.title()

# Convert to dict/json
json_summary = generate_json_summary(sub_df)

In [13]:
deduped_summary, merges = merge_similar_objects(json_summary)

# Retain only what we show in visualization
for rec in deduped_summary:
    del rec["_condition"]
    del rec['num_favorable']
    del rec['num_partially_favorable']
    del rec['num_withdrawn']
    del rec['num_unfavorable']

In [17]:
service_df = generate_summary_df(df, group_col="item_service")

# Restrict to services appealed more than Threshold times
APPEAL_THRESHOLD = 50
sub_df = service_df[service_df["num_appeals"] > APPEAL_THRESHOLD]

# Sort DF by overturn rate.
sub_df = sub_df.sort_values(by="overturn_rate", ascending=False)

# Enforce consistent capitalization
sub_df["name"] = sub_df["item_service"].str.title()

# Convert to dict/json
json_summary = generate_json_summary(sub_df)


deduped_summary, merges = merge_similar_objects(json_summary)

# Retain only what we show in visualization
for rec in deduped_summary:
    del rec["item_service"]
    del rec['num_favorable']
    del rec['num_partially_favorable']
    del rec['num_withdrawn']
    del rec['num_unfavorable']

## Part D

In [19]:
df = pd.read_csv(os.path.join(INPUT_DATA_DIR, "medicare_qic_partd.csv"))

In [20]:
df.head()

Unnamed: 0,decision_number,part,decision_date,decision_date_sortable,decision,appeal_type,_condition,drug,decision_rationale,coverage_rules
0,QIC20-002141,Part D-Drug,01/08/2020,20200108,Unfavorable,Prescription Drug,,Butalbital Acetaminophen Caffeine,Medicare rules require a Part D Plan to issue ...,Citations used in this letter come from Sectio...
1,QIC20-002144,Part D-Drug,01/06/2020,20200106,Unfavorable,Prescription Drug,,Oxycontin ER,Medicare rules require a Part D Plan to issue ...,Citations used in this letter come from Sectio...
2,QIC20-002145,Part D-Drug,01/13/2020,20200113,Unfavorable,Prescription Drug,,Adderall,Citations used in this letter come from Sectio...,Citations used in this letter come from Sectio...
3,QIC20-002146,Part D-Drug,01/03/2020,20200103,Unfavorable,Prescription Drug,,Viokace,You asked the Plan to cover Viokace. The Plan ...,Citations used in this letter come from Sectio...
4,QIC20-002147,Part D-Drug,01/06/2020,20200106,Unfavorable,Prescription Drug,Dermatitis unspecified,Fluocinonide,Medicare rules require a Part D Plan to issue ...,Citations used in this letter come from Sectio...


In [21]:
num_appeals = len(df)
complete_overturn_rate = get_overturn_rate(df,"decision", "Favorable") # TODO: Verify favorable means favorable to insured. Ambiguous descriptor.
overturn_rate = (df["decision"].value_counts()['Favorable'] + df["decision"].value_counts()['Partially Favorable']) / df["decision"].value_counts()['Unfavorable']
print(f"Number of appeals: {num_appeals}")
print(f"Overturn rate (complete overturn): {complete_overturn_rate}")
print(f"Overturn rate (complete + partial): {overturn_rate}")

Number of appeals: 113624
Overturn rate (complete overturn): 0.06308526367668803
Overturn rate (complete + partial): 0.07023839799561069


In [22]:
# Split dates
df["decision_year"] = df["decision_date"].apply(lambda x: int(x.split("/")[-1]))
df["decision_month"] = df["decision_date"].apply(lambda x: int(x.split("/")[0]))
df["decision_day"] = df["decision_date"].apply(lambda x: int(x.split("/")[1]))

In [23]:
# Yearly summary
json_summary = generate_json_summary(generate_summary_df(df, "decision_year"))
json_summary

[{'decision_year': 2020,
  'num_appeals': 23738,
  'num_favorable': 2239,
  'num_partially_favorable': 166,
  'num_unfavorable': 21333,
  'num_withdrawn': 0,
  'overturn_rate': 0.0943213413,
  'partial_overturn_rate': 0.006993007},
 {'decision_year': 2021,
  'num_appeals': 25491,
  'num_favorable': 1271,
  'num_partially_favorable': 16,
  'num_unfavorable': 24204,
  'num_withdrawn': 0,
  'overturn_rate': 0.0498607352,
  'partial_overturn_rate': 0.0006276725},
 {'decision_year': 2022,
  'num_appeals': 34771,
  'num_favorable': 1858,
  'num_partially_favorable': 54,
  'num_unfavorable': 32859,
  'num_withdrawn': 0,
  'overturn_rate': 0.053435334,
  'partial_overturn_rate': 0.0015530183},
 {'decision_year': 2023,
  'num_appeals': 29624,
  'num_favorable': 1800,
  'num_partially_favorable': 53,
  'num_unfavorable': 27771,
  'num_withdrawn': 0,
  'overturn_rate': 0.0607615447,
  'partial_overturn_rate': 0.0017890899}]

In [24]:
condition_df = generate_summary_df(df, group_col="_condition")

In [25]:
# Restrict to conditions appealed more than Threshold times
APPEAL_THRESHOLD = 50
sub_df = condition_df[condition_df["num_appeals"] > APPEAL_THRESHOLD]

In [26]:
# Sort DF by overturn rate.
sub_df = sub_df.sort_values(by="overturn_rate", ascending=False)

# Enforce consistent capitalization
sub_df["name"] = sub_df["_condition"].str.title()

# Convert to dict/json
json_summary = generate_json_summary(sub_df)

In [27]:
deduped_summary, merges = merge_similar_objects(json_summary)

# Retain only what we show in visualization
for rec in deduped_summary:
    del rec["_condition"]
    del rec['num_favorable']
    del rec['num_partially_favorable']
    del rec['num_withdrawn']
    del rec['num_unfavorable']

In [30]:

service_df = generate_summary_df(df, group_col="drug")

# Restrict to services appealed more than Threshold times
APPEAL_THRESHOLD = 50
sub_df = service_df[service_df["num_appeals"] > APPEAL_THRESHOLD]

# Sort DF by overturn rate.
sub_df = sub_df.sort_values(by="overturn_rate", ascending=False)

# Enforce consistent capitalization
sub_df["name"] = sub_df["drug"].str.title()

# Convert to dict/json
json_summary = generate_json_summary(sub_df)


deduped_summary, merges = merge_similar_objects(json_summary)

# Retain only what we show in visualization
for rec in deduped_summary:
    del rec["drug"]
    del rec['num_favorable']
    del rec['num_partially_favorable']
    del rec['num_withdrawn']
    del rec['num_unfavorable']