In [None]:
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from datetime import datetime


# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Put the directory two levels up on the import path; this has to run before the
# `src.*` imports that follow. NOTE(review): presumably that directory is the
# project root containing the `src` package — confirm.
sys.path.append("../..")

from src.utils.download import download_file_from_url
from src.utils.plot import plot_bar

In [None]:
# Location of the "Transparency in Coverage" PUF workbook.
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine without editing it.
puf_path = "/home/mike/data/practice_data/Transparency_in_Coverage_PUF.xlsx"
# sheet_name=1 selects the second sheet of the workbook by position.
puf = pd.read_excel(puf_path, sheet_name=1)
# 2021 plan year data
# NOTE(review): a later cell reads a sheet named "Transparency 2023 - Ind QHP"
# from this same file path — confirm which plan year this sheet really holds.
# Get rid of junk rows: the row at position 1 carries the real column names, so
# it becomes the header and only the rows after it are kept.
puf.columns = puf.iloc[1]
puf = puf.iloc[2:]

In [None]:
# A notebook cell only displays its last expression, so the three counts are
# gathered into one Series; evaluated on separate lines, the first two results
# would be computed and then silently discarded.
# nunique(dropna=False) counts a missing value as its own distinct value, which
# matches len(series.unique()).
pd.Series(
    {
        "rows": len(puf),
        "unique Issuer_Claims_Received": puf["Issuer_Claims_Received"].nunique(dropna=False),
        "unique Issuer_ID": puf["Issuer_ID"].nunique(dropna=False),
    }
)

In [1]:
def isolate_well_reported():
    """Placeholder: nothing is implemented yet, so a call simply returns None."""

def isolate_medical(df):
    """Return the rows of ``df`` whose 'SADP_Only?' value is exactly "No".

    The input frame is not modified; the original row index labels are kept.
    """
    keep = df['SADP_Only?'] == "No"
    return df[keep]

def isolate_QHP(df):
    """Return the rows of ``df`` whose "QHP/SADP" value is exactly "QHP".

    The input frame is not modified; the original row index labels are kept.
    """
    keep = df["QHP/SADP"] == "QHP"
    return df[keep]

NameError: name 'b' is not defined

In [None]:
# Calculate issuer denial rate:
import numpy as np
import matplotlib.pyplot as plt
def denial_rate(row):
    """Return the issuer denial rate (claims denied / claims received) for one row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name, e.g. the Series that
        ``DataFrame.apply(..., axis=1)`` passes in. It must provide
        "Issuer_Claims_Received" and "Issuer_Claims_Denials".

    Returns
    -------
    float or int
        ``denied / received`` when both cells can be read as finite numbers and
        ``received`` is not zero; otherwise the sentinel ``-1``, which callers
        filter out.
    """
    def _as_number(value):
        # Suppressed markers such as "*" / "**", blanks and None are not counts:
        # float() rejects them instead of letting the error escape to the caller.
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # NaN is the only value that differs from itself; infinities are
        # rejected too so the ratio below is always a real rate.
        if number != number or number in (float("inf"), float("-inf")):
            return None
        return number

    recvd = _as_number(row["Issuer_Claims_Received"])
    denied = _as_number(row["Issuer_Claims_Denials"])
    # No rate can be formed from unreadable cells or from zero claims received.
    if recvd is None or denied is None or recvd == 0:
        return -1
    return denied / recvd

# Calculate denial rates where possible (denial_rate returns -1 when it cannot)
puf["denial_rate"] = puf.apply(denial_rate, axis=1)
denial_rates = puf[puf["denial_rate"] != -1]


# Plot hist of denial rate:
fig, ax = plt.subplots()
ax.set_title("Denial rate distribution across PUF marketplace plans.")
# Label both axes so the figure can be read without the surrounding code.
ax.set_xlabel("Issuer denial rate (claims denied / claims received)")
ax.set_ylabel("Number of PUF rows")
# The bin edges span 0 to 0.5 only; rates above 0.5 fall outside every bin and
# therefore do not appear in the histogram.
ax.hist(denial_rates["denial_rate"], alpha=.5, bins=np.linspace(0, .5, 25))
plt.show()

In [None]:
# Number of PUF rows for each distinct value of the "State" column.
puf["State"].value_counts()

In [None]:
def split_puf_sheet(puf_path, sheet_name):
    """Split off the title, legend and actual data from CMS puf.

    The sheet is read with its first row as the header row, so the layout this
    function relies on is: the title is the first header cell, the legend is the
    first cell of the next row, the row after that holds the real column names,
    and every later row is a data record.

    Parameters
    ----------
    puf_path : path to the xlsx workbook (read with the openpyxl engine).
    sheet_name : sheet to read, passed straight to ``pd.read_excel``.

    Returns
    -------
    (title, legend, data) : ``data`` is the frame of records with the real
        column names applied; its row index labels are those of the raw sheet.
    """
    df = pd.read_excel(puf_path, engine="openpyxl", sheet_name=sheet_name)

    title = df.columns[0]
    # Purely positional lookup of the first cell. The chained form
    # `df.iloc[0][0]` relied on Series `[]` treating an integer key as a
    # position, a fallback pandas has deprecated.
    legend = df.iloc[0, 0]

    # reassign header: the row at position 1 names the columns, and the records
    # start at position 2
    headers = df.iloc[1]
    data = df.iloc[2:]
    data.columns = headers

    return title, legend, data

In [None]:
# NOTE(review): absolute, machine-specific path; it is the same workbook path the
# earlier load cell uses, here read by sheet name instead of by position.
puf_path_2023 = '/home/mike/data/practice_data/Transparency_in_Coverage_PUF.xlsx'
sheet_name = "Transparency 2023 - Ind QHP"
title, legend, df = split_puf_sheet(puf_path_2023, sheet_name)

# This is delivered as xlsx, with a title row and a legend row, which makes parsing it as a dataframe
# unnecessarily difficult. Why CMS, just why?

print(f"{title}\n\n{legend}")

In [None]:
# General utilities
def isolate_complete_issuers(df):
    """Placeholder: not implemented yet; ``df`` is ignored and None is returned."""

def isolate_complete_plans(df):
    """Placeholder: not implemented yet; ``df`` is ignored and None is returned."""

def isolate_medical(df):
    """Return the rows of ``df`` whose 'SADP_Only?' value is exactly "No".

    The input frame is not modified; the original row index labels are kept.
    """
    # NOTE(review): a function with this name is also defined in an earlier
    # cell of this notebook; on a top-to-bottom run this later one wins.
    keep = df['SADP_Only?'] == "No"
    return df[keep]

def isolate_QHP(df):
    """Return the rows of ``df`` whose "QHP/SADP" value is exactly "QHP".

    The input frame is not modified; the original row index labels are kept.
    """
    # NOTE(review): a function with this name is also defined in an earlier
    # cell of this notebook; on a top-to-bottom run this later one wins.
    keep = df["QHP/SADP"] == "QHP"
    return df[keep]

def _clean_schema(year):
    """Return the column layout ``clean`` uses for one PUF plan year.

    The result is a dict with these keys:
      * ``selected``: columns kept in the cleaned frame, in output order.
      * ``numeric``: columns converted with ``pd.to_numeric``.
      * ``not_null``: columns that must be numeric and are cast to ``int``.
      * ``received`` / ``denials``: names of the claims received / denied columns.
      * ``apply_min_claims``: whether the minimum-claims filter is applied.

    Raises ``ValueError`` when no layout is defined for ``year``.
    """
    if year == 2023:
        numeric = ['Issuer_Claims_Received', 'Issuer_Claims_Denials',
                   'Issuer_Internal_Appeals_Filed',
                   'Issuer_Number_Internal_Appeals_Overturned',
                   'Issuer_Percent_Internal_Appeals_Overturned',
                   'Issuer_External_Appeals_Filed',
                   'Issuer_Number_External_Appeals_Overturned',
                   'Issuer_Percent_External_Appeals_Overturned',
                   'Plan_Number_Claims_Received', 'Plan_Number_Claims_Denied',
                   'Plan_Number_Claims_Denied_Referral_Required',
                   'Plan_Number_Claims_Denied_Out_of_Network',
                   'Plan_Number_Claims_Denied_Services_Excluded',
                   'Plan_Number_Claims_Denied_Not_Medically_Necessary_Excl_Behavioral_Health',
                   'Plan_Number_Claims_Denied_Not_Medically_Necessary_Behavioral_Health_Only',
                   'Plan_Number_Claims_Denied_Other', 'Average Monthly Enrollment',
                   'Average Monthly Disenrollment']
        other = ['State', 'Issuer_Name', 'Issuer_ID', 'Is_Issuer_New_to_Exchange? (Yes_or_No)',
                 'SADP_Only?', 'Plan_ID', 'QHP/SADP', 'Plan_Type', 'Metal_Level']
        return {
            "selected": numeric + other,
            "numeric": numeric,
            "not_null": [],
            "received": "Issuer_Claims_Received",
            "denials": "Issuer_Claims_Denials",
            # NOTE(review): the original code applied the > 1000 claims filter
            # to 2018-2021 only; confirm whether 2023 should be filtered too.
            "apply_min_claims": False,
        }

    if year in (2020, 2021):
        # 2020 and 2021 share one layout.
        counts = ['Issuer_Claims_Received', 'Issuer_Claims_Denials',
                  'Issuer_Internal_Appeals_Filed', 'Issuer_Number_Internal_Appeals_Overturned',
                  'Issuer_Percent_Internal_Appeals_Overturned', 'Issuer_External_Appeals_Filed',
                  'Issuer_Number_External_Appeals_Overturned', 'Issuer_Percent_External_Appeals_Overturned',
                  'Plan_Number_Claims_Received', 'Plan_Number_Claims_Denied',
                  'Plan_Number_Claims_Denied_Referral_Required',
                  'Plan_Number_Claims_Denied_Out_of_Network',
                  'Plan_Number_Claims_Denied_Services_Excluded',
                  'Plan_Number_Claims_Denied_Not_Medically_Necessary_Excl_Behavioral_Health',
                  #'Plan_Number_Claims_Denied_Not_Medically_Necessary_Incl_Behavioral_Health',
                  'Plan_Number_Claims_Denied_Other', 'Enrollment_Data', 'Disenrollment_Data']
        return {
            "selected": ['Issuer_ID', 'Issuer_Name'] + counts + ["State", "Plan_ID"],
            # Issuer_Name is text: coercing it to numeric would turn every name into NaN.
            "numeric": ['Issuer_ID'] + counts,
            "not_null": ["Issuer_Claims_Received", "Issuer_Claims_Denials"],
            "received": "Issuer_Claims_Received",
            "denials": "Issuer_Claims_Denials",
            "apply_min_claims": True,
        }

    if year in (2018, 2019):
        # 2018 and 2019 share one layout (issuer-level columns only).
        counts = ['Claims_Received', 'Claims_Denials',
                  'Internal_Appeals Filed', 'Number_Internal_Appeals_Overturned',
                  'Percent_Internal_Appeals_Overturned', 'External Appeals Filed',
                  'Number_External_Appeals_Overturned', 'Percent_External_Appeals_Overturned',
                  'Enrollment_Data', 'Disenrollment_Data']
        return {
            "selected": ['Issuer_ID', 'Issuer_Name'] + counts + ["State"],
            # Issuer_Name is text: coercing it to numeric would turn every name into NaN.
            "numeric": ['Issuer_ID'] + counts,
            "not_null": ["Claims_Received", "Claims_Denials"],
            "received": "Claims_Received",
            "denials": "Claims_Denials",
            "apply_min_claims": True,
        }

    raise ValueError(f"clean() has no column layout for plan year {year!r}")


def clean(data, year):
    """Clean and return numerical data for one PUF plan year.

    Steps, in order:
      1. keep the columns known for ``year`` and drop rows with any missing value;
      2. drop rows whose claims received / denied cells are the markers "*" or "**";
      3. convert the numeric columns with ``pd.to_numeric(errors="coerce")``;
      4. for the required columns, drop rows that failed conversion and cast to int;
      5. keep only rows where claims received is strictly greater than claims denied;
      6. for 2018-2021, keep only rows with more than 1000 claims received.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sheet data for the given year. Side effect: a "denial_stats_year"
        column is written to this frame (it is not part of the returned frame).
    year : int
        Plan year; 2018, 2019, 2020, 2021 and 2023 are supported.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, keeping the row index labels of ``data``.

    Raises
    ------
    ValueError
        If ``year`` has no known column layout.
    KeyError
        If ``data`` lacks a column the layout for ``year`` expects.
    """
    # Resolve the layout first so an unsupported year fails with a clear error
    # before the caller's frame is touched.
    schema = _clean_schema(year)
    received = schema["received"]
    denials = schema["denials"]
    suppressed_markers = ["*", "**"]
    min_claims_received = 1000

    # Add plan year as convenience:
    data["denial_stats_year"] = year

    # Get non NA values for now
    # NOTE(review): for 2023 this drops a row when ANY selected column is
    # missing, the same rule the older years use — confirm that is intended.
    nm_data = data[schema["selected"]].dropna()

    # Drop rows whose claim counts are the "*" / "**" markers; .copy() gives the
    # column assignments below an independent frame to write to.
    nm_data = nm_data[~nm_data[received].isin(suppressed_markers)]
    nm_data = nm_data[~nm_data[denials].isin(suppressed_markers)].copy()

    # Convert to numbers BEFORE comparing: comparing the raw cells would order
    # strings lexicographically (e.g. "9000" > "10000").
    for column in schema["numeric"]:
        nm_data[column] = pd.to_numeric(nm_data[column], errors='coerce')
    for column in schema["not_null"]:
        nm_data = nm_data[nm_data[column].notna()]
        nm_data[column] = nm_data[column].astype(int)

    # Keep only rows with more claims received than denied; anything else is
    # treated as a reporting error. The comparison is strict, so rows where the
    # two counts are equal are dropped as well.
    nm_data = nm_data[nm_data[received] > nm_data[denials]]

    if schema["apply_min_claims"]:
        nm_data = nm_data[nm_data[received] > min_claims_received]
    return nm_data


def get_issuer_level_dfs(dfs):
    """Collapse each frame in ``dfs`` to one row per "Issuer_ID".

    Every frame is grouped by "Issuer_ID" and reduced with ``.first()``; the
    reduced frames are returned as a list in the same order as the input.
    """
    return [frame.groupby("Issuer_ID").first() for frame in dfs]

def get_plan_level_dfs(dfs):
    """Collapse each frame in ``dfs`` to one row per "Plan_ID".

    Every frame is grouped by "Plan_ID" and reduced with ``.first()``; the
    reduced frames are returned as a list in the same order as the input.
    """
    return [frame.groupby("Plan_ID").first() for frame in dfs]

In [None]:
# Count the missing values in "Issuer_Claims_Received"; `df` is the frame
# returned by split_puf_sheet in an earlier cell.
df["Issuer_Claims_Received"].isna().sum()

In [None]:
# Motivating Questions Across Data Sources


# Plot denial rates by state, by year


# Plot denial rates by insurer, by year (how is it progressing?)


# Plot denial type by insurer, by state? What dominates for each insurer, where to focus?

# KFF looks only at insurers from marketplace with "complete" data.

# How often do consumers get denied?

# How often do they appeal?

# How often do they win appeals (internal)?

# How often do they external appeal?

# How often do they win appeals (external)?

# How often do they win, period?


In [None]:
## TODO From this:

1. Create a Deck (Persius MO).
2. Write a detailed report (like KHN) + arXiv paper.
3. Write a blog post.
4. Publish CT data, other data summaries?