In [167]:
import glob
import json
import os
import sys

from datetime import datetime
from functools import partial

import numpy as np
import pandas as pd
import pypdf
import matplotlib.pyplot as plt
import tabula
import zipfile

from matplotlib.dates import DateFormatter


# Show every column when a DataFrame is displayed (None lifts the column cap).
pd.set_option("display.max_columns", None)
# Put the directory two levels up on the import path; the `src.*` imports
# below depend on it (presumably that is the repository root - confirm).
sys.path.append("../..")

from src.utils.download import download_file_from_url
from src.utils.plot import plot_bar, plot_hist, plot_pie
from src.utils.states import abbrev_to_state

# Working directories used by this notebook (relative paths).
# Input directory; created unconditionally by the setup cell.
INPUT_DATA_DIR = "./input_data"
# Image output directory; created by the setup cell only when truthy.
OUTPUT_IMAGE_DIR = "./images/pa_claims"
# Data output directory; created by the setup cell only when truthy.
OUTPUT_DATA_DIR = "./output_data/pa_claims"  # Set to None if you don't want to save non-required data for external use

In [168]:
# The input directory is always required.
os.makedirs(INPUT_DATA_DIR, exist_ok=True)
# Output directories are optional: a falsy setting means "do not create".
for optional_dir in (OUTPUT_IMAGE_DIR, OUTPUT_DATA_DIR):
    if optional_dir:
        os.makedirs(optional_dir, exist_ok=True)

In [169]:
def get_issuer_data(pdf_path):
    """Read the issuer-level summary from the first two pages of a PDF.

    Page 1 supplies the reporting year, taken from characters [-6:-2] of the
    first extracted cell. Page 2 supplies one column of values that is read
    positionally: two Yes/No flags, the HIOS issuer id, then six counts.

    Args:
        pdf_path: Path of the PDF to parse.

    Returns:
        A 10-tuple ``(hios_issuer_id, plan_year, plan_year_reporting,
        sadp_only, claims_received, claims_denied, internal_appeals,
        internal_appeal_overturns, external_appeals,
        external_appeal_overturns)``. ``plan_year`` is the reporting year
        plus two. Each count is an int, or None when its cell is missing.
    """

    def _parse_count(cell):
        # Counts arrive as comma-grouped strings; a missing cell maps to None.
        return None if pd.isna(cell) else int(cell.replace(",", ""))

    # Page 1: the reporting year is embedded in the text of the first cell.
    first_page = tabula.read_pdf(pdf_path, pages=1, area=(171, 51, 562, 378))[0]
    plan_year_reporting = int(first_page.iloc[0, 0][-6:-2])
    plan_year = plan_year_reporting + 2

    # Page 2: a headerless table whose first column holds the values we need.
    second_page = tabula.read_pdf(
        pdf_path,
        pages=2,
        area=(171, 51, 600, 378),
        multiple_tables=False,
        stream=True,
        pandas_options={"header": None},
    )[0]
    cells = second_page.iloc[:, 0].values

    # Metadata flags and identifier (the first flag is parsed but not returned).
    on_exchange_in_plan_year_reporting = bool(cells[0] == "Yes")
    sadp_only = bool(cells[1] == "Yes")
    hios_issuer_id = int(cells[2])

    # Claim and appeal counts occupy positions 3 through 8, in this order.
    (
        claims_received,
        claims_denied,
        internal_appeals,
        internal_appeal_overturns,
        external_appeals,
        external_appeal_overturns,
    ) = [_parse_count(cells[position]) for position in range(3, 9)]

    return (
        hios_issuer_id,
        plan_year,
        plan_year_reporting,
        sadp_only,
        claims_received,
        claims_denied,
        internal_appeals,
        internal_appeal_overturns,
        external_appeals,
        external_appeal_overturns,
    )

In [172]:
def convert_comma_string_to_int(val):
    """Convert a table cell holding a count into an int, or None if missing.

    Accepts comma-grouped strings ("1,234"), integers, and floats whose value
    is integral (12.0).

    Args:
        val: The raw cell value.

    Returns:
        The parsed int, or None for the "N/A" placeholder and for missing
        values (None, NaN, pd.NA).

    Raises:
        ValueError: If ``val`` is a non-integral float, or a string that is
            not a valid integer once commas are removed.
    """
    if isinstance(val, str):
        text = val.strip()
    else:
        # Missing cells would otherwise stringify to "nan"/"None" and crash.
        if pd.isna(val):
            return None
        if isinstance(val, (float, np.floating)):
            # Convert numerically rather than by editing the string form:
            # stripping ".0" from the text turns 10.05 into "105".
            as_float = float(val)
            if not as_float.is_integer():
                raise ValueError(f"expected a whole number, got {val!r}")
            return int(as_float)
        text = str(val)

    # "N/A" is the placeholder used for cells without a value.
    if text == "N/A":
        return None
    return int(text.replace(",", ""))


def get_plan_data(pdf_path, plan_year, issuer_id):
    """Extract the plan-level claims table from page 3 onward of a PDF.

    The tables tabula returns are treated as one wide table that was split
    into 3 horizontal slices, each slice further split into the same number
    of vertical pieces. The pieces of each slice are stacked, the slices are
    joined side by side, and the count columns are cast to nullable ints.

    Args:
        pdf_path: Path of the PDF to parse.
        plan_year: Stored in the ``current_plan_year`` column;
            ``data_plan_year`` is set to ``plan_year - 2``.
        issuer_id: Stored in the ``hios_issuer_id`` column.

    Returns:
        A DataFrame with columns ``plan_id``, ``claims_received``,
        ``claims_denied``, the six ``claims_denied_*`` breakdown columns,
        ``notes``, ``current_plan_year``, ``data_plan_year`` and
        ``hios_issuer_id``. An empty DataFrame is returned when the PDF has
        fewer than 3 pages or no tables are found on the plan pages.

    Raises:
        AssertionError: If the number of extracted tables is not a multiple
            of 3.
        ValueError: If the assembled table does not have the expected number
            of columns, or a count cell cannot be parsed.
    """
    # Only the page count is needed; close the file handle right away instead
    # of leaving it open for the life of the kernel.
    with open(pdf_path, mode="rb") as pdf_file:
        num_pages = len(pypdf.PdfReader(pdf_file).pages)

    if num_pages < 3:
        return pd.DataFrame()

    # Hardcode starting page in pdfs for plan-level data
    start_page = 3

    plans = tabula.read_pdf(
        pdf_path,
        pages=f"{start_page}-{num_pages}",
        guess=False,
        lattice=True,
        multiple_tables=True,
        pandas_options={"header": None},
    )

    # No tables on the plan pages: treat it like a PDF without plan data
    # rather than failing below on a zero step in range().
    if not plans:
        return pd.DataFrame()

    plan_dfs = []

    # We will iterate over vertically split pieces within horizontally split pieces
    # Table is split horizontally into 3 pieces.
    # Number of vertical pieces for each horizontal piece varies by pdf, depending on total num records in table
    horizontal_pieces = 3
    assert (
        len(plans) % horizontal_pieces == 0
    ), f"expected a multiple of {horizontal_pieces} tables, got {len(plans)}"
    num_vertical_pieces = len(plans) // horizontal_pieces

    for pg_num in range(0, len(plans), num_vertical_pieces):
        vertical_pieces = []
        for voffset in range(0, num_vertical_pieces):
            p = plans[pg_num + voffset]
            if voffset == 0:
                # The very first table carries one extra leading row that is
                # discarded before the header row is read.
                if pg_num == 0:
                    p = p.drop(labels=0, axis=0).reset_index(drop=True)
                # The first piece of every slice starts with its header row:
                # promote it to column labels, then remove it from the data.
                p.columns = p.iloc[0]
                p = p.drop(axis=0, labels=0)
            vertical_pieces.append(p)
        # Stack the pieces of this slice and label them with the header
        # taken from the slice's first piece.
        vertical_piece = pd.DataFrame(np.vstack(vertical_pieces))
        vertical_piece.columns = vertical_pieces[0].columns
        plan_dfs.append(vertical_piece)

    # Drop complete NA dfs that appear
    plan_dfs = [df for df in plan_dfs if not df.isnull().values.all()]

    # Join the slices side by side and drop complete NA rows
    df = pd.concat(plan_dfs, axis=1).dropna(how="all")

    # Add plan year column
    df["current_plan_year"] = plan_year
    df["data_plan_year"] = plan_year - 2

    # Add issuer id column
    df["hios_issuer_id"] = issuer_id

    # Rename columns (positional; raises if the column count differs)
    df.columns = [
        "plan_id",
        "claims_received",
        "claims_denied",
        "claims_denied_prior_auth_referral",
        "claims_denied_oon_provider",
        "claims_denied_exclusion_of_service",
        "claims_denied_nmn_excl_behavioral",
        "claims_denied_nmn_behavioral",
        "claims_denied_other",
        "notes",
        "current_plan_year",
        "data_plan_year",
        "hios_issuer_id",
    ]

    # Cast int cols to nullable Int64 so missing counts survive as <NA>
    int_cols = [
        "claims_received",
        "claims_denied",
        "claims_denied_prior_auth_referral",
        "claims_denied_oon_provider",
        "claims_denied_exclusion_of_service",
        "claims_denied_nmn_excl_behavioral",
        "claims_denied_nmn_behavioral",
        "claims_denied_other",
    ]
    for int_col in int_cols:
        df[int_col] = (
            df[int_col].apply(convert_comma_string_to_int).astype(dtype="Int64")
        )

    return df

In [173]:
# Directory holding the issuer PDFs. The default is the original local path;
# set the PA_CLAIMS_PDF_DIR environment variable to read from elsewhere.
data_dir = os.environ.get(
    "PA_CLAIMS_PDF_DIR",
    "/home/mike/persius/Public Records Request/PA/rtkl01609",
)
# Sorted so the row order of the resulting frames is the same on every run
# (glob returns entries in arbitrary, filesystem-dependent order).
pdf_paths = sorted(glob.glob(f"{data_dir}/**"))


# Issuer level, plan year claims denial data format:
# one entry per successfully parsed PDF, kept in step across all lists.
insurer_names = []
issuer_hios_ids = []
plan_years_reporting = []
plan_years = []
sadp_only_statuses = []
claims_received = []  # received and DOS in plan_year
claims_denied = []
internal_appeals = []
internal_appeal_overturns = []
external_appeals = []
external_appeal_overturns = []
plan_dfs = []

for pdf_path in pdf_paths:
    try:
        # Insurer name: the file name's text before the first "-", minus its
        # last space-separated token.
        insurer_name = " ".join(
            os.path.basename(pdf_path).split("-")[0].split(" ")[:-1]
        )

        # Get insurer-level data
        (
            hios_issuer_id,
            plan_year,
            plan_year_reporting,
            sadp_only,
            issuer_claims_received,
            issuer_claims_denied,
            issuer_internal_appeals,
            issuer_internal_appeal_overturns,
            issuer_external_appeals,
            issuer_external_appeal_overturns,
        ) = get_issuer_data(pdf_path)

        # Append insurer-level data
        insurer_names.append(insurer_name)
        issuer_hios_ids.append(hios_issuer_id)
        plan_years_reporting.append(plan_year_reporting)
        plan_years.append(plan_year)
        sadp_only_statuses.append(sadp_only)
        claims_received.append(issuer_claims_received)
        claims_denied.append(issuer_claims_denied)
        internal_appeals.append(issuer_internal_appeals)
        internal_appeal_overturns.append(issuer_internal_appeal_overturns)
        external_appeals.append(issuer_external_appeals)
        external_appeal_overturns.append(issuer_external_appeal_overturns)

        # Get plan-level data
        plan_data = get_plan_data(
            pdf_path, plan_year=plan_year, issuer_id=hios_issuer_id
        )
        if len(plan_data) > 0:
            plan_dfs.append(plan_data)

    except Exception as e:
        # Best effort: report the file that failed and carry on with the rest.
        print(pdf_path)
        print(e)

issuer_df = pd.DataFrame(
    data={
        "insurer_name": pd.Series(insurer_names, dtype=str),
        "hios_id": pd.Series(issuer_hios_ids, dtype=int),
        "data_plan_year": pd.Series(plan_years_reporting, dtype=int),
        "current_plan_year": pd.Series(plan_years, dtype=int),
        "sadp_only": sadp_only_statuses,
        # Nullable Int64 keeps missing counts as <NA> instead of forcing floats.
        "claims_received": pd.Series(claims_received, dtype="Int64"),
        "claims_denied": pd.Series(claims_denied, dtype="Int64"),
        "internal_appeals": pd.Series(internal_appeals, dtype="Int64"),
        "internal_appeal_overturns": pd.Series(
            internal_appeal_overturns, dtype="Int64"
        ),
        "external_appeals": pd.Series(external_appeals, dtype="Int64"),
        "external_appeal_overturns": pd.Series(
            external_appeal_overturns, dtype="Int64"
        ),
    }
)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# PDF yielded plan-level rows.
plan_df = pd.concat(plan_dfs, axis=0) if plan_dfs else pd.DataFrame()

In [174]:
# Write the issuer-level table; skipped when OUTPUT_DATA_DIR is None/empty,
# since os.path.join would otherwise raise a TypeError on None.
if OUTPUT_DATA_DIR:
    issuer_df.to_csv(os.path.join(OUTPUT_DATA_DIR, "issuers.csv"), index=False)

In [175]:
# Write the plan-level table; skipped when OUTPUT_DATA_DIR is None/empty,
# since os.path.join would otherwise raise a TypeError on None.
if OUTPUT_DATA_DIR:
    plan_df.to_csv(os.path.join(OUTPUT_DATA_DIR, "plans.csv"), index=False)