# Medicaid spending on prescription drugs

In [1]:
import pandas as pd
import numpy as np

In [2]:
import us

In [3]:
# Full state names, taken from the values of the abbreviation -> name mapping
abbr_to_name = us.states.mapping('abbr', 'name')
list_of_states = list(abbr_to_name.values())

In [4]:
# Load every Medicaid expenditure workbook used below.
#
# For 2012-2015 each workbook is read with sheet_name=None, which returns a
# dict of {sheet name: DataFrame}. It is stored together with its year as a
# (year, sheets) tuple so the year can be added as a column later.
# Storing the year is unnecessary for the 2016 data.
#
# `sheet_name` is used instead of `sheetname`: the old spelling has been
# deprecated since pandas 0.21 and is rejected by current releases.

expenditures_16 = pd.read_excel(
    "data/medicaid_spending/FY_2016_Financial_Management_Data.xlsx",
    usecols=[0, 2, 4, 10],
)
expenditures_15 = (
    2015,
    pd.read_excel("data/medicaid_spending/FY 2015 NET EXPENDITURES.xlsx", header=6, sheet_name=None, usecols=[0, 1]),
)
expenditures_14 = (
    2014,
    pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY14.xlsx", header=6, sheet_name=None, usecols=[0, 1]),
)
expenditures_13 = (
    2013,
    pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY13.xlsx", header=6, sheet_name=None, usecols=[0, 1]),
)
expenditures_12 = (
    2012,
    pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY12.xlsx", header=6, sheet_name=None, usecols=[0, 1]),
)

# One workbook holds 2006-2011, one sheet per year; the result is a dict keyed
# by the sheet names listed here (strings). No header row is parsed because
# each sheet contains many stacked tables that are split apart later.
expenditures_06_11 = pd.read_excel(
    "data/medicaid_spending/NetExpenditure02through11.xlsx",
    header=None,
    skiprows=4,
    usecols=[0, 1],
    sheet_name=["2006", "2007", "2008", "2009", "2010", "2011"],
)

In [5]:
# Make an array of the datasets we want to join.
# Each entry is a (year, {sheet name: DataFrame}) tuple.
expenditures_12_15 = [expenditures_15, expenditures_14, expenditures_13, expenditures_12]

# Empty array to hold the final dataframes (one per year)
extracted_sheets = []
for year, data in expenditures_12_15:

    # Keep only the sheets whose name starts with "MAP".
    wanted_sheets = [(sheet_name, sheet) for sheet_name, sheet in data.items() if sheet_name.startswith('MAP')]

    # If we don't find any sheets that start with MAP, then use all available sheets
    # This is to handle 2012 dataset
    if not wanted_sheets:
        wanted_sheets = list(data.items())

    # Derive the state name from each sheet name: keep the text after the last
    # '-' (which removes a "MAP" prefix if present) and strip the whitespace
    # that may surround the dash, so the name is a clean label.
    sheet_names = [sheet_name.split('-')[-1].strip() for sheet_name, sheet in wanted_sheets]

    # The sheets themselves, in the same order as sheet_names
    sheets = [sheet for sheet_name, sheet in wanted_sheets]

    # Combine all the sheets; keys= labels each sheet's rows with its state
    # name as the outer level of the resulting index.
    all_states = pd.concat(sheets, keys=sheet_names)

    # keys= only creates an index level, not a column. Copy it into a real
    # 'State' column so these years carry the state the same way the
    # 2006-2011 frames do.
    all_states['State'] = all_states.index.get_level_values(0)

    # Add a YEAR column to signify the year for the sheets being added
    all_states['Year'] = year

    # Add them to an array so they can be concatenated later.
    extracted_sheets.append(all_states)

In [6]:
# Empty array to hold the final dataframes for 2006 - 2011
all_06_11_data = []

# Values in the first column that mark the first row of a table: one per
# state, plus the names of the summary tables. Built once, outside the loop,
# because it does not change from sheet to sheet.
boundary_labels = list_of_states + ['All States', 'National Totals']

# Iterate through the file, the sheet name is the year.
for year, data in expenditures_06_11.items():

    # Identify rows that delineate the tables (each containing a state) or contain one of the summary table names
    boundary_rows = data[0].isin(boundary_labels)
    boundary_indices = data[boundary_rows].index

    # Placeholder to hold the per-state tables from the current iteration
    states = []

    # Each boundary indicates the start of a table, each table contains data for a state
    for i, item in enumerate(boundary_indices):
        start = item
        # A table ends where the next one starts; the last table runs to the
        # end of the sheet (end=None).
        end = boundary_indices[i + 1] if i + 1 < len(boundary_indices) else None

        # Slice the current table (state) out of the main dataset.
        # NOTE: index labels are used as positions here, which assumes the
        # sheet has its default 0..n-1 row index.
        current_dataset = data.iloc[start:end]

        # Replace whitespace-only cells with a null value (NA)
        current_dataset = current_dataset.replace(r'^\s+$', np.nan, regex=True)

        # Remove rows where all columns are null
        current_dataset = current_dataset.dropna(how='all')

        # Get the name of the state from the first column of the first row
        state_name = current_dataset.iat[0, 0]

        # If the state name is a valid state then we keep the table (ignore summary statistics)
        if state_name in list_of_states:
            # Work on an explicit copy so adding a column never writes into
            # (or warns about) a slice of the original sheet.
            current_dataset = current_dataset.copy()
            current_dataset['State'] = state_name
            current_dataset.columns = ['Service Category', 'Total Computable', 'State']

            # Drop the title row (the state name) and the repeated header row.
            # `~` is the boolean NOT for a mask.
            is_header_row = current_dataset['Service Category'].isin([state_name, 'Service Category'])
            current_dataset = current_dataset[~is_header_row]
            states.append(current_dataset)

    if not states:
        # pd.concat would raise ValueError on an empty list anyway; say which sheet is at fault.
        raise ValueError("No state tables found in sheet {!r}".format(year))

    all_states_current_year = pd.concat(states)

    # Sheet names are strings; store the year as an integer so the Year column
    # has the same type as the integer years used for 2012-2015.
    all_states_current_year['Year'] = int(year)
    all_06_11_data.append(all_states_current_year)

In [13]:
# Stack the per-year frames (2006-2011 first, then 2012-2015) into one big dataframe
all_years = all_06_11_data + extracted_sheets
medicaid_rebates = pd.concat(all_years)

In [14]:
# Service categories to keep: the drug rebate and ACA offset lines
interested_categories = (
    "Drug Rebate Offset - National",
    "Drug Rebate Offset - State Sidebar Agreement",
    "MCO - National Agreement",
    "MCO - State Sidebar Agreement",
    "Increased ACA OFFSET - Fee for Service",
    "Increased ACA OFFSET - MCO",
)

In [15]:
# Keep only the rows whose service category is one of the categories of interest
is_drug_rebate = medicaid_rebates['Service Category'].isin(interested_categories)
medicaid_drug_rebates = medicaid_rebates[is_drug_rebate]

In [16]:
# Write the filtered rows to an Excel workbook
output_path = "data/medicaid_drug_rebates.xlsx"
medicaid_drug_rebates.to_excel(output_path)