# Medicaid spending on prescription drugs

In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.float_format", lambda x: "%.2f" % x) # Suppress scientific notation
# Enable browser notifications
%load_ext jupyternotify

<IPython.core.display.Javascript object>

## Import [data](https://www.nasbo.org/mainsite/reports-data/state-expenditure-report) on state expenditures from NASBO

In [2]:
budgets = pd.read_excel("data/medicaid_spending/state_expenditures.xlsx", usecols=["YEAR", "STATE", "MCAID_GF", "MCAID_FF", "MCAID_OF", "MCAID_BF", "MCAID_TOT", "GFTOT_CAPI", "FFTOT_CAPI", "OFTOT_CAPI", "BFTOT_CAPI", "TOTAL_CAPI"], names=["year", "state", "medicaid_general", "medicaid_federal", "medicaid_other", "medicaid_bonds", "medicaid_total", "all_spending_general", "all_spending_federal", "all_spending_other", "all_spending_bonds", "all_spending_total"], sheetname="State Exp Report Data")
budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376 entries, 0 to 1375
Data columns (total 12 columns):
year                    1376 non-null int64
state                   1376 non-null object
medicaid_general        1355 non-null float64
medicaid_federal        1354 non-null float64
medicaid_other          1355 non-null float64
medicaid_bonds          1354 non-null float64
medicaid_total          1376 non-null float64
all_spending_general    1376 non-null float64
all_spending_federal    1376 non-null float64
all_spending_other      1376 non-null float64
all_spending_bonds      1376 non-null float64
all_spending_total      1376 non-null float64
dtypes: float64(10), int64(1), object(1)
memory usage: 129.1+ KB


Filter the data to years for 2008 and later.

In [3]:
budgets = budgets[budgets["year"] >= 2008].reset_index(drop=True)
budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 12 columns):
year                    510 non-null int64
state                   510 non-null object
medicaid_general        500 non-null float64
medicaid_federal        500 non-null float64
medicaid_other          500 non-null float64
medicaid_bonds          500 non-null float64
medicaid_total          510 non-null float64
all_spending_general    510 non-null float64
all_spending_federal    510 non-null float64
all_spending_other      510 non-null float64
all_spending_bonds      510 non-null float64
all_spending_total      510 non-null float64
dtypes: float64(10), int64(1), object(1)
memory usage: 47.9+ KB


In [4]:
budgets.head(1)

Unnamed: 0,year,state,medicaid_general,medicaid_federal,medicaid_other,medicaid_bonds,medicaid_total,all_spending_general,all_spending_federal,all_spending_other,all_spending_bonds,all_spending_total
0,2008,Alabama,471.0,2899.0,1030.0,0.0,4400.0,8460.0,6291.0,4537.0,552.0,19840.0


Create a column that combines all state funding sources (general, other and bonds).

In [5]:
budgets["medicaid_state"] = budgets["medicaid_general"] + budgets["medicaid_other"] + budgets["medicaid_bonds"]
budgets["all_spending_state"] = budgets["all_spending_general"] + budgets["all_spending_other"] + budgets["all_spending_bonds"]
budgets.drop(["medicaid_general", "medicaid_other", "medicaid_bonds"], axis=1, inplace=True) # Drop component columns
budgets.drop(["all_spending_general", "all_spending_other", "all_spending_bonds"], axis=1, inplace=True) # Drop component columns
budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 8 columns):
year                    510 non-null int64
state                   510 non-null object
medicaid_federal        500 non-null float64
medicaid_total          510 non-null float64
all_spending_federal    510 non-null float64
all_spending_total      510 non-null float64
medicaid_state          500 non-null float64
all_spending_state      510 non-null float64
dtypes: float64(6), int64(1), object(1)
memory usage: 32.0+ KB


Spot check the post-calculation numbers.

In [6]:
budgets.head(1)

Unnamed: 0,year,state,medicaid_federal,medicaid_total,all_spending_federal,all_spending_total,medicaid_state,all_spending_state
0,2008,Alabama,2899.0,4400.0,6291.0,19840.0,1501.0,13549.0


## Import the CMS [Medicaid drug utilization data](https://www.medicaid.gov/medicaid/prescription-drugs/state-drug-utilization-data/index.html)

Query the Medicaid drug utilization API for each year and calculate the Medicaid amount paid.

In [7]:
query_16 = "https://data.medicaid.gov/resource/4kp3-zsqr.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_16 = pd.read_json(query_16)
query_15 = "https://data.medicaid.gov/resource/h5ii-2ar3.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_15 = pd.read_json(query_15)
query_14 = "https://data.medicaid.gov/resource/hz5c-g52b.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_14 = pd.read_json(query_14)
query_13 = "https://data.medicaid.gov/resource/kmyc-3n7k.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_13 = pd.read_json(query_13)
query_12 = "https://data.medicaid.gov/resource/mkpf-yey3.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_12 = pd.read_json(query_12)
query_11 = "https://data.medicaid.gov/resource/de79-47w8.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_11 = pd.read_json(query_11)
query_10 = "https://data.medicaid.gov/resource/dgte-3gu8.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_10 = pd.read_json(query_10)
query_09 = "https://data.medicaid.gov/resource/6tcm-8x84.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_09 = pd.read_json(query_09)
query_08 = "https://data.medicaid.gov/resource/bzcm-rrj6.json?$select=period_covered,sum(medicaid_amount_reimbursed)&$group=state_code,period_covered"
drug_spending_08 = pd.read_json(query_08)

Concatenate the dataframes into a single dataframe of spending in all years.

In [8]:
drug_spending = pd.concat([drug_spending_16, drug_spending_15, drug_spending_14, drug_spending_13, drug_spending_12, drug_spending_11, drug_spending_10, drug_spending_09, drug_spending_08]).reset_index(drop=True)
drug_spending.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
period_covered                    9 non-null int64
sum_medicaid_amount_reimbursed    9 non-null float64
dtypes: float64(1), int64(1)
memory usage: 224.0 bytes


In [9]:
drug_spending.rename(columns={"period_covered": "year", "sum_medicaid_amount_reimbursed": "drug_spending"}, inplace=True)
drug_spending

Unnamed: 0,year,drug_spending
0,2016,62016683226.54
1,2015,55620114965.09
2,2014,47078114991.53
3,2013,37940450109.63
4,2012,37976860399.61
5,2011,37783429301.78
6,2010,32989906941.56
7,2009,26014836021.22
8,2008,24642404374.76


## Import the [Medicaid drug rebates data](https://www.medicaid.gov/medicaid/finance/state-expenditure-reporting/expenditure-reports/index.html) from the CMS MBES/CBES reports.

In [10]:
drug_rebates_16 = pd.read_csv("data/medicaid_spending/FY_2016_Financial_Management_Data_-_National_Totals.csv", header=0, usecols=[0, 3, 4], names=["year", "service_category", "drug_rebates"])
drug_rebates_15 = pd.read_excel("data/medicaid_spending/FY 2015 NET EXPENDITURES.xlsx", sheetname="MAP - National Totals", header=6, usecols=[0, 1], names=["service_category", "drug_rebates"])
drug_rebates_14 = pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY14.xlsx", sheetname="MAP - National Totals", header=6, usecols=[0, 1], names=["service_category", "drug_rebates"])
drug_rebates_13 = pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY13.xlsx", sheetname="MAP - National Totals", header=6, usecols=[0, 1], names=["service_category", "drug_rebates"])
drug_rebates_12 = pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY12.xlsx", sheetname="National Totals", header=6, usecols=[0, 1], names=["service_category", "drug_rebates"])
# For the following imports, we will skip all the rows until the "National Totals" data begins
drug_rebates_11 = pd.read_excel("data/medicaid_spending/NetExpenditure02through11.xlsx", sheetname="2011", skiprows=10605, header=7, usecols=[0, 1], names=["service_category", "drug_rebates"]) 
drug_rebates_10 = pd.read_excel("data/medicaid_spending/NetExpenditure02through11.xlsx", sheetname="2010", skiprows=10031, header=7, usecols=[0, 1], names=["service_category", "drug_rebates"]) 
drug_rebates_09 = pd.read_excel("data/medicaid_spending/NetExpenditure02through11.xlsx", sheetname="2009", skiprows=7303, header=7, usecols=[0, 1], names=["service_category", "drug_rebates"]) 
drug_rebates_08 = pd.read_excel("data/medicaid_spending/NetExpenditure02through11.xlsx", sheetname="2008", skiprows=7175, header=7, usecols=[0, 1], names=["service_category", "drug_rebates"]) 

Add a year column to the 2008 through 2015 dataframes.

In [11]:
drug_rebates_15["year"] = 2015
drug_rebates_14["year"] = 2014
drug_rebates_13["year"] = 2013
drug_rebates_12["year"] = 2012
drug_rebates_11["year"] = 2011
drug_rebates_10["year"] = 2010
drug_rebates_09["year"] = 2009
drug_rebates_08["year"] = 2008

Convert the drug rebates column to numeric (int) data type in the 2016 data.

In [12]:
drug_rebates_16["drug_rebates"] = drug_rebates_16["drug_rebates"].str.replace(",", "")
drug_rebates_16["drug_rebates"] = pd.to_numeric(drug_rebates_16["drug_rebates"])
drug_rebates_16.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 3 columns):
year                277 non-null int64
service_category    277 non-null object
drug_rebates        277 non-null int64
dtypes: int64(2), object(1)
memory usage: 6.6+ KB


Concatenate the dataframes into a single dataframe of rebates in all years.

In [13]:
drug_rebates = pd.concat([drug_rebates_16, drug_rebates_15, drug_rebates_14, drug_rebates_13, drug_rebates_12, drug_rebates_11, drug_rebates_10, drug_rebates_09, drug_rebates_08]).reset_index(drop=True)
drug_rebates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1739 entries, 0 to 1738
Data columns (total 3 columns):
drug_rebates        1710 non-null float64
service_category    1724 non-null object
year                1739 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 40.8+ KB


Filter the data to just the six drug rebates.

In [14]:
drug_rebates = drug_rebates[(drug_rebates["service_category"].str.startswith("Drug Rebate Offset - ")) | (drug_rebates["service_category"].str.startswith("MCO - ")) | (drug_rebates["service_category"].str.startswith("Increased ACA OFFSET - "))]
drug_rebates

Unnamed: 0,drug_rebates,service_category,year
75,-12885689016.0,Drug Rebate Offset - National,2016
76,-864806347.0,Drug Rebate Offset - State Sidebar Agreement,2016
77,-15859240388.0,MCO - National Agreement,2016
78,-230435858.0,MCO - State Sidebar Agreement,2016
79,-576527028.0,Increased ACA OFFSET - Fee for Service,2016
80,-776169673.0,Increased ACA OFFSET - MCO,2016
295,-10547657885.0,Drug Rebate Offset - National,2015
296,-842686786.0,Drug Rebate Offset - State Sidebar Agreement,2015
297,-10990020714.0,MCO - National Agreement,2015
298,-207156105.0,MCO - State Sidebar Agreement,2015


Calculate the total drug rebates by year.

In [15]:
drug_rebates = drug_rebates.groupby("year")["drug_rebates"].sum().to_frame().reset_index()
drug_rebates

Unnamed: 0,year,drug_rebates
0,2008,-8393320183.0
1,2009,-9721492081.0
2,2010,-11505122505.0
3,2011,-16117488352.0
4,2012,-17549478009.0
5,2013,-18274357490.0
6,2014,-19878420141.0
7,2015,-24030295066.0
8,2016,-31192868310.0


## Calculate drug spending less rebates

Join the drug spending and drug rebates dataframes.

In [17]:
drugs = drug_spending.merge(drug_rebates, on="year")
# Add the columns because the rebates are expressed as negatives
drugs["spending_less_rebates"] = drugs["drug_spending"] + drugs["drug_rebates"]
drugs

Unnamed: 0,year,drug_spending,drug_rebates,spending_less_rebates
0,2016,62016683226.54,-31192868310.0,30823814916.54
1,2015,55620114965.09,-24030295066.0,31589819899.09
2,2014,47078114991.53,-19878420141.0,27199694850.53
3,2013,37940450109.63,-18274357490.0,19666092619.63
4,2012,37976860399.61,-17549478009.0,20427382390.61
5,2011,37783429301.78,-16117488352.0,21665940949.78
6,2010,32989906941.56,-11505122505.0,21484784436.56
7,2009,26014836021.22,-9721492081.0,16293343940.22
8,2008,24642404374.76,-8393320183.0,16249084191.76


## Export the data for further analysis and visualization

In [18]:
writer = pd.ExcelWriter("data/medicaid_spending.xlsx")
budgets.to_excel(writer, "budgets", startcol=0, index=False)
drugs.to_excel(writer, "drugs", startcol=0, index=False)
writer.save()

# SCRATCH PAD AND OLD STUFF

In [37]:
drug_rebates = pd.read_excel("data/medicaid_drug_rebates_working.xlsx", sheetname="working_sheet")
drug_rebates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2598 entries, 0 to 2597
Data columns (total 5 columns):
service_category    2598 non-null object
st_name             2598 non-null object
st_abbreviation     2598 non-null object
rebate_amount       2598 non-null int64
year                2598 non-null int64
dtypes: int64(2), object(3)
memory usage: 101.6+ KB


In [38]:
drug_rebates.rename(columns={"rebate_amount": "drug_rebates"}, inplace=True)
drug_rebates.head(1)

Unnamed: 0,service_category,st_name,st_abbreviation,drug_rebates,year
0,Drug Rebate Offset - National,Alaska,AK,-51646337,2016


What's the total of all rebates for all states, years and service categories?

In [39]:
drug_rebates["drug_rebates"].sum()

-175551698792

In [40]:
drug_rebates_grouped = drug_rebates.groupby(["st_name", "st_abbreviation", "year"])["drug_rebates"].sum()
drug_rebates_grouped = drug_rebates_grouped.reset_index()
drug_rebates_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555 entries, 0 to 554
Data columns (total 4 columns):
st_name            555 non-null object
st_abbreviation    555 non-null object
year               555 non-null int64
drug_rebates       555 non-null int64
dtypes: int64(2), object(2)
memory usage: 17.4+ KB


In [41]:
drug_rebates_grouped.head(1)

Unnamed: 0,st_name,st_abbreviation,year,drug_rebates
0,Alabama,AL,2006,-151202667


Join the drug spending and rebates datasets.

In [42]:
medicaid_drug_costs = drug_spending.merge(drug_rebates_grouped, left_on=["state_code", "period_covered"], right_on=["st_abbreviation", "year"], how="inner")
medicaid_drug_costs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551 entries, 0 to 550
Data columns (total 7 columns):
period_covered     551 non-null int64
state_code         551 non-null object
drug_spending      551 non-null float64
st_name            551 non-null object
st_abbreviation    551 non-null object
year               551 non-null int64
drug_rebates       551 non-null int64
dtypes: float64(1), int64(3), object(3)
memory usage: 34.4+ KB


Make sure those four missing records are Arizona's four missing years of spending data.

In [43]:
medicaid_drug_costs.groupby("state_code")["state_code"].size()

state_code
AK    11
AL    11
AR    11
AZ     7
CA    11
CO    11
CT    11
DC     5
DE    11
FL    11
GA    11
HI    11
IA    11
ID    11
IL    11
IN    11
KS    11
KY    11
LA    11
MA    11
MD    11
ME    11
MI    11
MN    11
MO    11
MS    11
MT    11
NC    11
ND    11
NE    11
NH    11
NJ    11
NM    11
NV    11
NY    11
OH    11
OK    11
OR    11
PA    11
RI    11
SC    11
SD    11
TN    11
TX    11
UT    11
VA    11
VT    11
WA    11
WI    11
WV    11
WY    11
Name: state_code, dtype: int64

Calculate the spending less the rebates.

In [44]:
# Use addition because the drug rebates are expressed as negatives
medicaid_drug_costs["spending_less_rebates"] = medicaid_drug_costs["drug_spending"] + medicaid_drug_costs["drug_rebates"]
medicaid_drug_costs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551 entries, 0 to 550
Data columns (total 8 columns):
period_covered           551 non-null int64
state_code               551 non-null object
drug_spending            551 non-null float64
st_name                  551 non-null object
st_abbreviation          551 non-null object
year                     551 non-null int64
drug_rebates             551 non-null int64
spending_less_rebates    551 non-null float64
dtypes: float64(2), int64(3), object(3)
memory usage: 38.7+ KB


In [45]:
medicaid_drug_costs.head(1)

Unnamed: 0,period_covered,state_code,drug_spending,st_name,st_abbreviation,year,drug_rebates,spending_less_rebates
0,2016,AK,108594348.62,Alaska,AK,2016,-53673099,54921249.62


Spot check the 2016 data against [MACPAC's graphic](https://www.macpac.gov/wp-content/uploads/2015/11/EXHIBIT-28.-Medicaid-Gross-Spending-and-Rebates-for-Drugs-by-Delivery-System-FY-2016-millions.pdf).

In [46]:
medicaid_drug_costs[medicaid_drug_costs["period_covered"] == 2016]

Unnamed: 0,period_covered,state_code,drug_spending,st_name,st_abbreviation,year,drug_rebates,spending_less_rebates
0,2016,AK,108594348.62,Alaska,AK,2016,-53673099,54921249.62
1,2016,AL,667346361.34,Alabama,AL,2016,-391980305,275366056.34
2,2016,AR,339785706.81,Arkansas,AR,2016,-204345993,135439713.81
3,2016,AZ,1246118343.59,Arizona,AZ,2016,-604294056,641824287.59
4,2016,CA,8004575486.94,California,CA,2016,-4277024489,3727550997.94
5,2016,CO,908800937.46,Colorado,CO,2016,-453282840,455518097.46
6,2016,CT,1245733707.42,Connecticut,CT,2016,-791964672,453769035.42
7,2016,DC,258174354.03,District of Columbia,DC,2016,-111668911,146505443.03
8,2016,DE,175553662.96,Delaware,DE,2016,-170345039,5208623.96
9,2016,FL,2820646171.46,Florida,FL,2016,-1626179490,1194466681.46


In [47]:
medicaid_drug_costs.groupby("period_covered")["drug_spending", "drug_rebates", "spending_less_rebates"].sum()

Unnamed: 0_level_0,drug_spending,drug_rebates,spending_less_rebates
period_covered,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2006,13155718056.23,-11555368529,1600349527.23
2007,16645542840.88,-7333488126,9312054714.88
2008,23574853474.39,-8393320183,15181533291.39
2009,24891244344.57,-9721492081,15169752263.57
2010,31501126307.06,-11505122505,19996003802.06
2011,35873723765.06,-16117488352,19756235413.06
2012,35791682437.86,-17549478009,18242204428.86
2013,35370704458.97,-18274357490,17096346968.97
2014,44127966413.29,-19878420141,24249546272.29
2015,51918414900.51,-24030295066,27888119834.51


Join the Medicaid drug costs and the state budget data.

In [48]:
medicaid = medicaid_drug_costs.merge(budgets, left_on=["st_name", "period_covered"], right_on=["state", "year"], how ="inner")
medicaid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 546 entries, 0 to 545
Data columns (total 16 columns):
period_covered           546 non-null int64
state_code               546 non-null object
drug_spending            546 non-null float64
st_name                  546 non-null object
st_abbreviation          546 non-null object
year_x                   546 non-null int64
drug_rebates             546 non-null int64
spending_less_rebates    546 non-null float64
year_y                   546 non-null int64
state                    546 non-null object
medicaid_federal         546 non-null float64
medicaid_total           546 non-null float64
all_spending_federal     546 non-null float64
all_spending_total       546 non-null float64
medicaid_state           546 non-null float64
all_spending_state       546 non-null float64
dtypes: float64(8), int64(4), object(4)
memory usage: 72.5+ KB


Calculate the proportion of each state's Medicaid spending for which drugs are responsible and the proportion of each state's total budget for which Medicaid is responsible.

In [49]:
medicaid["drugs_as_share_of_medicaid"] = medicaid["spending_less_rebates"] / medicaid["medicaid_total"]
medicaid["medicaid_as_share_of_all_spending"] = medicaid["medicaid_total"] / medicaid["all_spending_total"]
medicaid.head()

Unnamed: 0,period_covered,state_code,drug_spending,st_name,st_abbreviation,year_x,drug_rebates,spending_less_rebates,year_y,state,medicaid_federal,medicaid_total,all_spending_federal,all_spending_total,medicaid_state,all_spending_state,drugs_as_share_of_medicaid,medicaid_as_share_of_all_spending
0,2016,AK,108594348.62,Alaska,AK,2016,-53673099,54921249.62,2016,Alaska,1066381900.0,1702572800.0,3277197749.0,10031970628.0,636190900.0,6754772879.0,0.03,0.17
1,2016,AL,667346361.34,Alabama,AL,2016,-391980305,275366056.34,2016,Alabama,4393000000.0,6354000000.0,9635000000.0,25838000000.0,1961000000.0,16203000000.0,0.04,0.25
2,2016,AR,339785706.81,Arkansas,AR,2016,-204345993,135439713.81,2016,Arkansas,5077000000.0,6555700000.0,7397700000.0,23960524000.0,1478700000.0,16562824000.0,0.02,0.27
3,2016,AZ,1246118343.59,Arizona,AZ,2016,-604294056,641824287.59,2016,Arizona,8690000000.0,11521000000.0,14167000000.0,39682000000.0,2831000000.0,25515000000.0,0.06,0.29
4,2016,CA,8004575486.94,California,CA,2016,-4277024489,3727550997.94,2016,California,53337000000.0,81660000000.0,90690000000.0,250899000000.0,28323000000.0,160209000000.0,0.05,0.33


Reorder columns while dropping the unnecessary ones.

In [50]:
medicaid = medicaid[["period_covered", "st_abbreviation", "st_name", "drug_spending", "drug_rebates", "spending_less_rebates", "medicaid_state", "medicaid_federal", "medicaid_total", "all_spending_state", "all_spending_federal", "all_spending_total", "drugs_as_share_of_medicaid", "medicaid_as_share_of_all_spending"]]
medicaid.head()

Unnamed: 0,period_covered,st_abbreviation,st_name,drug_spending,drug_rebates,spending_less_rebates,medicaid_state,medicaid_federal,medicaid_total,all_spending_state,all_spending_federal,all_spending_total,drugs_as_share_of_medicaid,medicaid_as_share_of_all_spending
0,2016,AK,Alaska,108594348.62,-53673099,54921249.62,636190900.0,1066381900.0,1702572800.0,6754772879.0,3277197749.0,10031970628.0,0.03,0.17
1,2016,AL,Alabama,667346361.34,-391980305,275366056.34,1961000000.0,4393000000.0,6354000000.0,16203000000.0,9635000000.0,25838000000.0,0.04,0.25
2,2016,AR,Arkansas,339785706.81,-204345993,135439713.81,1478700000.0,5077000000.0,6555700000.0,16562824000.0,7397700000.0,23960524000.0,0.02,0.27
3,2016,AZ,Arizona,1246118343.59,-604294056,641824287.59,2831000000.0,8690000000.0,11521000000.0,25515000000.0,14167000000.0,39682000000.0,0.06,0.29
4,2016,CA,California,8004575486.94,-4277024489,3727550997.94,28323000000.0,53337000000.0,81660000000.0,160209000000.0,90690000000.0,250899000000.0,0.05,0.33


Export the data to Excel.

In [51]:
medicaid.to_excel("data/medicaid_costs_for_graphics.xlsx", index=False)

## Analyze the data

How has Medicaid spending on drugs changed over time?

In [52]:
medicaid.groupby("period_covered")["spending_less_rebates"].sum()

period_covered
2006    1600349527.23
2007    9312054714.88
2008   15181533291.39
2009   15169752263.57
2010   19996003802.06
2011   19756235413.06
2012   18128296239.87
2013   17066124026.46
2014   24195783690.30
2015   27826383269.39
2016   26387376826.91
Name: spending_less_rebates, dtype: float64

How have Medicaid programs' budgets changed over time? 

In [56]:
medicaid.groupby("period_covered")["medicaid_total"].sum()

period_covered
2006   281543503372.41
2007   292570316142.54
2008   297840892186.72
2009   333489736582.00
2010   359400202593.35
2011   397984952207.02
2012   391070400795.63
2013   406368154689.77
2014   459803265727.18
2015   515296220208.77
2016   541364849093.54
Name: medicaid_total, dtype: float64

How has drug spending changed as a proportion of Medicaid programs' budgets?

In [53]:
medicaid.groupby("period_covered")["spending_less_rebates"].sum() / medicaid.groupby("period_covered")["medicaid_total"].sum()

period_covered
2006   0.01
2007   0.03
2008   0.05
2009   0.05
2010   0.06
2011   0.05
2012   0.05
2013   0.04
2014   0.05
2015   0.05
2016   0.05
dtype: float64

How have total state budgets changed over time?

In [57]:
medicaid.groupby("period_covered")["all_spending_total"].sum()

period_covered
2006   1317741801784.57
2007   1401696143193.89
2008   1453532902661.21
2009   1528925803235.38
2010   1617119951985.93
2011   1672046141954.50
2012   1635928359466.63
2013   1670760856726.77
2014   1733728568017.37
2015   1844638482553.01
2016   1885646852310.84
Name: all_spending_total, dtype: float64

How have Medicaid programs' budgets changed as a proportion of total state budgets?

In [55]:
medicaid.groupby("period_covered")["medicaid_total"].sum() / medicaid.groupby("period_covered")["all_spending_total"].sum()

period_covered
2006   0.21
2007   0.21
2008   0.20
2009   0.22
2010   0.22
2011   0.24
2012   0.24
2013   0.24
2014   0.27
2015   0.28
2016   0.29
dtype: float64

In [27]:
import us

In [28]:
list_of_states = list(us.states.mapping('abbr', 'name').values())

In [37]:
# Store year along with the data from each sheet
# So we can add this as a column later
# Unnecessary for 2016 data

expenditures_16 = (pd.read_excel("data/medicaid_spending/FY_2016_Financial_Management_Data.xlsx", usecols=[0, 2, 4, 10]))
expenditures_15 = (2015, pd.read_excel("data/medicaid_spending/FY 2015 NET EXPENDITURES.xlsx", header=6, sheetname=None, usecols=[0, 1]))
expenditures_14 = (2014, pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY14.xlsx", header=6, sheetname=None, usecols=[0, 1]))
expenditures_13 = (2013, pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY13.xlsx", header=6, sheetname=None, usecols=[0, 1]))
expenditures_12 = (2012, pd.read_excel("data/medicaid_spending/FMR Net Expenditures FY12.xlsx", header=6, sheetname=None, usecols=[0, 1]))
expenditures_06_11 = pd.read_excel("data/medicaid_spending/NetExpenditure02through11.xlsx", header=None, skiprows=4, usecols=[0, 1], sheetname=["2006", "2007", "2008", "2009", "2010", "2011"])

In [38]:
# Make an array of the datasets we want to join
expenditures_12_16 = [expenditures_16, expenditures_15, expenditures_14, expenditures_13, expenditures_12]

# Empty array to hold the final dataframes
extracted_sheets = []
for year, data in expenditures_12_16:
    
    # Filter sheets that have "MAP" in the value.
    wanted_sheets = [(sheet_name, sheet) for sheet_name, sheet in data.items() if sheet_name.startswith('MAP')]
    
    # If we don't find any sheets that have MAP, then use all available sheets
    # This is to handle 2012 and 2016 datasets
    if not wanted_sheets:
        wanted_sheets = data.items()

    # Create a list of sheet names (in this case that's the state names)
    sheet_names = [sheet_name for sheet_name, sheet in wanted_sheets]
    
    # Create a list of all the sheets corresponding to each name above
    sheets = [sheet for sheet_name, sheet in wanted_sheets]
    
    # Remove MAP from sheet name if it exists
    sheet_names = [sheet_name.split('-')[-1] for sheet_name in sheet_names]
    
    # Combine all the sheets and use the sheet_names to add a state column in the final dataset
    all_states = pd.concat(sheets, keys=sheet_names)
    
    # Add a YEAR column to signify the year for the sheets being added
    all_states['Year'] = year
    
    # Add them to an array so they can be concatenated later.
    extracted_sheets.append(all_states)

ValueError: too many values to unpack (expected 2)

In [9]:
# Empty array to hold the final dataframes for 2006 - 2011
all_06_11_data = []

# Iterate through the file, the sheet name is the year.
for year, data in expenditures_06_11.items():
    
    # Identify rows that deliniate the tables (each containing a state) or contains one of the summary table names
    boundary_rows = data[0].isin(list_of_states + ['All States', 'National Totals'])
    boundary_indices = data[boundary_rows].index
    
    # Place holder to hold the data from the curent iteration
    states = []
    
    # Each boundary indicates the start of a table, each table contains data for a state
    for i,item in enumerate(boundary_indices):
        start = item
        if i+1 < len(boundary_indices): end = boundary_indices[i+1]
        else: end = None # Get the rest of the dataframe it's the last slice

        # Slice the current table (state) out of the main dataset
        current_dataset = data.iloc[start:end]
        
        # Replace columns with whitespace into Null value (NA)
        current_dataset = current_dataset.replace(r'^\s+$', np.nan, regex=True)
        
        # Remove rows where all columns are null
        current_dataset = current_dataset.dropna(how='all')
        
        # Get the name of the state from the first column of the first row
        state_name = current_dataset.iloc[0][0]
        
        # If the state name is a valid state then we keep the table (ignore summary statistics)
        if state_name in list_of_states:
            current_dataset['State'] = state_name
            current_dataset.columns = ['Service Category', 'Total Computable', 'State']
            current_dataset = current_dataset[-current_dataset['Service Category'].isin([state_name, 'Service Category'])]
            states.append(current_dataset)
    
    all_states_current_year = pd.concat(states)
    all_states_current_year['Year'] = year
    all_06_11_data.append(all_states_current_year)

In [11]:
# Concatenate all the sheets from all the years into a big dataframe with a state and year column
medicaid_rebates = pd.concat(all_06_11_data + extracted_sheets)

In [12]:
interested_categories = ("Drug Rebate Offset - National",
"Drug Rebate Offset - State Sidebar Agreement",
"MCO - National Agreement",
"MCO - State Sidebar Agreement",
"Increased ACA OFFSET - Fee for Service",
"Increased ACA OFFSET - MCO")

In [13]:
medicaid_drug_rebates = medicaid_rebates[medicaid_rebates['Service Category'].isin(interested_categories)]

In [15]:
medicaid_drug_rebates.to_excel("data/medicaid_drug_rebates.xlsx")

# Integrity checks scratch space

In [22]:
medicaid_drug_rebates[medicaid_drug_rebates["State"] == "Arizona"]

Unnamed: 0,Service Category,State,Total Computable,Total Computable.1,Year
408,Drug Rebate Offset - National,Arizona,,0.0,2006
409,Drug Rebate Offset - State Sidebar Agreement,Arizona,,0.0,2006
408,Drug Rebate Offset - National,Arizona,,0.0,2007
409,Drug Rebate Offset - State Sidebar Agreement,Arizona,,0.0,2007
526,Drug Rebate Offset - National,Arizona,,0.0,2008
527,Drug Rebate Offset - State Sidebar Agreement,Arizona,,0.0,2008
526,Drug Rebate Offset - National,Arizona,,0.0,2009
527,Drug Rebate Offset - State Sidebar Agreement,Arizona,,0.0,2009
736,Drug Rebate Offset - National,Arizona,,0.0,2010
737,Drug Rebate Offset - State Sidebar Agreement,Arizona,,0.0,2010
