In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render floats with two fixed decimals so large totals are not shown in scientific notation.
pd.set_option("display.float_format", "{:.2f}".format)

## Import and analyze Medicaid state drug utilization data

Query the Medicaid API for each drug's name, the state that reimbursed it, and the total number of units reimbursed in 2016. Records flagged as suppressed and rows with the state code `XX` are excluded.

In [2]:
# Build the Socrata (SoQL) request for the state drug utilization dataset.
# The clauses are kept pre-encoded exactly as the API expects them; they select the summed
# units reimbursed per (state, drug name), skipping suppressed rows and state code 'XX'.
BASE_URL = "https://data.medicaid.gov/resource/neai-csgh.json"
soql_clauses = [
    "$select=state_code,product_fda_list_name,sum(units_reimbursed)",
    "$where=suppression_used=False%20and%20not%20state_code='XX'",
    "$group=state_code,product_fda_list_name",
    "$limit=4621014",  # upper bound on returned rows, far above the number of groups
]

# Never commit API credentials: read the app token from the environment instead.
# NOTE(review): requests without a token are expected to still work (with stricter
# throttling) -- confirm against the Socrata documentation.
app_token = os.environ.get("MEDICAID_APP_TOKEN")
if app_token:
    soql_clauses.append("$$app_token=" + app_token)

query = BASE_URL + "?" + "&".join(soql_clauses)

# NOTE(review): product_fda_list_name is grouped as-is by the API. The names appear truncated
# and mixed-case (e.g. "Tramadol H" vs. upper-case names), so one drug may be split across
# several spellings -- consider normalising before ranking.
drugs = pd.read_json(query)
drugs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95081 entries, 0 to 95080
Data columns (total 3 columns):
product_fda_list_name    95079 non-null object
state_code               95081 non-null object
sum_units_reimbursed     95081 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.2+ MB


In [3]:
# Preview the first five rows of the downloaded data.
drugs.head(n=5)

Unnamed: 0,product_fda_list_name,state_code,sum_units_reimbursed
0,ZINC OXIDE,KY,417559.25
1,RAVICTI,TN,15775.0
2,BICILLIN L,IN,1919.67
3,Tramadol H,WA,36053.0
4,NAPROXEN 3,NV,8678.0


Rank the drugs by their units reimbursed within each state (rank 1 = most units reimbursed; tied drugs share the lowest rank).

In [4]:
# Rank every drug inside its own state: the largest unit total gets rank 1,
# and tied totals share the smallest rank of the tie (method="min").
units_per_state = drugs.groupby("state_code")["sum_units_reimbursed"]
state_rank = units_per_state.rank(method="min", ascending=False)

# rank() yields floats; store whole-number ranks instead.
drugs["rank"] = state_rank.astype(int)
drugs.head()

Unnamed: 0,product_fda_list_name,state_code,sum_units_reimbursed,rank
0,ZINC OXIDE,KY,417559.25,283
1,RAVICTI,TN,15775.0,734
2,BICILLIN L,IN,1919.67,2131
3,Tramadol H,WA,36053.0,966
4,NAPROXEN 3,NV,8678.0,1075


Create a new dataframe with the top 10 drugs in each state.

In [5]:
# Keep only rows ranked 10th or better, ordered by state and then by rank.
is_top_10 = drugs["rank"] <= 10
top_10 = (
    drugs.loc[is_top_10]
    .sort_values(["state_code", "rank"])
    .reset_index(drop=True)
)
top_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 4 columns):
product_fda_list_name    510 non-null object
state_code               510 non-null object
sum_units_reimbursed     510 non-null float64
rank                     510 non-null int32
dtypes: float64(1), int32(1), object(2)
memory usage: 14.0+ KB


How many times does each drug appear in a state's top-10 list?

In [6]:
# Tally how many top-10 rows each drug name has and expose the tally as a two-column frame
# (drug name, count), most frequent first.
counts = (
    top_10["product_fda_list_name"]
    .value_counts()
    .rename_axis("product_fda_list_name")
    .reset_index(name="count")
)
counts.head()

Unnamed: 0,product_fda_list_name,count
0,AMOXICILLI,50
1,POLYETHYLE,48
2,GABAPENTIN,43
3,ALBUTEROL,38
4,SODIUM CHL,33


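Merge the counts back into the top-10 DataFrame so that every row carries the number of state top-10 lists its drug appears in.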

In [7]:
# Attach each drug's top-10 appearance count to every one of its rows.
# Any "count" column left over from a previous run is dropped first so the cell is idempotent:
# otherwise re-running it would merge into the already-merged frame, producing
# "count_x"/"count_y" columns and breaking the later top_10["count"] lookup.
top_10_without_count = top_10.drop("count", axis=1, errors="ignore")
top_10 = top_10_without_count.merge(counts, how="inner", on="product_fda_list_name")
top_10.head()

Unnamed: 0,product_fda_list_name,state_code,sum_units_reimbursed,rank,count
0,SODIUM CHL,AK,4318793.63,1,33
1,SODIUM CHL,CA,126541509.44,9,33
2,SODIUM CHL,CO,14185663.68,5,33
3,SODIUM CHL,DE,2099675.0,10,33
4,SODIUM CHL,FL,128607823.5,1,33


Which drugs appear in only a single state's top-10 list?

In [8]:
# Drugs whose count is at most 1, i.e. present in a single state's top-10 list,
# listed alphabetically by drug name and then by state.
appears_once = top_10["count"] <= 1
outliers = (
    top_10.loc[appears_once]
    .sort_values(["product_fda_list_name", "state_code"])
    .reset_index(drop=True)
)
outliers

Unnamed: 0,product_fda_list_name,state_code,sum_units_reimbursed,rank,count
0,ADVATE 5ML,NV,8649028.0,8,1
1,ALPRAZOLAM,MO,9810078.31,10,1
2,AMLODIPINE,DC,2354345.13,10,1
3,AMMONIUM L,NY,61596452.85,8,1
4,BROMFED DM,TX,61984061.99,7,1
5,BUPROPION,VT,1365163.5,9,1
6,CHILDREN I,TX,77695830.83,6,1
7,CLONAZEPAM,RI,2107665.0,9,1
8,Cetirizine,OK,5399826.07,7,1
9,DEXTROAMP-,MA,14594796.5,8,1


Export the outliers data as an Excel file.

In [9]:
# Save the single-state drugs to an Excel workbook (path is relative to the working directory).
export_path = "medicaid_drug_utilization_outliers.xlsx"
outliers.to_excel(export_path)