# Libraries

In [2]:
import pandas as pd
import requests

pd.set_option('display.max_columns', None) # display all columns in DF

# Data from Metastore

In [3]:
r = requests.get('https://data.medicaid.gov/api/1/metastore/schemas/dataset/items') # requesting meta data
data = r.json()

df_metastore = pd.json_normalize(data, max_level = 1)
df_metastore = df_metastore[df_metastore['title'].str.contains('State Drug Utilization Data')] # selecting datasets
df_metastore = df_metastore.sort_values(['title']) # sorting datasets
df_metastore['year'] = df_metastore['title'].str[-4:] # creating variable for year

# Data from Medicaid

In [7]:
start_year = 1999

for index, row in df_metastore.iterrows(): # iterating through meta data

    title_i = df_metastore['title'][index]
    identifier_i = df_metastore['identifier'][index]
    year_i = df_metastore['year'][index]

    if int(year_i) >= start_year: # selecting meta data from start year and onward

        start_url = 'https://data.medicaid.gov/api/1/datastore/query/'
        middle_url = str(identifier_i)
        end_url = '/0'
        
        r = requests.get(start_url + middle_url + end_url, timeout = 20)  # requesting part of dataset
        data = r.json()
        
        data_n = data['count'] # finding the number of observations
        
        for offset_i in range(0, int(data_n), 10000): # iterating through data in smaller 
    
            if offset_i == 0:

                df = pd.json_normalize(data['results'], max_level = 1)
                
                df['units_reimbursed'] = pd.to_numeric(df['units_reimbursed'], errors='coerce') # removing empty observations
                df['total_amount_reimbursed'] = pd.to_numeric(df['total_amount_reimbursed'], errors='coerce')
                df.dropna(subset = ['units_reimbursed', 'total_amount_reimbursed'], inplace = True)
            else:
                offset = offset_i
                offset_url = '?offset=' + str(offset)
                r = requests.get(start_url + middle_url + end_url + offset_url, timeout = 20) # requesting part of dataset
                data = r.json()
                
                df = pd.json_normalize(data['results'], max_level = 1)
                
                df['units_reimbursed'] = pd.to_numeric(df['units_reimbursed'], errors='coerce') # removing empty observations
                df['total_amount_reimbursed'] = pd.to_numeric(df['total_amount_reimbursed'], errors='coerce')
                df.dropna(subset = ['units_reimbursed', 'total_amount_reimbursed'], inplace = True)
        
            df['dataset'] = title_i # adding name of dataset

            df['price_per_unit'] = df['total_amount_reimbursed'] / df['units_reimbursed'] # calculating price per unit

            df = df.groupby(['dataset', 'labeler_code', 'product_code', 'year', 'quarter'])['price_per_unit'].mean()

            if int(year_i) == start_year and offset_i == 0:
                df_medicaid = df.reset_index()
            else:
                df = df.reset_index()
                df_medicaid = df_medicaid.append(df, ignore_index = True)
                
            df_medicaid = df_medicaid.groupby(['dataset', 'labeler_code', 'product_code', 'year', 'quarter'])['price_per_unit'].mean() # grouping by for each NDA and year
            df_medicaid = df_medicaid.reset_index()
        
        print(title_i)

State Drug Utilization Data 1999
State Drug Utilization Data 2000
State Drug Utilization Data 2001
State Drug Utilization Data 2002
State Drug Utilization Data 2003
State Drug Utilization Data 2004
State Drug Utilization Data 2005
State Drug Utilization Data 2006
State Drug Utilization Data 2007
State Drug Utilization Data 2008
State Drug Utilization Data 2009
State Drug Utilization Data 2010
State Drug Utilization Data 2011
State Drug Utilization Data 2012
State Drug Utilization Data 2013
State Drug Utilization Data 2014
State Drug Utilization Data 2015
State Drug Utilization Data 2016
State Drug Utilization Data 2017
State Drug Utilization Data 2018
State Drug Utilization Data 2019
State Drug Utilization Data 2020
State Drug Utilization Data 2021


In [8]:
len(df_medicaid)

1574630

In [9]:
df_medicaid

Unnamed: 0,dataset,labeler_code,product_code,year,quarter,price_per_unit
0,State Drug Utilization Data 1999,00002,0125,1999,1,0.020499
1,State Drug Utilization Data 1999,00002,0125,1999,2,0.019720
2,State Drug Utilization Data 1999,00002,0125,1999,3,0.021717
3,State Drug Utilization Data 1999,00002,0125,1999,4,0.020933
4,State Drug Utilization Data 1999,00002,0313,1999,1,0.024479
...,...,...,...,...,...,...
1574625,State Drug Utilization Data 2021,78670,0130,2021,1,134.683129
1574626,State Drug Utilization Data 2021,78670,0131,2021,1,124.031333
1574627,State Drug Utilization Data 2021,89141,0123,2021,1,4.395548
1574628,State Drug Utilization Data 2021,99207,0240,2021,1,19.294980


# Downloading output

In [10]:
df_medicaid.to_csv('raw_medicaid_data.csv', index = False)