# Libraries

In [4]:
import pandas as pd
import requests

pd.set_option('display.max_columns', None) # None removes the column limit, so displayed DataFrames show every column

# Data from Metastore

In [5]:
# Download the metastore catalogue and keep only the
# "State Drug Utilization Data" entries, tagged with the year taken from
# the last four characters of each title.
r = requests.get('https://data.medicaid.gov/api/1/metastore/schemas/dataset/items', timeout = 20) # requesting meta data; timeout so a stalled connection cannot hang the notebook
r.raise_for_status() # stop on an HTTP error instead of trying to parse an error response
data = r.json()

df_metastore = pd.json_normalize(data, max_level = 1)

# na = False: an entry without a title counts as "no match" rather than
# putting NaN into the boolean mask (which boolean indexing rejects).
is_drug_utilization = df_metastore['title'].str.contains('State Drug Utilization Data', na = False)

# Explicit copy so the column assignment below writes to an independent frame.
df_metastore = df_metastore[is_drug_utilization].copy() # selecting datasets
df_metastore = df_metastore.sort_values(['title']) # sorting datasets
df_metastore['year'] = df_metastore['title'].str[-4:] # creating variable for year

# Data from Medicaid

In [6]:
start_year = 2020 # first year 1991

start_url = 'https://data.medicaid.gov/api/1/datastore/query/'
end_url = '/0'
request_timeout = 20 # seconds, same limit for every datastore request

key_cols = ['labeler_code', 'product_code', 'year', 'quarter']
value_cols = ['units_reimbursed', 'total_amount_reimbursed']


def fetch_page(identifier, offset):
    """Request one page of a dataset from the datastore query endpoint.

    Parameters
    ----------
    identifier : value of the metastore 'identifier' field for the dataset.
    offset : number of rows to skip; 0 requests the first page (no query string).

    Returns
    -------
    dict
        The decoded JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = start_url + str(identifier) + end_url
    if offset:
        url += '?offset=' + str(offset)
    response = requests.get(url, timeout = request_timeout)
    response.raise_for_status() # do not try to parse an error response as data
    return response.json()


def clean_page(results):
    """Turn one page of raw results into a cleaned DataFrame.

    The two reimbursement columns are converted to numbers (unparseable
    values become NaN), rows where either is missing or zero are dropped,
    and only the key and value columns are kept.
    """
    df_page = pd.json_normalize(results, max_level = 1)

    for col in value_cols:
        df_page[col] = pd.to_numeric(df_page[col], errors = 'coerce')

    df_page = df_page.dropna(subset = value_cols) # removing empty observations
    non_zero = (df_page['units_reimbursed'] != 0) & (df_page['total_amount_reimbursed'] != 0)
    df_page = df_page[non_zero] # removing zero observations

    return df_page[key_cols + value_cols]


# One aggregated frame per dataset. Collecting them in a list and
# concatenating once replaces DataFrame.append (removed in pandas 2.0) and
# no longer requires that a dataset for exactly `start_year` exists.
aggregated = []

for index, row in df_metastore.iterrows(): # iterating through meta data

    title_i = row['title']
    identifier_i = row['identifier']
    year_i = row['year']

    # Skip titles whose last four characters are not a year, and years
    # before the requested start year.
    if not str(year_i).isdigit() or int(year_i) < start_year:
        continue

    print(title_i)

    # Iterating through the dataset in smaller pages. The offset advances by
    # the number of rows actually returned, so no page size is assumed.
    pages = []
    offset = 0
    data_n = None
    while True:
        data = fetch_page(identifier_i, offset)
        if data_n is None:
            data_n = int(data['count']) # finding the number of observations

        results = data.get('results', [])
        if not results:
            break

        pages.append(clean_page(results))
        offset += len(results)
        if offset >= data_n:
            break

    if not pages:
        # Nothing was returned for this dataset; never reuse rows left over
        # from the previous one.
        continue

    df = pd.concat(pages, ignore_index = True)
    df['dataset'] = title_i # adding name of dataset

    # grouping by for each NDA and quarter; both sums in a single pass
    df_medicaid_temp = df.groupby(['dataset'] + key_cols)[value_cols].sum().reset_index()
    aggregated.append(df_medicaid_temp)

if aggregated:
    df_medicaid = pd.concat(aggregated, ignore_index = True)
else:
    # No dataset matched: keep the expected columns so later cells still run.
    df_medicaid = pd.DataFrame(columns = ['dataset'] + key_cols + value_cols)

df_medicaid['price_per_unit'] = df_medicaid['total_amount_reimbursed'] / df_medicaid['units_reimbursed'] # calculating price per unit

State Drug Utilization Data 2020
State Drug Utilization Data 2021


In [7]:
# Show df_medicaid as the cell output; for a long frame the display elides the middle rows.
df_medicaid

Unnamed: 0,dataset,labeler_code,product_code,year,quarter,units_reimbursed,total_amount_reimbursed,price_per_unit
0,State Drug Utilization Data 2020,00002,0101,2020,1,222.0,1.224048e+04,55.137297
1,State Drug Utilization Data 2020,00002,0101,2020,2,426.0,1.355196e+04,31.812113
2,State Drug Utilization Data 2020,00002,1433,2020,1,264990.0,1.008005e+08,380.393731
3,State Drug Utilization Data 2020,00002,1433,2020,2,295444.0,1.142042e+08,386.551215
4,State Drug Utilization Data 2020,00002,1433,2020,3,334039.5,1.296977e+08,388.270457
...,...,...,...,...,...,...,...,...
105838,State Drug Utilization Data 2021,78670,0130,2021,1,183.2,2.476169e+04,135.162063
105839,State Drug Utilization Data 2021,78670,0131,2021,1,60.0,7.441880e+03,124.031333
105840,State Drug Utilization Data 2021,89141,0123,2021,1,4050.0,1.780197e+04,4.395548
105841,State Drug Utilization Data 2021,99207,0240,2021,1,1494.0,2.882670e+04,19.294980


# Downloading output

In [8]:
# Write df_medicaid to 'raw_medicaid_data.csv' using '~' as the field separator and without the row index.
df_medicaid.to_csv('raw_medicaid_data.csv', sep = '~', index = False)

# Appending output if split (do not use)

In [14]:
# Combine two separately exported files into a single output file.
# NOTE: this overwrites 'raw_medicaid_data.csv'.

# Read the code columns as text: left to type inference they become integers
# and codes such as '00002' or '0101' lose their leading zeros.
code_dtypes = {'labeler_code': str, 'product_code': str}

df_medicaid_2019 = pd.read_csv('raw_medicaid_data_2019.csv', sep = '~', dtype = code_dtypes)

# Drop the 2020 rows from the first file. `year` is compared numerically
# because read_csv infers it as a number, and a numeric column never equals
# the string '2020' (the string comparison kept every row).
year_2019 = pd.to_numeric(df_medicaid_2019['year'], errors = 'coerce')
df_medicaid_2019 = df_medicaid_2019[year_2019 != 2020].copy()

df_medicaid_2019['price_per_unit'] = df_medicaid_2019['total_amount_reimbursed'] / df_medicaid_2019['units_reimbursed']

df_medicaid_2021 = pd.read_csv('raw_medicaid_data_2021.csv', sep = '~', dtype = code_dtypes)

df_medicaid = pd.concat([df_medicaid_2019, df_medicaid_2021], ignore_index = True)
df_medicaid.to_csv('raw_medicaid_data.csv', sep = '~', index = False)