# Libraries

In [1]:
import pandas as pd
import requests

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

# Data from Metastore

In [2]:
# Fetch the dataset catalogue from the data.medicaid.gov metastore and keep only
# the "State Drug Utilization Data" entries, with the year taken from the title.
r = requests.get('https://data.medicaid.gov/api/1/metastore/schemas/dataset/items', timeout = 20) # requesting meta data; timeout so the cell cannot hang forever
r.raise_for_status() # fail loudly on an HTTP error instead of parsing an error page
data = r.json()

df_metastore = pd.json_normalize(data, max_level = 1)

# na = False: entries without a title are dropped instead of producing a NaN mask
# regex = False: the title is matched as a plain substring
is_utilization_data = df_metastore['title'].str.contains('State Drug Utilization Data', na = False, regex = False)
df_metastore = df_metastore[is_utilization_data].copy() # selecting datasets; copy so the column added below does not write into a slice
df_metastore = df_metastore.sort_values(['title']) # sorting datasets
df_metastore['year'] = df_metastore['title'].str[-4:] # creating variable for year (last four characters of the title)

# Data from Medicaid

In [5]:
# Download every State Drug Utilization dataset from start_year onward, page by
# page, and build df_medicaid: the mean price per unit for each
# dataset / labeler_code / product_code / year / quarter combination.
start_year = 1999 # first dataset year to download

query_url = 'https://data.medicaid.gov/api/1/datastore/query/'
value_columns = ['units_reimbursed', 'total_amount_reimbursed']
group_columns = ['dataset', 'labeler_code', 'product_code', 'year', 'quarter']

yearly_prices = [] # one aggregated frame per dataset; concatenated once after the loop

for index, row in df_metastore.iterrows(): # iterating through meta data

    title_i = row['title']
    identifier_i = row['identifier']
    year_i = str(row['year'])

    # skipping titles that do not end in a numeric year and datasets before the start year
    if not year_i.isdecimal() or int(year_i) < start_year:
        continue

    print(title_i)

    dataset_url = query_url + str(identifier_i) + '/0'

    pages = []     # cleaned pages of the current dataset
    data_n = None  # total number of observations, read from the first response
    offset_i = 0

    while data_n is None or offset_i < data_n: # iterating through data in smaller parts

        params = {'offset': offset_i} if offset_i else None # first request is sent without an offset
        r = requests.get(dataset_url, params = params, timeout = 20) # requesting part of dataset
        r.raise_for_status() # fail loudly on an HTTP error instead of parsing an error page
        data = r.json()

        if data_n is None:
            data_n = int(data['count']) # finding the number of observations

        results = data['results']
        if not results: # server returned nothing more; stop rather than loop forever
            break

        df_temp = pd.json_normalize(results, max_level = 1)

        for column in value_columns: # non-numeric values become NaN ...
            df_temp[column] = pd.to_numeric(df_temp[column], errors = 'coerce')
        df_temp = df_temp.dropna(subset = value_columns) # ... and are removed as empty observations

        # removing zero observations
        df_temp = df_temp[(df_temp['units_reimbursed'] != 0) & (df_temp['total_amount_reimbursed'] != 0)]

        if not df_temp.empty:
            pages.append(df_temp)

        # Advance by the number of rows actually returned: the page size is decided
        # by the server, so a fixed step could skip rows if it differs from the step.
        offset_i += len(results)

    if not pages: # nothing usable in this dataset; do not re-aggregate a previous year's rows
        continue

    df = pd.concat(pages, ignore_index = True) # DataFrame.append no longer exists in pandas 2.x
    df['dataset'] = title_i # adding name of dataset

    df['price_per_unit'] = df['total_amount_reimbursed'] / df['units_reimbursed'] # calculating price per unit
    df = df.groupby(group_columns)['price_per_unit'].mean().reset_index() # mean price per product, year and quarter

    yearly_prices.append(df)
    print('hello', title_i)

if yearly_prices:
    df_medicaid = pd.concat(yearly_prices, ignore_index = True)
else:
    # no dataset matched: keep df_medicaid defined, with the expected columns
    df_medicaid = pd.DataFrame(columns = group_columns + ['price_per_unit'])

State Drug Utilization Data 1999
hello State Drug Utilization Data 1999
State Drug Utilization Data 2000
hello State Drug Utilization Data 2000
State Drug Utilization Data 2001
hello State Drug Utilization Data 2001
State Drug Utilization Data 2002
hello State Drug Utilization Data 2002
State Drug Utilization Data 2003
hello State Drug Utilization Data 2003
State Drug Utilization Data 2004
hello State Drug Utilization Data 2004
State Drug Utilization Data 2005
hello State Drug Utilization Data 2005
State Drug Utilization Data 2006
hello State Drug Utilization Data 2006
State Drug Utilization Data 2007
hello State Drug Utilization Data 2007
State Drug Utilization Data 2008
hello State Drug Utilization Data 2008
State Drug Utilization Data 2009
hello State Drug Utilization Data 2009
State Drug Utilization Data 2010
hello State Drug Utilization Data 2010
State Drug Utilization Data 2011
hello State Drug Utilization Data 2011
State Drug Utilization Data 2012
hello State Drug Utilization Da

# Saving output to file

In [7]:
# Write the aggregated price table to a CSV file in the working directory,
# leaving out the DataFrame index.
csv_path = 'raw_medicaid_data.csv'
df_medicaid.to_csv(csv_path, index = False)
# Excel export, currently disabled:
#df_medicaid.to_excel('raw_medicaid_data.xlsx', index = False)