# Libraries

In [2]:
import pandas as pd
import requests

# Show every column when a DataFrame is rendered (None lifts the column cap)
pd.options.display.max_columns = None

# Data from Metastore

In [3]:
# Fetch the dataset catalogue from the Medicaid metastore.
# The timeout (same value as the datastore requests in this notebook) keeps a
# stalled connection from hanging the kernel indefinitely.
response = requests.get('https://data.medicaid.gov/api/1/metastore/schemas/dataset/items', timeout = 20)
response.raise_for_status() # fail loudly on an HTTP error instead of parsing an error body
metastore_items = response.json()

df_metastore = pd.json_normalize(metastore_items, max_level = 1)

# Keep only the State Drug Utilization datasets, ordered by title.
# na = False treats entries without a title as non-matching instead of raising when masking.
is_sdud = df_metastore['title'].str.contains('State Drug Utilization Data', regex = False, na = False)
df_metastore = df_metastore[is_sdud].sort_values(['title'])

# The last four characters of the title are taken as the year (kept as a string)
df_metastore['year'] = df_metastore['title'].str[-4:]

# Data from Medicaid

In [34]:
start_year = 2020 # first dataset year to download

# Offset step between consecutive requests.
# NOTE(review): no explicit limit is sent, so this assumes each request returns
# up to 10000 rows -- confirm against the API's default page size.
page_size = 10000
base_url = 'https://data.medicaid.gov/api/1/datastore/query/'

# One DataFrame per downloaded page; they are concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the growing frame on every page.
frames = []

for _, row in df_metastore.iterrows(): # iterating through meta data

    if int(row['year']) < start_year: # skip datasets older than the start year
        continue

    title_i = row['title']
    print(title_i)

    query_url = base_url + str(row['identifier']) + '/0'

    # The first request returns both the first page and the total row count
    r = requests.get(query_url, timeout = 20)
    r.raise_for_status() # stop on an HTTP error rather than failing later on a missing key
    data = r.json()

    data_n = int(data['count']) # number of observations in the dataset

    for offset_i in range(0, data_n, page_size): # iterating through the dataset in smaller pages

        if offset_i > 0: # offset 0 is already in `data` from the request above
            r = requests.get(query_url + '?offset=' + str(offset_i), timeout = 20)
            r.raise_for_status()
            data = r.json()

        df = pd.json_normalize(data['results'], max_level = 1)
        df['dataset'] = title_i # adding name of dataset
        frames.append(df)

# Build the combined frame in one step. This no longer requires a dataset whose
# year equals start_year exactly; an empty frame results if nothing was downloaded.
df_medicaid = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

State Drug Utilization Data 2020
State Drug Utilization Data 2021


In [35]:
# Number of rows downloaded across all selected datasets
df_medicaid.shape[0]

6091457

In [38]:
# TODO: decide how to handle rows where units_reimbursed or total_amount_reimbursed is zero (they distort price_per_unit)

### Calculating price per unit and grouping

In [36]:
# Convert the reimbursement columns to numbers; values that cannot be parsed become NaN
reimbursement_cols = ['units_reimbursed', 'total_amount_reimbursed']
for col in reimbursement_cols:
    df_medicaid[col] = pd.to_numeric(df_medicaid[col], errors='coerce')

# Drop rows where either value is missing
df_medicaid = df_medicaid.dropna(subset = reimbursement_cols)

# Drop rows with zero units: dividing by zero gives inf (or NaN for 0/0),
# which would contaminate the group means computed below
df_medicaid = df_medicaid[df_medicaid['units_reimbursed'] != 0].copy()

df_medicaid['price_per_unit'] = df_medicaid['total_amount_reimbursed'] / df_medicaid['units_reimbursed'] # calculating price per unit

# Average price per unit for each dataset, labeler/product code, year and quarter
group_keys = ['dataset', 'labeler_code', 'product_code', 'year', 'quarter']
df_medicaid = (
    df_medicaid
    .groupby(group_keys)['price_per_unit']
    .mean()
    .reset_index()
)

In [37]:
# Preview the first five aggregated rows
df_medicaid.head(n = 5)

Unnamed: 0,dataset,labeler_code,product_code,year,quarter,price_per_unit
0,State Drug Utilization Data 2020,2,101,2020,1,55.137297
1,State Drug Utilization Data 2020,2,101,2020,2,31.812113
2,State Drug Utilization Data 2020,2,101,2020,3,0.0
3,State Drug Utilization Data 2020,2,101,2020,4,0.0
4,State Drug Utilization Data 2020,2,1433,2020,1,372.331892


# Exporting output

In [9]:
# Write the aggregated prices to disk as CSV and as Excel, leaving out the index column
csv_path = 'raw_medicaid_data_test.csv'
xlsx_path = 'raw_medicaid_data_test.xlsx'

df_medicaid.to_csv(csv_path, index = False)
df_medicaid.to_excel(xlsx_path, index = False)