# Libraries

In [2]:
import pandas as pd
import requests

pd.set_option('display.max_columns', None) # display all columns in DF

# Data from Metastore

In [3]:
r = requests.get('https://data.medicaid.gov/api/1/metastore/schemas/dataset/items') # requesting meta data
data = r.json()

df_metastore = pd.json_normalize(data, max_level = 1)
df_metastore = df_metastore[df_metastore['title'].str.contains('State Drug Utilization Data')] # selecting datasets
df_metastore = df_metastore.sort_values(['title']) # sorting datasets
df_metastore['year'] = df_metastore['title'].str[-4:] # creating variable for year

# Data from Medicaid

In [21]:
start_year = 2020

for index, row in df_metastore.iterrows(): # iterating through meta data

    title_i = df_metastore['title'][index]
    identifier_i = df_metastore['identifier'][index]
    year_i = df_metastore['year'][index]

    if int(year_i) >= start_year: # selecting meta data from start year and onward
        
        print(title_i)
        
        start_url = 'https://data.medicaid.gov/api/1/datastore/query/'
        middle_url = str(identifier_i)
        end_url = '/0'
        
        r = requests.get(start_url + middle_url + end_url, timeout = 20)  # requesting part of dataset
        data = r.json()
        
        data_n = data['count'] # finding the number of observations
        
        for offset_i in range(0, int(data_n), 10000): # iterating through data in smaller 
    
            if offset_i == 0:

                df_temp = pd.json_normalize(data['results'], max_level = 1)
                
                df_temp['units_reimbursed'] = pd.to_numeric(df_temp['units_reimbursed'], errors='coerce') # removing empty observations
                df_temp['total_amount_reimbursed'] = pd.to_numeric(df_temp['total_amount_reimbursed'], errors='coerce')
                df_temp.dropna(subset = ['units_reimbursed', 'total_amount_reimbursed'], inplace = True)

                df_temp = df_temp[(df_temp['units_reimbursed'] != 0)] # removing zero observations
                df_temp = df_temp[(df_temp['total_amount_reimbursed'] != 0)]
                
            else:
                offset = offset_i
                offset_url = '?offset=' + str(offset)
                r = requests.get(start_url + middle_url + end_url + offset_url, timeout = 20) # requesting part of dataset
                data = r.json()
                
                df_temp = pd.json_normalize(data['results'], max_level = 1)
                
                df_temp['units_reimbursed'] = pd.to_numeric(df_temp['units_reimbursed'], errors='coerce') # removing empty observations
                df_temp['total_amount_reimbursed'] = pd.to_numeric(df_temp['total_amount_reimbursed'], errors='coerce')
                df_temp.dropna(subset = ['units_reimbursed', 'total_amount_reimbursed'], inplace = True)

                df_temp = df_temp[(df_temp['units_reimbursed'] != 0)] # removing zero observations
                df_temp = df_temp[(df_temp['total_amount_reimbursed'] != 0)]

            if offset_i == 0:
                df = df_temp
            else:
                df = df.append(df_temp, ignore_index = True)
        
    if int(year_i) == start_year:
        df['dataset'] = title_i # adding name of dataset
        
        df['price_per_unit'] = df['total_amount_reimbursed'] / df['units_reimbursed'] # calculating price per unit
        df = df.groupby(['dataset', 'labeler_code', 'product_code', 'year', 'quarter'])['price_per_unit'].mean() # grouping by for each NDA and year    
        
        df_medicaid = df.reset_index()
        print('hello', title_i)

    elif int(year_i) >= start_year:
        df['dataset'] = title_i # adding name of dataset
        
        df['price_per_unit'] = df['total_amount_reimbursed'] / df['units_reimbursed'] # calculating price per unit
        df = df.groupby(['dataset', 'labeler_code', 'product_code', 'year', 'quarter'])['price_per_unit'].mean() # grouping by for each NDA and year
        df = df.reset_index()
        
        df_medicaid = df_medicaid.append(df, ignore_index = True)
        print('hello', title_i)

State Drug Utilization Data 2020
hello State Drug Utilization Data 2020
State Drug Utilization Data 2021


TypeError: to_append should be a Series or list/tuple of Series, got DataFrame

In [23]:
df_temp

Unnamed: 0,utilization_type,state,ndc,labeler_code,product_code,package_size,year,quarter,suppression_used,product_name,units_reimbursed,number_of_prescriptions,total_amount_reimbursed,medicaid_amount_reimbursed,non_medicaid_amount_reimbursed
333,FFSU,DC,00173069604,00173,0696,04,2021,1,false,ADVAIR 250,238.0,17,2768.96,2768.96,0
334,MCOU,DC,64980015701,64980,0157,01,2021,1,false,VIT D2 1.2,294.0,68,102.78,102.78,0
335,MCOU,KS,00832046560,00832,0465,60,2021,1,false,NYAMYC 60G,1740.0,26,777.25,674.33,102.92
336,MCOU,KS,42543049701,42543,0497,01,2021,1,false,Amantadine,1459.0,32,1623.09,1430.09,193
338,MCOU,KS,00527328443,00527,3284,43,2021,1,false,Levothyrox,1155.0,20,543.61,507.61,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9986,MCOU,KS,00378180010,00378,1800,10,2021,1,false,LEVOTHYROX,4512.0,83,1818.21,1658.86,159.35
9989,MCOU,KS,00832003810,00832,0038,10,2021,1,false,OXYBUTYNIN,3968.0,39,805.83,733.83,72
9991,MCOU,DC,70010075405,70010,0754,05,2021,1,false,METHOCARBA,673.0,12,60.89,60.89,0
9994,MCOU,KS,49348098053,49348,0980,53,2021,1,false,SM ASPIRIN,913.0,31,312.75,267.75,45


### Calculating price per unit and grouping

In [None]:
df_medicaid['units_reimbursed'] = pd.to_numeric(df_medicaid['units_reimbursed'], errors='coerce') # removing empty observations
df_medicaid['total_amount_reimbursed'] = pd.to_numeric(df_medicaid['total_amount_reimbursed'], errors='coerce')
df_medicaid.dropna(subset = ['units_reimbursed', 'total_amount_reimbursed'], inplace = True)

df_medicaid = df_medicaid[(df_medicaid['units_reimbursed'] != 0)] # removing zero observations
df_medicaid = df_medicaid[(df_medicaid['total_amount_reimbursed'] != 0)]

df_medicaid['price_per_unit'] = df_medicaid['total_amount_reimbursed'] / df_medicaid['units_reimbursed'] # calculating price per unit
                
df_medicaid = df_medicaid.groupby(['dataset', 'labeler_code', 'product_code', 'year', 'quarter'])['price_per_unit'].mean() # grouping by for each NDA and year
df_medicaid = df_medicaid.reset_index()

In [20]:
df_medicaid.to_excel('raw_medicaid_data_test.xlsx', index = False)

# Downloading output

In [None]:
df_medicaid.to_csv('raw_medicaid_data_test.csv', index = False)
df_medicaid.to_excel('raw_medicaid_data_test.xlsx', index = False)