# Libraries

In [1]:
import pandas as pd
import numpy as np
import math
import requests

# Data from Metastore

In [2]:
# Download the dataset catalogue and keep only the State Drug Utilization Data entries.
# The result, `df_metastore`, has one row per dataset with its `title`, `identifier`
# and a `year` column (kept as a 4-character string) taken from the end of the title.
r = requests.get('https://data.medicaid.gov/api/1/metastore/schemas/dataset/items', timeout = 20) # requesting meta data
r.raise_for_status() # fail early on an HTTP error instead of parsing an error payload
data = r.json()

df_metastore = pd.json_normalize(data, max_level = 1)

# na=False: catalogue entries without a title are simply not selected
# (a missing title would otherwise make the boolean mask invalid).
is_sdud = df_metastore['title'].str.contains('State Drug Utilization Data', regex = False, na = False)

df_metastore = (
    df_metastore[is_sdud] # selecting datasets
    # Only accept a trailing 4-digit year; titles that do not end in one get NaN here
    # and are dropped, so converting `year` to int later cannot fail.
    .assign(year = lambda d: d['title'].str.extract(r'(\d{4})\s*$', expand = False))
    .dropna(subset = ['year'])
    .sort_values(['title']) # sorting datasets
)

# Data from Medicaid

In [None]:
# Download every State Drug Utilization dataset from `start_year` onward, page by page,
# and compute the mean price per unit for each (labeler_code, product_code, year).
#
# Results left in the namespace:
#   df_medicaid -- DataFrame with the aggregated prices of all downloaded datasets
#   df          -- aggregated prices (Series indexed by the group keys) of the last dataset processed
start_year = 2020

# Offset step between consecutive requests.
# NOTE(review): no explicit `limit` is sent, so this assumes the API returns
# 10000 rows per request -- confirm against the API's default/maximum page size.
page_size = 10000

base_url = 'https://data.medicaid.gov/api/1/datastore/query/'
numeric_cols = ['units_reimbursed', 'total_amount_reimbursed']
group_cols = ['labeler_code', 'product_code', 'year']


def fetch_page(identifier, offset):
    """Request one page of a dataset and return the decoded JSON payload.

    identifier -- dataset identifier from the metastore
    offset     -- index of the first row to return (0 requests the first page)

    Raises requests.HTTPError on a non-success HTTP status.
    """
    params = {'offset': offset} if offset else None
    response = requests.get(base_url + str(identifier) + '/0', params = params, timeout = 20)
    response.raise_for_status()
    return response.json()


def clean_page(results):
    """Convert one page of raw records to a DataFrame without empty observations.

    The reimbursement columns are converted to numbers (unparseable values become NaN)
    and rows missing either of them are dropped. An empty page is returned unchanged.
    """
    page = pd.json_normalize(results, max_level = 1)
    if page.empty:
        return page
    for col in numeric_cols:
        page[col] = pd.to_numeric(page[col], errors = 'coerce')
    return page.dropna(subset = numeric_cols)


yearly_prices = [] # one aggregated frame per dataset, combined after the loop

for _, row in df_metastore.iterrows(): # iterating through meta data

    title_i = row['title']
    identifier_i = row['identifier']
    year_i = row['year']

    if int(year_i) < start_year: # selecting meta data from start year and onward
        continue

    data = fetch_page(identifier_i, 0) # the first page also reports the total row count
    data_n = int(data['count']) # finding the number of observations

    pages = []
    n_fetched = 0 # raw rows received, counted before cleaning
    for offset_i in range(0, data_n, page_size): # iterating through data in smaller parts
        if offset_i > 0:
            data = fetch_page(identifier_i, offset_i)
        n_fetched += len(data['results'])
        pages.append(clean_page(data['results']))

    # Compare the API count with the rows actually downloaded (not with the cleaned
    # rows, which are expected to be fewer once empty observations are removed).
    print(title_i)
    if n_fetched != data_n:
        print('The number of observation do not match')
    print('The number of observation in API:', data_n)
    print('The number of observation in dataset:', n_fetched)

    pages = [page for page in pages if not page.empty]
    if not pages: # nothing usable in this dataset
        print('No usable observations, dataset skipped')
        continue

    # Concatenate once instead of appending page by page
    # (DataFrame.append no longer exists in pandas 2.x).
    df = pd.concat(pages, ignore_index = True)
    print('The number of observation after cleaning:', len(df))

    # calculating price per unit; zero units would give an infinite price and
    # make the group mean infinite, so those rows are treated as missing.
    df['price_per_unit'] = (
        df['total_amount_reimbursed'] / df['units_reimbursed']
    ).replace([np.inf, -np.inf], np.nan)

    df = df.groupby(group_cols)['price_per_unit'].mean()

    yearly_prices.append(df.reset_index())

if yearly_prices:
    df_medicaid = pd.concat(yearly_prices, ignore_index = True)
else:
    df_medicaid = pd.DataFrame(columns = group_cols + ['price_per_unit'])

In [8]:
# Show `df` as left by the download loop; pandas truncates the rendering of long frames.
# NOTE(review): the saved output below lists raw rows with missing reimbursement values,
# which the loop drops before aggregating, so it appears to come from an earlier kernel
# state -- confirm after Restart & Run All.
df

Unnamed: 0,utilization_type,state,ndc,labeler_code,product_code,package_size,year,quarter,suppression_used,product_name,units_reimbursed,number_of_prescriptions,total_amount_reimbursed,medicaid_amount_reimbursed,non_medicaid_amount_reimbursed
0,MCOU,KS,00002431208,00002,4312,08,2021,1,true,REYVOW,,,,,
1,MCOU,KS,69557011130,69557,0111,30,2021,1,true,ZTlido,,,,,
2,FFSU,KY,00088250033,00088,2500,33,2021,1,true,APIDRA,,,,,
3,FFSU,KY,00173044200,00173,0442,00,2021,1,true,ZOFRAN,,,,,
4,FFSU,KY,00409713909,00409,7139,09,2021,1,true,WATER,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211314,MCOU,IN,23155010410,23155,0104,10,2021,1,false,METFORMIN,105098,1460,3930.95,3923.51,7.44
1211315,MCOU,IN,63481052910,63481,0529,10,2021,1,false,Cortispori,150,15,3348.2,3348.2,0
1211316,MCOU,IN,63323056497,63323,0564,97,2021,1,false,Enoxaparin,389.6,60,5505.92,5505.92,0
1211317,MCOU,IN,45802012260,45802,0122,60,2021,1,true,Loratadine,,,,,


In [4]:
# Write the combined prices to Excel.
df_medicaid.to_excel('output.xlsx', index = False)

# The download loop leaves `df` as a Series indexed by labeler_code, product_code and
# year. Writing it with index=False would drop those keys and leave only the price
# column, so the index is turned back into columns first. A DataFrame is written as is.
csv_frame = df.reset_index() if isinstance(df, pd.Series) else df
csv_frame.to_csv('output.csv', index = False)