In [336]:
from dotenv import load_dotenv
import requests
import os
import uuid
import csv
from datetime import datetime
import pandas as pd

# Load variables from a local .env file into the process environment;
# OPENFDA_API_KEY is read from there later via os.getenv.
load_dotenv()

True

In [85]:
# openFDA drug adverse-event endpoint queried by the fetch cell
BASE_URL = "https://api.fda.gov/drug/event.json"

In [329]:
# Column names for the patient table (one row per fetched report).
# NOTE: the order must match, position by position, the tuples appended to
# patient_list in the mapping cell.
patient_header = [
    # Generated UUID that links drug and reaction rows back to this row
    "patientid",
    # Patient demographics
    "patientagegroup",
    "patientonsetage",
    "patientonsetageunit",
    "patientsex",
    "patientweight",

    # Seriousness and outcomes
    "serious",
    "seriousnessdeath",
    "seriousnesshospitalization",
    "seriousnessdisabling",
    "seriousnesslifethreatening",
    "seriousnessother",

    # Dates for timeline analysis
    "receivedate",
    "receiptdate",
    
    # Metadata
    "safetyreportid"
]

# Column names for the drug table (one row per drug entry of a report).
# NOTE: the order must match the tuples appended to drugs_list.
drug_header = [
    "patientid",  # same generated UUID as the owning patient row
    "medicinalproduct",
    "activesubstancename",  # taken from the nested "activesubstance" object
    "drugindication",
    "drugadministrationroute",
    "drugstartdate",
    "drugenddate",
    "drugdosagetext",
    "drugstructuredosagenumb",
    "drugstructuredosageunit",
    "drugtreatmentduration",
    "drugtreatmentdurationunit",
    "drugrecurreadministration",
]

# Column names for the reaction table (one row per reaction entry of a report).
# NOTE: the order must match the tuples appended to reactions_list.
reaction_header = [
    "patientid",  # same generated UUID as the owning patient row
    "reactionmeddrapt",
    "reactionoutcome",
]


- **TODO:** Handle pagination and scheduled batch processing for the fetch step below.
- **TODO:** Decide which parts of this script go into the DAG.

In [330]:
# Fetch one page of adverse-event reports from the openFDA API
params = {
    # requests leaves a parameter out of the query string when its value is
    # None, so a missing OPENFDA_API_KEY results in an unauthenticated call.
    "api_key": os.getenv("OPENFDA_API_KEY"),
    # Number of reports requested in this call
    "limit": 20,
}

# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(BASE_URL, params=params, timeout=30)
# Surface HTTP errors (invalid key, rate limiting, ...) as an HTTPError
# instead of a confusing KeyError on "results" below
response.raise_for_status()
data = response.json()["results"]
len(data)

20

In [None]:
# Flatten each report into patient / drug / reaction rows.
# Tuple order must stay aligned with patient_header, drug_header and
# reaction_header respectively.
patient_list = []
drugs_list = []
reactions_list = []

# data = batch data (list of report dicts produced by the fetch cell)
for item in data:
    # Generated key that ties the drug and reaction rows to their patient row
    patientid = str(uuid.uuid4())
    # `or {}` also covers a key that is present but null
    patient = item.get("patient") or {}

    patient_list.append((
        patientid,
        # Demographics are nested inside the report's "patient" object
        patient.get("patientagegroup"),
        patient.get("patientonsetage"),
        patient.get("patientonsetageunit"),
        patient.get("patientsex"),
        patient.get("patientweight"),
        # Seriousness flags, dates and the report id are top-level fields of
        # the report in the openFDA schema, not members of "patient"; reading
        # them from `patient` always produced None.
        item.get("serious"),
        item.get("seriousnessdeath"),
        item.get("seriousnesshospitalization"),
        item.get("seriousnessdisabling"),
        item.get("seriousnesslifethreatening"),
        item.get("seriousnessother"),
        item.get("receivedate"),
        item.get("receiptdate"),
        item.get("safetyreportid"),
    ))

    drugs = patient.get("drug") or []
    for drug in drugs:
        # "activesubstance" is a nested object and may be absent or null
        active_substance = drug.get("activesubstance") or {}
        drugs_list.append((
            patientid,
            drug.get("medicinalproduct"),
            active_substance.get("activesubstancename"),
            drug.get("drugindication"),
            drug.get("drugadministrationroute"),
            drug.get("drugstartdate"),
            drug.get("drugenddate"),
            drug.get("drugdosagetext"),
            drug.get("drugstructuredosagenumb"),
            drug.get("drugstructuredosageunit"),
            drug.get("drugtreatmentduration"),
            drug.get("drugtreatmentdurationunit"),
            drug.get("drugrecurreadministration"),
        ))

    reactions = patient.get("reaction") or []
    for reaction in reactions:
        reactions_list.append((
            patientid,
            reaction.get("reactionmeddrapt"),
            reaction.get("reactionoutcome"),
        ))

In [353]:
# Purge the data folder before writing a fresh batch.
# WARNING: destructive — this shells out and recursively deletes everything
# matching ../data/* (relative to the notebook's working directory).
!rm -r ../data/*

In [349]:
def save_batch_as_csv(path, filename, data, header):
    """Write one batch of rows to ``<path>/<filename>.csv``.

    An existing file with the same name is overwritten.

    Args:
        path: Directory to write into; created (including parents) if missing.
        filename: File name without the ``.csv`` extension.
        data: Iterable of row sequences, each aligned with ``header``.
        header: Sequence of column names, written as the first row.
    """
    # exist_ok=True already makes this a no-op for an existing directory,
    # so no separate existence check is needed.
    os.makedirs(path, exist_ok=True)

    # newline="" lets the csv module control line endings itself (otherwise
    # Windows gets blank rows and embedded newlines can be mangled); an
    # explicit utf-8 encoding avoids depending on the platform default.
    target = os.path.join(path, f"{filename}.csv")
    with open(target, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(data)

In [350]:
def save_batch_as_parquet(path, filename, data, header):
    """Write one batch of rows to ``<path>/<filename>.parquet``.

    Args:
        path: Directory to write into; created if it does not exist.
        filename: File name without the ``.parquet`` extension.
        data: Row records passed to ``pd.DataFrame``.
        header: Column names for the resulting frame.
    """
    # Build the frame first so a header/row mismatch fails before any
    # directory is created.
    frame = pd.DataFrame(data, columns=header)
    target = os.path.join(path, f"{filename}.parquet")

    if os.path.exists(path):
        frame.to_parquet(target)
        return

    os.makedirs(path, exist_ok=True)
    frame.to_parquet(target)

In [None]:
# One timestamp shared by every file written for this batch
current_time = datetime.now().strftime("%m-%d-%Y_%H%M%S")

patient_filename = f"patient_{current_time}"
drug_filename = f"drug_{current_time}"
reaction_filename = f"reaction_{current_time}"

# (sub-folder, file name, rows, header) for each table in the batch
batch_tables = [
    ("patients", patient_filename, patient_list, patient_header),
    ("drugs", drug_filename, drugs_list, drug_header),
    ("reactions", reaction_filename, reactions_list, reaction_header),
]

# CSV copies go under ../data/<table> ...
for table_dir, table_filename, table_rows, table_header in batch_tables:
    save_batch_as_csv(f"../data/{table_dir}", table_filename, table_rows, table_header)

# ... and Parquet copies under ../data/pq/<table>
for table_dir, table_filename, table_rows, table_header in batch_tables:
    save_batch_as_parquet(f"../data/pq/{table_dir}", table_filename, table_rows, table_header)