In [1]:
from dotenv import load_dotenv
import os
import requests
from typing import *
import sys
import subprocess
import shlex
from datetime import datetime
import json
import uuid
import pandas as pd
from google.cloud import storage

# Add path to import custom modules
# sys.path.append(os.path.abspath("../src"))

load_dotenv()

True

In [21]:
TEMP_DIR = "./temp/"
RAW_DIR = os.path.join(TEMP_DIR,"raw")
PQ_DIR = os.path.join(TEMP_DIR,"pq")

temp_dirs = [RAW_DIR,PQ_DIR]

for dir in temp_dirs:
    os.makedirs(dir,exist_ok=True)

In [22]:
result = subprocess.Popen(f"rm -rvf {TEMP_DIR}",shell=True,text=True)
result

<Popen: returncode: None args: 'rm -rvf ./temp/'>

removed directory './temp/raw'
removed directory './temp/pq'
removed directory './temp/'


In [2]:
# logger_config.py
import logging

def configure_logging():
    # Configure the root logger
    root_logger = logging.getLogger()
    
    # Check if handlers already exist to avoid duplicates
    if root_logger.handlers:
        return
        
    # Create a stream handler for console output
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    
    # Create formatter and add to the handler
    formatter = logging.Formatter('[%(asctime)s] %(name)s:%(lineno)d - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    
    # Add the handler to the root logger
    root_logger.addHandler(console_handler)
    root_logger.setLevel(logging.INFO)

def get_module_logger(module_name):
    # Make sure logging is configured
    configure_logging()
    
    # Return a logger with the module name
    return logging.getLogger(module_name)

In [3]:
# BASE_URL = "https://api.fda.gov/drug/event.json"

# skip = 0 # upto 25000

# # Fetch data from the FDA API
# params = {
#     "api_key": os.getenv("OPENFDA_API_KEY"),
#     "limit": 1000,
#     "skip": skip
# }

# response = requests.get(BASE_URL, params=params)
# data = response.json()
# data

Flow:
- Download the .json.zip files to a temporary directory.
- Unzip them to a temporary directory.
- Process the JSON data into Parquet.
- Upload the Parquet files to GCS.
- Delete the temporary files after processing.
- Perform this for each batch

In [5]:
# Link to downloads of Drug Event data
res = requests.get("https://api.fda.gov/download.json")
data = res.json()
# sum([float(p.get('size_mb')) for p in data.get('results').get('drug').get('event').get('partitions')])
# data.get('results').get('drug').get('event').get('partitions')[0]
data

{'meta': {'disclaimer': 'Do not rely on openFDA to make decisions regarding medical care. While we make every effort to ensure that data is accurate, you should assume all results are unvalidated. We may limit or otherwise restrict your access to the API in line with our Terms of Service.',
  'terms': 'https://open.fda.gov/terms/',
  'license': 'https://open.fda.gov/license/',
  'last_updated': '2025-04-19'},
 'results': {'food': {'enforcement': {'export_date': '2025-04-16',
    'partitions': [{'display_name': '/food/enforcement data',
      'file': 'https://download.open.fda.gov/food/enforcement/food-enforcement-0001-of-0001.json.zip',
      'size_mb': '5.15',
      'records': 27192}],
    'total_records': 27192},
   'event': {'export_date': '2025-04-16',
    'partitions': [{'display_name': '/food/event data',
      'file': 'https://download.open.fda.gov/food/event/food-event-0001-of-0001.json.zip',
      'size_mb': '8.04',
      'records': 140565}],
    'total_records': 140565}},
  '

In [6]:
def partition_id_by_year_quarter(p):
    return "".join(p.get('display_name').split(" ")[:2])

def partition_id_by_year(p):
    return p.get('display_name').split(" ")[0]

def no_of_parts_in_partition(p):
    return int(p.get('display_name').replace("(", "").replace(")","").split(" ")[-1])

def part_size_mb(p):
    return float(p.get('size_mb'))

def get_total_size(json):
    return round(sum([p.get('size_mb') for p in json.get('partitions')]),2)

def read_json_file(json_path):
    with open(json_path, "r") as f:
        d = json.load(f)
        return d

"""
Function to restructure JSON object to handle batch processing better
"""
def extract_drug_events(data):
    events = data.get('results').get('drug').get('event')
    total_records = events.get('total_records')
    partitions = events.get('partitions')

    # Generate unique partition_id and count set
    partition_ids = {}
    for p in partitions:
        id = partition_id_by_year(p)
        partition_ids[id] = partition_ids.get(id,0) + 1
    
    # Groups partition by partitionid
    results = []
    for item in partition_ids.items():
        id, count = item
        file_list = []
        counter = 0
        tot_size = 0

        for p in partitions:
            if counter == count:
                break
            if partition_id_by_year(p) == id:
                counter+=1
                file_list.append(p.get('file'))
                tot_size+=part_size_mb(p)
                
        results.append(
            {
                "partition_id": id,
                "count": count,
                "size_mb" : round(tot_size,2),
                "files" : file_list
            }
        )
    
    return {
        "total_records" : total_records,
        "partitions" : results
    }

"""
Function to seggregate partitions as batches based on disksize threshold
"""
def create_batch(partitions, max_batch_size_mb=10000):
    batch = []                  # partitions per batch
    batch_partitions = []       # Partitions under the threshold
    big_batch_partitions = []   # Different approach to process bigger partitions
    sum_size = 0                # Size counter

    for p in partitions:
        size = p.get('size_mb', 0)

        if size > max_batch_size_mb:
            # TODO:
            # Handle oversized partititions
            big_batch_partitions.append(p)
            continue
        
        if sum_size + size > max_batch_size_mb:
            # TODO:
            # - Declare batch_partitions as batch #
            # - Reset sum_size
            # - Reset batch_partitions
            batch.append(batch_partitions.copy())
            batch_partitions.clear()
            sum_size = 0
            continue
        
        batch_partitions.append(p)
        sum_size += size

    # Flush batch_partitions to schedule as last batch
    if len(batch_partitions) != 0:
        batch.append(batch_partitions.copy())
        batch_partitions.clear()
    
    return batch, big_batch_partitions

In [8]:
# Original Count
len([p for p in data.get('results').get('drug').get('event').get('partitions') if p.get('display_name')[:4] == '2023'])

121

In [7]:
# Restrcutured JSON
downloads_json = extract_drug_events(data)
downloads_json

{'total_records': 18700795,
 'partitions': [{'partition_id': '2004',
   'count': 20,
   'size_mb': 1034.67,
   'files': ['https://download.open.fda.gov/drug/event/2004q3/drug-event-0001-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q3/drug-event-0002-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q3/drug-event-0003-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q3/drug-event-0004-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q3/drug-event-0005-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q2/drug-event-0001-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q2/drug-event-0002-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q2/drug-event-0003-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q2/drug-event-0004-of-0005.json.zip',
    'https://download.open.fda.gov/drug/event/2004q2/drug-event-0005-of-0005.json.zip',
   

In [10]:
# Create batches
partitions = downloads_json.get('partitions')
batch, _ = create_batch(partitions, max_batch_size_mb=13000)
print(len(batch))
print(len(_))

6
0


In [11]:
logger = get_module_logger(__name__)

class ADE:
    # Patient information
    patient_header = [
        "patientid",
        "patientagegroup",
        "patientonsetage",
        "patientonsetageunit",
        "patientsex",
        "patientweight",
        "serious",
        "seriousnessdeath",
        "seriousnesshospitalization",
        "seriousnessdisabling",
        "seriousnesslifethreatening",
        "seriousnessother",
        "receivedate",
        "receiptdate",
        "safetyreportid"
    ]

    # Drug information
    drug_header = [
        "patientid",
        "medicinalproduct",
        "activesubstancename",
        "drugindication",
        "drugadministrationroute",
        "drugstartdate",
        "drugenddate",
        "drugdosagetext",
        "drugstructuredosagenumb",
        "drugstructuredosageunit",
        "drugtreatmentduration",
        "drugtreatmentdurationunit",
        "drugrecurreadministration",
    ]

    # Reaction information
    reaction_header = [
        "patientid",
        "reactionmeddrapt",
        "reactionoutcome",
    ]

    def __init__(self):
        self.patients_list = []
        self.drugs_list = []
        self.reactions_list = []
    
    def extractJSON(self, data):
        for item in data:
            patientid = str(uuid.uuid4())
            patient = item.get("patient",{})

            self.patients_list.append((
                patientid,
                patient.get("patientagegroup"),
                patient.get("patientonsetage"),
                patient.get("patientonsetageunit"),
                patient.get("patientsex"),
                patient.get("patientweight"),
                patient.get("serious"),
                patient.get("seriousnessdeath"),
                patient.get("seriousnesshospitalization"),
                patient.get("seriousnessdisabling"),
                patient.get("seriousnesslifethreatening"),
                patient.get("seriousnessother"),
                patient.get("receivedate"),
                patient.get("receiptdate"),
                patient.get("safetyreportid"),
            ))

            drugs = patient.get('drug',[])
            for drug in drugs:
                self.drugs_list.append((
                    patientid,
                    drug.get("medicinalproduct"),
                    drug.get("activesubstance",{}).get("activesubstancename"),
                    drug.get("drugindication"),    
                    drug.get("drugadministrationroute"),    
                    drug.get("drugstartdate"),
                    drug.get("drugenddate"),
                    drug.get("drugdosagetext"),
                    drug.get("drugstructuredosagenumb"),
                    drug.get("drugstructuredosageunit"),
                    drug.get("drugtreatmentduration"),
                    drug.get("drugtreatmentdurationunit"),
                    drug.get("drugrecurreadministration"),
                ))

            reactions = patient.get("reaction",[])
            for reaction in reactions:
                self.reactions_list.append((
                    patientid,
                    reaction.get("reactionmeddrapt"),
                    reaction.get("reactionoutcome"),
                ))

    def _to_dataframe(self):
        df_patients = pd.DataFrame(self.patients_list, columns=self.patient_header)
        df_drugs = pd.DataFrame(self.drugs_list, columns=self.drug_header)
        df_reactions = pd.DataFrame(self.reactions_list, columns=self.reaction_header)

        return df_patients, df_drugs, df_reactions

    def save_as_parquet(self, fname, dir):
        df_patients, df_drugs, df_reactions = self._to_dataframe()
        df = [df_patients, df_drugs, df_reactions]

        dirs = []

        for p in ["patient", "drug", "reaction"]:
            # ./pq/patient/yr/
            path = os.path.join("pq", p, dir)
            dirs.append(path)
            os.makedirs(path, exist_ok=True)
        
        for d,p in zip(df, dirs):
            saved_path = os.path.join(p,f"{fname}.parquet")
            d.to_parquet(saved_path)
            logger.info(f"Parquet File saved to: {saved_path}")

In [12]:
# def download_file(url, download_path="tmp", filename="temp.json"):
#     os.makedirs(download_path,exist_ok=True)
#     subprocess.run(
#         f'wget -q -O - {shlex.quote(url)} | gunzip > {shlex.quote(os.path.join(download_path,filename))}',
#         shell=True,
#     )

# download_file(
#     url=downloads_json['partitions'][1].get('files')[0],
# )

In [13]:
# b = batch[1]
# p = b[0]
# url = p.get('files')[0]
# url

In [14]:
# ade = ADE()
# data = read_json_file("./tmp/temp.json")['results']
# ade.extractJSON(data)
# ade.save_as_parquet(url, return_dirs=True)
# # dirs = ade.save_as_parquet(url, return_dirs=True)
# # dirs

In [15]:
# # local_base_path = "tmp" # relative to this path, that's why rel_path displays "../" because it needs to go one level up to go into pq
# local_base_path = "pq"
# gcs_base_path = "data/pq"
# for r,d,f in os.walk(dirs[0]):
#     # print(r)
#     # print(f)
#     for F in f:
#         local_file_path = os.path.join(r,F)
#         print(local_file_path)

#         rel_path = os.path.relpath(local_file_path, local_base_path) # this path is relative to "pq" so it only displays the directory relative to it
#         gcs_path = os.path.join(gcs_base_path, rel_path).replace("\\","/")
#         # print(rel_path)
#         print(gcs_path)
        

In [16]:
from google.cloud import storage

def upload_to_gcs(local_base_dir, bucket_name, gcs_prefix):
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    for root, _, files in os.walk(local_base_dir):
        for file in files:
            if file.endswith(".parquet"):
                local_file_path = os.path.join(root, file)
                
                # Relative path from the local_base_dir
                relative_path = os.path.relpath(local_file_path, local_base_dir)
                gcs_blob_path = os.path.join(gcs_prefix, relative_path).replace("\\", "/")
                blob = bucket.blob(gcs_blob_path)
                blob.upload_from_filename(local_file_path)
                
                logger.info(f"Uploaded {local_file_path} to gs://{bucket_name}/{gcs_blob_path}")


In [17]:
def process_batch(batch):

    # Directories of temporary files and folder
    tmp_dirs = ["./raw", "./pq"]

    # Create temporary directories
    if not os.path.exists(tmp_dirs[0]):
        logger.info("Directory 'raw' missing. Created 'raw'")
        os.makedirs(tmp_dirs[0],exist_ok=True)

    # Insert batch iteration here
    for i, b in enumerate(batch):
        logger.info('===================================================================')
        logger.info(f'============================= BATCH {i+1} =============================')
        logger.info('===================================================================')

        # Insert partition iteration here
        for j,p in enumerate(b):
            logger.info(f'----------------- Processing partition {j+1} -----------------')  

            files = p.get('files')
            total_count = p.get('count')
            file_count = 1
            for f in files:
                dl_filename = f"drug-event-part-{file_count}-of-{total_count}"
                try:
                    # Download and unzip
                    logger.info(f"Download started: {f}")

                    dl_filepath = os.path.join(tmp_dirs[0],f"{dl_filename}.json")  
                    result = subprocess.run(
                        f'wget -q -O - {shlex.quote(f)} | gunzip > {shlex.quote(dl_filepath)}',
                        shell=True,
                        check=True,
                        capture_output=True,
                        text=True
                    )

                    # Saved to tmp folder
                    logger.info(f"Filed saved to: {dl_filepath}")

                    # Load JSON and map to class ADE
                    ade = ADE()
                    temp_json = read_json_file(dl_filepath)['results']
                    ade.extractJSON(temp_json)
                    logger.info(f"Parsed json file to ADE object: {dl_filepath}")


                    # Save ADE object as parquet file
                    ade.save_as_parquet(fname=dl_filename, dir=p.get('partition_id'))

                    # Upload parquet file to GCS bucket
                    upload_to_gcs(local_base_dir="pq",bucket_name='ade-pipeline',gcs_prefix="data/pq")

                    # Increment part number
                    file_count+=1

                except subprocess.CalledProcessError as e:
                    logger.error(f"(return {e.returncode}) Failed to download or unzip: {f}")
                    logger.error(f"{e.stderr.strip()}")
                except Exception as e:
                    logger.error(f"Unexpected error occured: {e}")

            # Purge tmp folder to prepare for next partition
            for dir_path in tmp_dirs:
                wildcard_path = os.path.join(dir_path, "*")
                quoted_path = shlex.quote(dir_path)
                subprocess.run(f"rm -rf {quoted_path}", shell=True, check=True)
            logging.info("Purged tmp directories")

        logger.info('===================================================================')
        logger.info('============================= Batch {i} END =========================')
        logger.info('===================================================================')
    
    # Summary of the batch script
    # Total batch processed
    # Size of each batch processed
    # Etc
    logger.info("Batch Processing Completed!")

In [18]:
# process_batch(batch[2:4])

In [19]:
# tmp_dirs = ["./raw", "./pq"]

# # Purge tmp folder to prepare for next partition
# for dir_path in tmp_dirs:
#     wildcard_path = os.path.join(dir_path, "*")
#     quoted_path = shlex.quote(dir_path)
#     subprocess.run(f"rm -rf {quoted_path}", shell=True, check=True)

In [44]:
temp_path = "temp"
os.makedirs(temp_path,exist_ok=True)
for i in range(5):
    fname=f"test-{i+1}.txt"
    fpath=os.path.join(temp_path,fname)
    subprocess.run(f"touch {fpath}",shell=True)

In [45]:
del_path = os.path.join(temp_path,"*")
result = subprocess.Popen(f"rm -rfv {del_path}",shell=True, stdout=subprocess.PIPE, text=True)

# Purging temp directory
for o in result.stdout:
    logger.info(o.strip())

[2025-04-18 16:59:15,906] __main__:6 - INFO - removed 'temp/test-1.txt'
[2025-04-18 16:59:15,908] __main__:6 - INFO - removed 'temp/test-2.txt'
[2025-04-18 16:59:15,908] __main__:6 - INFO - removed 'temp/test-3.txt'
[2025-04-18 16:59:15,909] __main__:6 - INFO - removed 'temp/test-4.txt'
[2025-04-18 16:59:15,910] __main__:6 - INFO - removed 'temp/test-5.txt'
