# Accesing the clinicaltrails.gov data via REST API in python
## Resuources
All API urls neede ror requests:
https://classic.clinicaltrials.gov/api/gui/ref/api_urls
https://classic.clinicaltrials.gov/api/gui


In [1]:
# Imports
import requests
import json
import pandas as pd
import numpy as np
import plotly.express as px
from tqdm import tqdm
from datetime import datetime as dt
from datetime import timedelta as td

import requests
import xml.etree.ElementTree as ET
import duckdb
import os
import json
import sqlalchemy


import pytrials


import csv
import sqlite3



# Using pytrails repository to access the data

https://pypi.org/project/pytrials/ 

## Get the list of available fields
First let's see what are the available fields that we can search by: 
For that purpose we parse the XML file listed here: 

https://classic.clinicaltrials.gov/api/info/study_fields_list

In [2]:
all_study_fields_url ="https://classic.clinicaltrials.gov/api/info/study_fields_list"
response = requests.get(all_study_fields_url)
all_study_fields_xml  = response.text

# Parse the XML data
root = ET.fromstring(all_study_fields_xml)

# Find all Field elements within FieldList
field_elements = root.findall(".//FieldList/Field")

# Extract the Field Name attribute and store in a list
all_study_fields = [str(field.get("Name")) for field in field_elements]

# Print the list of field names
print(all_study_fields)
print(f"{len(all_study_fields)} fields in total")


['Acronym', 'AgreementOtherDetails', 'AgreementPISponsorEmployee', 'AgreementRestrictionType', 'AgreementRestrictiveAgreement', 'ArmGroupDescription', 'ArmGroupInterventionName', 'ArmGroupLabel', 'ArmGroupType', 'AvailIPDComment', 'AvailIPDId', 'AvailIPDType', 'AvailIPDURL', 'BaselineCategoryTitle', 'BaselineClassDenomCountGroupId', 'BaselineClassDenomCountValue', 'BaselineClassDenomUnits', 'BaselineClassTitle', 'BaselineDenomCountGroupId', 'BaselineDenomCountValue', 'BaselineDenomUnits', 'BaselineGroupDescription', 'BaselineGroupId', 'BaselineGroupTitle', 'BaselineMeasureCalculatePct', 'BaselineMeasureDenomCountGroupId', 'BaselineMeasureDenomCountValue', 'BaselineMeasureDenomUnits', 'BaselineMeasureDenomUnitsSelected', 'BaselineMeasureDescription', 'BaselineMeasureDispersionType', 'BaselineMeasureParamType', 'BaselineMeasurePopulationDescription', 'BaselineMeasureTitle', 'BaselineMeasureUnitOfMeasure', 'BaselineMeasurementComment', 'BaselineMeasurementGroupId', 'BaselineMeasurementLow

## Pytrials usage

In [6]:
from pytrials.client import ClinicalTrials
search_term = """Coronavirus+COVID"""
ct = ClinicalTrials()
# Get 50 full studies related to Coronavirus and COVID in json format.
ct.get_full_studies(search_expr=search_term, max_studies=100)

# Get the NCTId, Condition and Brief title fields from 500 studies related to Coronavirus and Covid, in csv format.
corona_fields = ct.get_study_fields(
    search_expr="Coronavirus+COVID",
    fields=all_study_fields[0:20], # The API limits the number of fields to 20
    max_studies=100, # API has a limit 100 records
    fmt="csv",
)

# Get the count of studies related to Coronavirus and COVID.
# ClinicalTrials limits API queries to 1000 records
# Count of studies may be useful to build loops when you want to retrieve more than 1000 records

ct.get_study_count(search_expr="Coronavirus+COVID")

# Read the csv data in Pandas
corona_df = pd.DataFrame.from_records(corona_fields[1:], columns=corona_fields[0])
print(corona_df)


   Rank       Acronym                              AgreementOtherDetails  \
0     1         COSiN                                                      
1     2                                                                    
2     3  PedCan-COVID                                                      
3     4        COLIDE                                                      
4     5    ILIAD-7-UK                                                      
..  ...           ...                                                ...   
95   96     COVIXTREM                                                      
96   97                                                                    
97   98                                                                    
98   99      COVID-19  Site may publish results from the Study, after...   
99  100                                                                    

   AgreementPISponsorEmployee AgreementRestrictionType  \
0                            

## Manual REST API query 
Example, semi-working REST API query

In [7]:


# Sample URL
#url = "https://api/v2/studies"
url = "https://ClinicalTrials.gov/api/query/study_fields?expr=heart+attack&fields=NCTId,Condition,BriefTitle"
response = requests.get(url)
text_data = response.text
print(text_data)

#json_data = json.loads(text_data) # <-- this does not work, because the data comes in as XML string not JSON string

<StudyFieldsResponse>
  <APIVrs>1.01.05</APIVrs>
  <DataVrs>2023:10:17 00:10:37.720</DataVrs>
  <Expression>heart attack</Expression>
  <NStudiesAvail>469667</NStudiesAvail>
  <NStudiesFound>10132</NStudiesFound>
  <MinRank>1</MinRank>
  <MaxRank>20</MaxRank>
  <NStudiesReturned>20</NStudiesReturned>
  <FieldList>
    <Field>NCTId</Field>
    <Field>Condition</Field>
    <Field>BriefTitle</Field>
  </FieldList>
  <StudyFieldsList>
    <StudyFields Rank="1">
      <FieldValues Field="NCTId">
        <FieldValue>NCT05654389</FieldValue>
      </FieldValues>
      <FieldValues Field="Condition">
        <FieldValue>Telemedicine</FieldValue>
      </FieldValues>
      <FieldValues Field="BriefTitle">
        <FieldValue>Effectiveness of Teleconsultation in Referring a Patient With Early Myocardial Infarction From Peripheral Hospital to Cardiac Centre in Hail, Saudi Arabia</FieldValue>
      </FieldValues>
    </StudyFields>
    <StudyFields Rank="2">
      <FieldValues Field="NCTId">
     

### Parsing the XML manually. Not recommended
This is a bit of a pain, but it works to some extent. A little bit more of exercise and we could get it to work
Still this will face same limitations as the pytrials approach as the API limits on columns and fields is the same.

In [8]:
# Full version
if response.status_code == 200:
    try:
        # Parse the XML content
        root = ET.fromstring(response.text)

        # Initialize a dictionary to store data for each field
        field_data = {}

        # Traverse the XML tree and extract data
        for element in root.iter():
            field_name = element.tag
            field_value = element.text

            # Initialize the field_data dictionary if the field doesn't exist
            if field_name not in field_data:
                field_data[field_name] = []

            # Append the field value to the corresponding field
            field_data[field_name].append(field_value)

        # Determine the maximum length of field values
        max_len = max(len(v) for v in field_data.values())

        # Pad field values with None to ensure they have the same length
        for k, v in field_data.items():
            while len(v) < max_len:
                v.append(None)

        # Create a Pandas DataFrame from the padded field_data
        df = pd.DataFrame(field_data)

        # Create a DuckDB database and insert data from the DataFrame
        conn = duckdb.connect(database='clinical_trials.db')
        df.to_sql('clinical_trials', conn, if_exists='replace', index=False)

        # Close the connection
        conn.close()

        print("Data inserted into the DuckDB database.")

    except ET.ParseError as e:
        print('Failed to parse XML:', e)
else:
    print(f'Failed to retrieve data. Status code: {response.status_code}')


  df.to_sql('clinical_trials', conn, if_exists='replace', index=False)


Data inserted into the DuckDB database.


In [9]:
df

Unnamed: 0,StudyFieldsResponse,APIVrs,DataVrs,Expression,NStudiesAvail,NStudiesFound,MinRank,MaxRank,NStudiesReturned,FieldList,Field,StudyFieldsList,StudyFields,FieldValues,FieldValue
0,\n,1.01.05,2023:10:17 00:10:37.720,heart attack,469667,10132,1,20,20,\n,NCTId,\n,\n,\n,NCT05654389
1,,,,,,,,,,,Condition,,\n,\n,Telemedicine
2,,,,,,,,,,,BriefTitle,,\n,\n,Effectiveness of Teleconsultation in Referring...
3,,,,,,,,,,,,,\n,\n,NCT01874691
4,,,,,,,,,,,,,\n,\n,Acute Myocardial Infarction
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,,,,,,,,,,,,,,\n,Myocardial Infarction
57,,,,,,,,,,,,,,\n,MiSaver® Stem Cell Treatment for Heart Attack ...
58,,,,,,,,,,,,,,\n,NCT01150825
59,,,,,,,,,,,,,,\n,Myocardial Infarction


# Alternative: Downlaod all records locally
While not the most efficient, due to clinicaltrials.gov API limitations (max 1000 records per query), this might be the best way to proceed
So.. first we need to download the full dataset from: 
    
    https://classic.clinicaltrials.gov/api/gui/ref/download_all

We chose the JSON download format:
    https://classic.clinicaltrials.gov/AllAPIJSON.zip

## Iterative scan per json into a dataframe
After we download over 10GB of data, we can parse it into a csv file/sql database or anything similar that we can then use in downstream calculations


In [60]:
# Setup

# Create an empty list to store the data records
data_records = []
max_studies = 1000000000
n_write = 10000
#selected_study_fields = ['NCTId', 'Condition', 'BriefTitle'] # <-- smaller set of fields for testing
selected_study_fields = all_study_fields


# Specify the top-level folder path containing subfolders with JSON files
folder_path = '../data/AllAPIJSON/'
selected_study_fields

['Acronym',
 'AgreementOtherDetails',
 'AgreementPISponsorEmployee',
 'AgreementRestrictionType',
 'AgreementRestrictiveAgreement',
 'ArmGroupDescription',
 'ArmGroupInterventionName',
 'ArmGroupLabel',
 'ArmGroupType',
 'AvailIPDComment',
 'AvailIPDId',
 'AvailIPDType',
 'AvailIPDURL',
 'BaselineCategoryTitle',
 'BaselineClassDenomCountGroupId',
 'BaselineClassDenomCountValue',
 'BaselineClassDenomUnits',
 'BaselineClassTitle',
 'BaselineDenomCountGroupId',
 'BaselineDenomCountValue',
 'BaselineDenomUnits',
 'BaselineGroupDescription',
 'BaselineGroupId',
 'BaselineGroupTitle',
 'BaselineMeasureCalculatePct',
 'BaselineMeasureDenomCountGroupId',
 'BaselineMeasureDenomCountValue',
 'BaselineMeasureDenomUnits',
 'BaselineMeasureDenomUnitsSelected',
 'BaselineMeasureDescription',
 'BaselineMeasureDispersionType',
 'BaselineMeasureParamType',
 'BaselineMeasurePopulationDescription',
 'BaselineMeasureTitle',
 'BaselineMeasureUnitOfMeasure',
 'BaselineMeasurementComment',
 'BaselineMeasurem

In [61]:
# some potetnially useful functions
def extract_last_keys(data, parent_key='', sep='_'):
    items = {}
    for key, value in data.items():
        new_key = f"{key}" if parent_key else key
        if isinstance(value, dict):
            items.update(extract_last_keys(value, new_key, sep=sep))
        else:
            items[new_key] = value
    return items

def convert_lists_to_strings(df):
    # Iterate over the DataFrame's columns
    for col in df.columns:
        # Check if the column contains lists
        if df[col].apply(lambda x: isinstance(x, list)).any():
            # Convert lists to strings
            df[col] = df[col].astype(str)
    return df

def extract_field(data, field_name):
    if isinstance(data, dict):
        if field_name in data:
            return data[field_name]
        else:
            for key, value in data.items():
                result = extract_field(value, field_name)
                if result is not None:
                    return result
    elif isinstance(data, list):
        for item in data:
            result = extract_field(item, field_name)
            if result is not None:
                return result
    return None





In [68]:
conn = sqlite3.connect('../data/alltrials.db')  

i = 0 # Counter for files processed
for subfolder_name in tqdm(os.listdir(folder_path)):
    subfolder_path = os.path.join(folder_path, subfolder_name)

    # Check if the item in the top folder is a directory (subfolder)
    if os.path.isdir(subfolder_path):    
        for filename in os.listdir(subfolder_path):
            if i > max_studies:
                break
            if filename.endswith('.json'):
                i+=1
                file_path = os.path.join(subfolder_path, filename)

                # Read JSON data
                with open(file_path, 'r') as json_file:
                    study_data = json.load(json_file)
                    row_data = [extract_field(study_data, field) for field in selected_study_fields]
                    data_records.append(row_data)
            if (i % n_write == 0): # save every n_write records and empty the list
                df = pd.DataFrame(data_records)
                df.columns = selected_study_fields
                df = convert_lists_to_strings(df)
                
                df.to_csv('../data/alltrials.csv', mode='a', header=True, sep="\t")
                df.to_sql("../data/alltrials", conn, if_exists='append', index=False)
                data_records = []    
# Final write of the remaining data after the loop
df = pd.DataFrame(data_records)
df = convert_lists_to_strings(df)
df.columns = selected_study_fields
df.to_csv('../data/alltrials.csv', mode='a', header=True, sep="\t")
df.to_sql("../data/alltrials", conn, if_exists='append', index=False)
data_records = []    
                


 21%|██        | 125/609 [2:19:25<3:37:34, 26.97s/it]   

In [62]:
max_studies

1000000000

In [53]:
# wonder if we can read it in into memory:P
df = pd.read_csv('../data/alltrials.csv', index_col=False, on_bad_lines='warn', sep="\t")

In [54]:
df

Unnamed: 0.1,Unnamed: 0,NCTId,Condition,BriefTitle
0,0.0,NCT00509938,"['Hematopoietic Stem Cell Transplantation', 'B...",Safety of a Single Dose of 5 mg of hLF1-11 Giv...
1,1.0,NCT00501553,['Heart Failure'],Vitamin D in Patients With Heart Failure
2,2.0,NCT00508131,"['Anemia', 'Iron Deficiency', 'Stunting']",Effectiveness of Iron-Fortified Milk on Iron S...
3,3.0,NCT00505570,"['Migraine', 'Migraine With Aura', 'Patent For...",PRIMA PFO Migraine Trial
4,4.0,NCT00500214,"['Postoperative Pain', 'Postoperative Hyposthe...",Ilioinguinal Nerve Excision in Open Mesh Repai...
...,...,...,...,...
101002,99995.0,NCT05903859,"['Infertility, Male']",Improvement of Reproductive Function in Men Wi...
101003,99996.0,NCT05902767,"['Stroke', 'Dementia']",Nut Supplementation to Mitigate Post-stroke Co...
101004,99997.0,NCT05907148,['Parkinson Disease'],Effects of Sensory Integration Training on Bal...
101005,99998.0,NCT05906706,['Bullous Pemphigoid'],Compassionate Use of Dupilumab for Adult Patie...


## Coverting the local data to a database

In [27]:
trials_query = """SELECT * FROM alltrials LIMIT 10"""
result = conn.execute(trials_query)
alltrials_df = pd.DataFrame.from_records(result)

print(alltrials_df)


             0                                                  1  \
0  NCT00509938  ['Hematopoietic Stem Cell Transplantation', 'B...   
1  NCT00501553                                  ['Heart Failure']   
2  NCT00508131          ['Anemia', 'Iron Deficiency', 'Stunting']   
3  NCT00505570  ['Migraine', 'Migraine With Aura', 'Patent For...   
4  NCT00500214  ['Postoperative Pain', 'Postoperative Hyposthe...   
5  NCT00503945                                    ['Sleep Apnea']   
6  NCT00507754                                ['Advanced Cancer']   
7  NCT00501722                     ['Ascites', 'Liver Cirrhosis']   
8  NCT00505154                         ['Dilated Cardiomyopathy']   
9  NCT00505531                                   ['Chronic Pain']   

                                                   2  
0  Safety of a Single Dose of 5 mg of hLF1-11 Giv...  
1           Vitamin D in Patients With Heart Failure  
2  Effectiveness of Iron-Fortified Milk on Iron S...  
3                   

In [28]:
alltrails_df = pd.read_csv("alltrials2.csv", sep="\t")

Unnamed: 0.1,Unnamed: 0,0,1,2
0,0.0,NCT00509938,"['Hematopoietic Stem Cell Transplantation', 'B...",Safety of a Single Dose of 5 mg of hLF1-11 Giv...
1,1.0,NCT00501553,['Heart Failure'],Vitamin D in Patients With Heart Failure
2,2.0,NCT00508131,"['Anemia', 'Iron Deficiency', 'Stunting']",Effectiveness of Iron-Fortified Milk on Iron S...
3,3.0,NCT00505570,"['Migraine', 'Migraine With Aura', 'Patent For...",PRIMA PFO Migraine Trial
4,4.0,NCT00500214,"['Postoperative Pain', 'Postoperative Hyposthe...",Ilioinguinal Nerve Excision in Open Mesh Repai...
...,...,...,...,...
464411,15318.0,NCT04569383,['Covid19'],Safety and Immunogenicity of the Candidate Vac...
464412,15319.0,NCT04563533,['Acute Gastroenteritis'],Clinical Trial of Quadrivalent Recombinant Nor...
464413,15320.0,NCT04567173,['Covid19'],Convalescent Plasma as Adjunctive Therapy for ...
464414,15321.0,NCT04568928,['Incomplete Spinal Cord Injury'],Powered Exoskeleton Combined With Functional E...
