<a href="https://colab.research.google.com/github/Noob1701/Clin_Trials_Clustering/blob/main/Data_Collect2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import warnings
import pandas as pd
import requests
import csv


In [10]:
# Initial URL for the first API call
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Initialize an empty list to store the data
data_list = []
params = {
}

In [11]:
def write_to_csv(data_list, output_file):
    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        headers = list(data_list[0].keys())
        writer = csv.DictWriter(f, fieldnames=headers)
        # Write headers only if file is empty
        if f.tell() == 0:
            writer.writeheader()

        for data in data_list:
            writer.writerow(data)

In [12]:
# Loop until there is no nextPageToken
while True:
    # Print the current URL (for debugging purposes)
    # print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

    # Send a GET request to the API
    response = requests.get(base_url, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()  # Parse JSON response
        studies = data.get('studies', [])  # Extract the list of studies

        # Loop through each study and extract specific information
        for study in studies:
            # Safely access nested keys
                responsibleParty = study['protocolSection']['sponsorCollaboratorsModule'].get('responsibleParty', {}).get('type', 'Unknown')
                orgFullName = study['protocolSection']['identificationModule'].get('organization', {}).get('fullName', 'Unknown')
                orgClass = study['protocolSection']['identificationModule'].get('organization',{}).get('class', 'Unknown')
                overallStatus = study['protocolSection']['statusModule'].get('overallStatus', 'Unknown')

                startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown')
                if 'conditionsModule' in study['protocolSection']:
                  conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['Unknown']))
                else:
                  conditions = 'No conditions listed'
                briefTitle = study['protocolSection']['identificationModule'].get('briefTitle', 'Unknown')
                fullTitle = study['protocolSection']['identificationModule'].get('officialTitle', 'Unknown')
            # Extract interventions safely
                interventions_list = study['protocolSection'].get('armsInterventionsModule', {}).get('interventions', [])
                interventions = ', '.join([intervention.get('name', 'Unknown') for intervention in interventions_list]) if interventions_list else "Unknown"
                interventionDesc_list = study['protocolSection'].get('armsInterventionsModule', {}).get('interventions',[])
                interventionDesc = ', '.join([interventionDesc.get('description', 'Unknown') for interventionDesc in interventionDesc_list]) if interventionDesc_list else "Unknown"

            # Extract dates and phases
                if 'designModule' in study['protocolSection']:
                  studyType = study['protocolSection']['designModule'].get('studyType', 'Unknown')
                else:
                  studyType = 'Unknown'
                if 'designModule' in study['protocolSection']:
                  phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Unknown']))
                else:
                  phases = 'No phases listed'
                outcome_list = study.get('protocolSection', {}).get('outcomesModule', {}).get('primaryOutcomes', [])
                if outcome_list:
                  outcomes = ' '.join([outcome.get('measure', 'Unknown') for outcome in outcome_list])
                else:
                  outcomes = "Unknown"
                standardAge_list = study['protocolSection'].get('eligibilityModule', {}).get('stdAges',[])
                standardAge = ' '.join(standardAge_list)
                if 'designModule' in study['protocolSection']:
                  if 'designInfo' in study['protocolSection']['designModule']:
                    primaryPurpose = study['protocolSection']['designModule']['designInfo'].get('primaryPurpose', 'Unknown')
                  else:
                    primaryPurpose = 'Unknown'
                else:
                    primaryPurpose = 'Unknown'
                completion_date_list = startDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown')
            # Append the data to the list as a dictionary
                data_list.append({
                    "Organization Full Name": orgFullName,
                    "Organization Class": orgClass,
                    "Responsible Party": responsibleParty,
                    "Brief Title": briefTitle,
                    "Full Title": fullTitle,
                    "Overall Status": overallStatus,
                    "Start Date": startDate,
                    "Standard Age": standardAge,
                    "Conditions": conditions,
                    "Primary Purpose": primaryPurpose,
                    "Interventions": interventions,
                    "Intervention Description": interventionDesc,
                    "Study Type": studyType,
                    "Phases": phases,
                    "Outcome Measure": outcomes,
                    "Completion Date": completion_date_list
                })

        # Check for nextPageToken and update the params or break the loop
        nextPageToken = data.get('nextPageToken')
        if nextPageToken:
            params['pageToken'] = nextPageToken  # Set the pageToken for the next request
        else:
            break  # Exit the loop if no nextPageToken is present
        if len(data_list) >= 1000:
            write_to_csv(data_list, 'clin_trials.csv')
            data_list = []  # Clear the data list after writing
    else:
        print("Failed to fetch data. Status code:", response.status_code)
        break


In [13]:
clin_trials = pd.read_csv('clin_trials.csv')

In [14]:
clin_trials.head(1)

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Completion Date
0,"University Hospital, Angers",OTHER_GOV,SPONSOR,Reproducibility of Ankle Brachial Index After ...,Reproducibility of the Ankle-Brachial Index Me...,COMPLETED,2014-04,ADULT,Normal Subjects,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Test-retest difference in ankle to brachial pr...,2014-04


In [15]:
clin_trials = clin_trials.sort_values(by= 'Completion Date', ascending=False)

In [16]:
clin_trials.head(20)

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Completion Date
70265,Thomas Jefferson University,OTHER,SPONSOR,Comparison of Tissue Retractors During Cesarea...,Randomized Controlled Trial of the Efficacy of...,UNKNOWN,Unknown,CHILD ADULT OLDER_ADULT,"Cesarean Section, Obesity",TREATMENT,"Mobius™ retractor, traditional metal retractio...","Unknown, Unknown",INTERVENTIONAL,,operative time,Unknown
40472,GlaxoSmithKline,INDUSTRY,SPONSOR,Topical GW842470X In Adults Patients With Mode...,"Randomised, Double-blind, Placebo-controlled S...",COMPLETED,Unknown,ADULT OLDER_ADULT,"Atopic Dermatitis, Dermatitis, Atopic",TREATMENT,GW842470X cream,Unknown,INTERVENTIONAL,PHASE2,Clinical efficacy of 3% GW842470X cream applie...,Unknown
362942,Weill Medical College of Cornell University,OTHER,SPONSOR,Intra Ocular Pressure During Robotic Prostatec...,Intra Ocular Pressure During Robotic Prostatec...,COMPLETED,Unknown,ADULT OLDER_ADULT,"Visual Acuity, Visual Disturbance, Intraocular...",Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Unknown,Unknown
155975,National Institutes of Health Clinical Center ...,NIH,Unknown,Mechanisms of Lung Allograft Rejection,Mechanisms of Lung Allograft Rejection,COMPLETED,Unknown,ADULT OLDER_ADULT,"Lung Transplant, Chronic Allograft Rejection",Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Unknown,Unknown
416175,University of Chicago,OTHER,SPONSOR,Evaluation of Lovastatin in Severe Persistent ...,Evaluation of Lovastatin in Severe Persistent ...,WITHDRAWN,Unknown,ADULT OLDER_ADULT,Severe Persistent Asthma,TREATMENT,"Lovastatin, Placebo",Lovastatin 20 mg extended release (or placebo)...,INTERVENTIONAL,"PHASE1, PHASE2",Changes in airway smooth muscle biology and in...,Unknown
201340,[Redacted],Unknown,Unknown,[Trial of device that is not approved or clear...,[Trial of device that is not approved or clear...,WITHHELD,Unknown,,No conditions listed,Unknown,Unknown,Unknown,Unknown,No phases listed,Unknown,Unknown
115237,National Institute of Allergy and Infectious D...,NIH,SPONSOR,A Phase I Multicenter Clinical Trial to Evalua...,A Phase I Multicenter Clinical Trial to Evalua...,COMPLETED,Unknown,ADULT,HIV Infections,PREVENTION,gp160 Vaccine (Immuno-AG),Unknown,INTERVENTIONAL,PHASE1,Unknown,Unknown
424446,National Institutes of Health Clinical Center ...,NIH,Unknown,Brain Encoding for Memory,Role of the Prefrontal Cortex in Successful Me...,COMPLETED,Unknown,CHILD ADULT OLDER_ADULT,Healthy,Unknown,Magstim Rapid Magnetic Stimulator,Unknown,OBSERVATIONAL,Unknown,Unknown,Unknown
201347,[Redacted],Unknown,Unknown,[Trial of device that is not approved or clear...,[Trial of device that is not approved or clear...,WITHHELD,Unknown,,No conditions listed,Unknown,Unknown,Unknown,Unknown,No phases listed,Unknown,Unknown
201350,Mayo Clinic,OTHER,PRINCIPAL_INVESTIGATOR,Clinical Use of a Magnetic Anal Sphincter Augm...,Clinical Use of a Magnetic Anal Sphincter Augm...,NO_LONGER_AVAILABLE,Unknown,ADULT OLDER_ADULT,Fecal Incontinence,Unknown,magnetic anal sphincter augmentation for fecal...,Surgical placement of HUD called FENIX for fem...,EXPANDED_ACCESS,Unknown,Unknown,Unknown


In [17]:
clin_trials_wd = clin_trials[clin_trials['Completion Date'] != 'Unknown']

In [18]:
clin_trials_wd.head(20)

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Completion Date
71453,University of Southern California,OTHER,PRINCIPAL_INVESTIGATOR,PPI Versus Placebo in Severe Functional Heartburn,PPI Versus Placebo in Severe Functional Heartburn,WITHDRAWN,2100-12-01,ADULT OLDER_ADULT,GERD,TREATMENT,PPI,Unknown,INTERVENTIONAL,,Difference in GERD controll PPI vs Placebo bas...,2100-12-01
281717,CorEvitas,NETWORK,SPONSOR,Corrona Atopic Dermatitis Registry: A Study of...,Corrona Atopic Dermatitis Registry: A Study of...,ENROLLING_BY_INVITATION,2100-12,ADULT OLDER_ADULT,Atopic Dermatitis,Unknown,Observational Non-Interventional Registry,Observational Non-Interventional Registry,OBSERVATIONAL,Unknown,"AD epidemiology, presentation, natural history...",2100-12
50480,Sanford Health,OTHER,SPONSOR,Rare Disease Patient Registry & Natural Histor...,Coordination of Rare Diseases at Sanford,RECRUITING,2100-12,CHILD ADULT OLDER_ADULT,"Rare Disorders, Undiagnosed Disorders, Disorde...",Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,To accelerate research into rare disorders by ...,2100-12
285370,CorEvitas,NETWORK,SPONSOR,Rheumatology Biorepository,Rheumatology Biorepository,ENROLLING_BY_INVITATION,2100-12,ADULT OLDER_ADULT,Rheumatoid Arthritis,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,1. Biospecimen (blood) Collection 2. Laborator...,2100-12
413110,CorEvitas,NETWORK,SPONSOR,Corrona Inflammatory Bowel Disease (IBD) Registry,Corrona Inflammatory Bowel Disease (IBD) Registry,ENROLLING_BY_INVITATION,2100-12,ADULT OLDER_ADULT,Inflammatory Bowel Diseases,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,"IBD epidemiology, presentation, natural histor...",2100-12
51114,CorEvitas,NETWORK,SPONSOR,The Corrona Psoriatic Arthritis and Spondyloar...,Corrona Psoriatic Arthritis and Spondyloarthri...,ENROLLING_BY_INVITATION,2100-12,ADULT OLDER_ADULT,"Psoriatic Arthritis, Spondyloarthritis",Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,The major clinical outcomes include an assessm...,2100-12
401909,CorEvitas,NETWORK,SPONSOR,The Corrona Psoriasis (PSO) Registry,Corrona Psoriasis (PSO) Registry,ENROLLING_BY_INVITATION,2100-12,ADULT OLDER_ADULT,Psoriasis,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Number of patients with adverse events (AEs) o...,2100-12
164394,CorEvitas,NETWORK,SPONSOR,CorEvitas Japan Rheumatoid Arthritis (RA) Regi...,CorEvitas Japan Rheumatoid Arthritis (RA) Regi...,ENROLLING_BY_INVITATION,2100-12,ADULT OLDER_ADULT,Rheumatoid Arthritis,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Incidence rate of adverse events (AEs) will be...,2100-12
121912,"Maccabi Healthcare Services, Israel",OTHER,SPONSOR,Registry of Cardiovascular Disease Patients,Computerized Registry of Cardiovascular Diseas...,ENROLLING_BY_INVITATION,2100-12,CHILD ADULT OLDER_ADULT,"Ischemic Heart Disease, Congestive Heart Failu...",Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Number of Patients with Cardiovascular disease,2100-12
447247,CorEvitas,NETWORK,SPONSOR,Corevitas Multiple Sclerosis (MS) Registry,CorEvitas Multiple Sclerosis (MS) Registry,ENROLLING_BY_INVITATION,2100-12,ADULT OLDER_ADULT,Multiple Sclerosis,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Number of patients with adverse events (AEs) o...,2100-12


In [19]:
clin_trials_wd = clin_trials_wd[clin_trials_wd['Completion Date'] < '2024-07-05']

In [20]:
clin_trials_wd.head(20)

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Completion Date
390439,Novartis,INDUSTRY,SPONSOR,Sabatolimab as a Treatment for Patients With A...,"A Phase Ib/II, Open Label Study of Sabatolimab...",RECRUITING,2024-07-04,CHILD ADULT OLDER_ADULT,Acute Myeloid Leukemia,TREATMENT,"Sabatolimab, Azacitidine",Sabatolimab is a solution in vial for IV infus...,INTERVENTIONAL,"PHASE1, PHASE2",Incidence of dose limiting toxicities (Safety ...,2024-07-04
355694,Hacettepe University,OTHER,PRINCIPAL_INVESTIGATOR,Investigation of the Effects of Exercise and K...,Investigation of the Effects of Exercise and K...,NOT_YET_RECRUITING,2024-07-04,ADULT,"Scoliosis, Scoliosis Idiopathic, Musculoskelet...",TREATMENT,"Just Exercise, Exercise, Taping",Schroth exercises are including rotational bre...,INTERVENTIONAL,,Trunk rotation Cobb angles Serum markers,2024-07-04
131455,Mayo Clinic,OTHER,SPONSOR,Consent Forms in Cancer Research: Examining th...,Consent Forms in Cancer Research: Examining th...,ACTIVE_NOT_RECRUITING,2024-07-04,ADULT OLDER_ADULT,Cancer,OTHER,"6,000 word consent form, 4,000 word consent fo...",Mock consent form with consistent content in a...,INTERVENTIONAL,,Decision to enroll,2024-07-04
247391,Sawanpracharak hospital,OTHER,PRINCIPAL_INVESTIGATOR,Lidocaine Spray vs Viscous Lidocaine Solution ...,Lidocaine Spray vs Viscous Lidocaine Solution ...,ENROLLING_BY_INVITATION,2024-07-04,ADULT OLDER_ADULT,"Pain, Satisfaction, Patient",TREATMENT,"Lidocaine Spray, Lidocaine Viscous+Lidocane spray",Pharyngeal anesthesia using a topical lidocain...,INTERVENTIONAL,,"procedural pain, measured using the visual ana...",2024-07-04
92512,Mansoura University,OTHER,SPONSOR,Polyetherketoneketone and Metal Framework for ...,Patient Satisfaction and Oral Healthy Related ...,ACTIVE_NOT_RECRUITING,2024-07-04,ADULT OLDER_ADULT,Patient Satisfaction,TREATMENT,Obturator,Evaluation of patient satisfaction and oral he...,INTERVENTIONAL,,Patient satisfaction,2024-07-04
417962,Boehringer Ingelheim,INDUSTRY,SPONSOR,A Study in Healthy Men to Test How Well Differ...,"Safety, Tolerability, Pharmacokinetics, and Ph...",ACTIVE_NOT_RECRUITING,2024-07-04,ADULT OLDER_ADULT,Healthy,TREATMENT,"BI 1569912, Placebo","BI 1569912, Placebo",INTERVENTIONAL,PHASE1,Percentage of subjects with drug-related adver...,2024-07-04
286826,Assistance Publique - Hôpitaux de Paris,OTHER,SPONSOR,Assessment of ADCY5-related Movement Disorders...,Assessment of ADCY5-related Movement Disorders...,RECRUITING,2024-07-04,CHILD ADULT OLDER_ADULT,ADCY5-related Dyskinesia,OTHER,caffeinated coffee - decaffeinated coffee,Drink caffeinated coffee one morning and drink...,INTERVENTIONAL,,Quantification movement disorders,2024-07-04
413926,vandfys,OTHER,PRINCIPAL_INVESTIGATOR,The Effect of Aquatic Physiotherapy on Veteran...,The Effect of Aquatic Physiotherapy on Veteran...,ACTIVE_NOT_RECRUITING,2024-07-04,ADULT OLDER_ADULT,Post Traumatic Stress Disorder,TREATMENT,physical therapy,Aquatic physiotherapy in warm water,INTERVENTIONAL,,Level of symptoms of PTSD,2024-07-04
458438,"Central Hospital, Nancy, France",OTHER,PRINCIPAL_INVESTIGATOR,Efficacy and Safety of JAK Inhibitors in Syste...,Efficacy and Safety of JAK Inhibitors in Patie...,RECRUITING,2024-07-04,ADULT OLDER_ADULT,"Systemic Sclerosis, Interstitial Lung Disease",Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,relative change in FVC after 12 months of JAK ...,2024-07-04
107011,Hacettepe University,OTHER,PRINCIPAL_INVESTIGATOR,Investigation of the Effect of Motor Imagery T...,Investigation of the Effect of Motor Imagery T...,NOT_YET_RECRUITING,2024-07-04,ADULT OLDER_ADULT,"Image, Body, Neck Pain, Disc Herniation",TREATMENT,"Motor Imagery Exercises, Motor Control Exercises",Lateralization Training: Individuals with cerv...,INTERVENTIONAL,,Evaluation of Normal Joint Range of Motion Eva...,2024-07-04
