<a href="https://colab.research.google.com/github/Noob1701/Clin_Trials_Clustering/blob/main/Data_Collect2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import warnings
import pandas as pd
import requests
import csv


In [6]:
# Endpoint queried by every request in the paging loop (clinicaltrials.gov, API v2).
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Query-string parameters sent with each request; starts out empty.
params = {}

# In-memory buffer of row dictionaries collected from the API responses.
data_list = []

In [7]:
def write_to_csv(data_list, output_file):
    """Append a batch of row dictionaries to a CSV file.

    The column order is taken from the keys of the first row. A header line is
    written only when the target file is empty, so successive calls with the
    same schema build up one well-formed CSV.

    Args:
        data_list: List of dictionaries, one per CSV row. All rows are expected
            to share the keys of the first row.
        output_file: Path of the CSV file to append to; created if missing.

    Returns:
        None. An empty ``data_list`` is a no-op and does not create the file.
    """
    # Bail out before touching the filesystem: there is no first row to take
    # headers from, and opening in append mode would create an empty file.
    if not data_list:
        return

    headers = list(data_list[0].keys())
    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        # Write headers only if file is empty (append mode starts at end-of-file,
        # so position 0 means nothing has been written yet).
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(data_list)

In [8]:
# Page through the API until a response carries no nextPageToken.
# Rows are buffered in data_list and flushed to OUTPUT_FILE in batches.
# Note: write_to_csv opens the file in append mode, so running this cell again
# against an existing file adds rows to it rather than replacing it.
OUTPUT_FILE = 'clin_trials.csv'
FLUSH_THRESHOLD = 1000   # flush the buffer once it holds at least this many rows
REQUEST_TIMEOUT = 30     # seconds; without a timeout requests.get can block forever


def _study_to_row(study):
    """Flatten one study record from the API response into a CSV row dict.

    Every module is looked up with ``.get`` so a study that lacks a section
    produces placeholder values instead of raising ``KeyError``.

    Args:
        study: One element of the ``studies`` list in the JSON response.

    Returns:
        dict mapping the CSV column names to string values for this study.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    organization = identification.get('organization', {})
    status = protocol.get('statusModule', {})
    sponsor = protocol.get('sponsorCollaboratorsModule', {})
    design = protocol.get('designModule', {})
    intervention_items = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    primary_outcomes = protocol.get('outcomesModule', {}).get('primaryOutcomes', [])
    std_ages = protocol.get('eligibilityModule', {}).get('stdAges', [])

    # A missing module and a module without the list get different placeholders.
    if 'conditionsModule' in protocol:
        conditions = ', '.join(protocol['conditionsModule'].get('conditions', ['Unknown']))
    else:
        conditions = 'No conditions listed'

    if 'designModule' in protocol:
        phases = ', '.join(design.get('phases', ['Unknown']))
    else:
        phases = 'No phases listed'

    # Names and descriptions come from the same interventions list.
    if intervention_items:
        interventions = ', '.join(item.get('name', 'Unknown') for item in intervention_items)
        intervention_desc = ', '.join(item.get('description', 'Unknown') for item in intervention_items)
    else:
        interventions = "Unknown"
        intervention_desc = "Unknown"

    if primary_outcomes:
        outcomes = ' '.join(outcome.get('measure', 'Unknown') for outcome in primary_outcomes)
    else:
        outcomes = "Unknown"

    # Key order defines the CSV column order (write_to_csv uses the first row's keys).
    return {
        "Organization Full Name": organization.get('fullName', 'Unknown'),
        "Organization Class": organization.get('class', 'Unknown'),
        "Responsible Party": sponsor.get('responsibleParty', {}).get('type', 'Unknown'),
        "Brief Title": identification.get('briefTitle', 'Unknown'),
        "Full Title": identification.get('officialTitle', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown'),
        "Standard Age": ' '.join(std_ages),
        "Conditions": conditions,
        "Primary Purpose": design.get('designInfo', {}).get('primaryPurpose', 'Unknown'),
        "Interventions": interventions,
        "Intervention Description": intervention_desc,
        "Study Type": design.get('studyType', 'Unknown'),
        "Phases": phases,
        "Outcome Measure": outcomes,
        "Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown'),
    }


try:
    while True:
        # Send a GET request to the API
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)

        # Stop on any non-success status; buffered rows are still flushed below.
        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break

        data = response.json()  # Parse JSON response
        studies = data.get('studies', [])  # Extract the list of studies

        # Build the whole page first, then add it to the buffer in one step, so
        # an interruption never leaves a half-processed page in data_list.
        data_list.extend([_study_to_row(study) for study in studies])

        # Remember where the next request should continue from.
        nextPageToken = data.get('nextPageToken')
        if nextPageToken:
            params['pageToken'] = nextPageToken

        if len(data_list) >= FLUSH_THRESHOLD:
            write_to_csv(data_list, OUTPUT_FILE)
            data_list = []  # Clear the data list after writing

        if not nextPageToken:
            break  # Last page reached
finally:
    # Flush whatever is still buffered: the final partial batch after the last
    # page, or the rows collected before a failure / interrupt.
    if data_list:
        write_to_csv(data_list, OUTPUT_FILE)
        data_list = []


In [9]:
# Load the CSV produced by the fetch loop into a DataFrame.
clin_trials = pd.read_csv(filepath_or_buffer='clin_trials.csv')

In [10]:
# Preview the first row to check the column layout.
clin_trials.head(n=1)

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Completion Date
0,"University Hospital, Angers",OTHER_GOV,SPONSOR,Reproducibility of Ankle Brachial Index After ...,Reproducibility of the Ankle-Brachial Index Me...,COMPLETED,2012-11,ADULT,Normal Subjects,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Test-retest difference in ankle to brachial pr...,2014-04


In [11]:
# Keep only studies with a reported completion date
# ('Unknown' is the placeholder written by the fetch loop when the date is absent).
clin_trials_wd = clin_trials.loc[clin_trials['Completion Date'].ne('Unknown')]

In [12]:
# Keep only studies whose responsible party type is the sponsor.
clin_trials_wd = clin_trials_wd.loc[clin_trials_wd['Responsible Party'].eq('SPONSOR')]

In [13]:
# Keep only studies run by organizations classified as INDUSTRY.
clin_trials_wd = clin_trials_wd.loc[clin_trials_wd['Organization Class'].eq('INDUSTRY')]

In [14]:
# Keep completion dates strictly between the two bounds. The bounds are strings,
# so the comparison is lexicographic; a month-only value such as '2024-07'
# sorts before every full date in that month.
clin_trials_wd = clin_trials_wd.loc[
    clin_trials_wd['Completion Date'].lt('2024-07-05')
    & clin_trials_wd['Completion Date'].gt('2022-01-01')
]


In [15]:
# Keep only rows whose Phases value is exactly 'PHASE3'; rows where several
# phases were joined into one string (e.g. 'PHASE2, PHASE3') do not match.
clin_trials_wd = clin_trials_wd.loc[clin_trials_wd['Phases'].eq('PHASE3')]

In [16]:
# Reorder the columns so 'Completion Date' comes first; the remaining columns
# keep their existing relative order.
column_to_move = 'Completion Date'
cols = [column_to_move]
cols.extend(name for name in clin_trials_wd.columns if name != column_to_move)
clin_trials_wd = clin_trials_wd[cols]

In [23]:
# Order the rows by completion date, latest first.
clin_trials_wd = clin_trials_wd.sort_values('Completion Date', ascending=False)

In [24]:
# Show the first 20 rows of the filtered, sorted table.
clin_trials_wd.head(n=20)

Unnamed: 0,Completion Date,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure
186505,2024-07-02,AstraZeneca,INDUSTRY,SPONSOR,Study to Assess Effect of HFO MDI Propellant o...,"A Randomized, Double-blind, Two-way Crossover ...",RECRUITING,2023-06-14,ADULT,Mucociliary Clearance,BASIC_SCIENCE,"HFO MDI, HFA MDI",* Dose formulation: MDI\n* Unit dose strength(...,INTERVENTIONAL,PHASE3,Change from baseline in MCC through 60 minutes...
335675,2024-07-02,AstraZeneca,INDUSTRY,SPONSOR,D9319C00001- 1L OC Mono Global RCT,"A Randomised, Double-blind, Placebo-controlled...",RECRUITING,2021-05-31,ADULT OLDER_ADULT,Ovarian Cancer,TREATMENT,"Olaparib, Matching placebo","Olaparib tablets 300 mg oral twice daily, Matc...",INTERVENTIONAL,PHASE3,Superiority of olaparib as maintenance treatme...
445091,2024-07-01,FutureChem,INDUSTRY,SPONSOR,Evaluate the Clinical Usefulness of [F-18]Flor...,"A Multi Center, Non-randomized, Open, Phase 3 ...",RECRUITING,2021-04-28,ADULT OLDER_ADULT,High Risk Prostate Carcinoma,DIAGNOSTIC,[F-18]Florastamin,A single dose of \[F-18\]Florastamin at 10 ± 1...,INTERVENTIONAL,PHASE3,Diagnostic sensitivity and specificity analyze...
493449,2024-07-01,"Alume Biosciences, Inc.",INDUSTRY,SPONSOR,ALM-488 for Intra-Operative Visualization of N...,ALM-488 for Intra-Operative Visualization of N...,RECRUITING,2022-04-29,CHILD ADULT OLDER_ADULT,Surgery,TREATMENT,"ALM-488, Intraoperative Visualization using Wh...",ALM-488 will be infused during the pre-operati...,INTERVENTIONAL,PHASE3,Contrast Enhancement Length Measurement
34064,2024-07-01,GIE Medical,INDUSTRY,SPONSOR,Paclitaxel Coated Balloon for the Treatment of...,Paclitaxel Coated Balloon for the Treatment of...,RECRUITING,2023-12-01,ADULT OLDER_ADULT,Esophageal Stricture,TREATMENT,"GIE Medical ProTractX3 TTS DCB, Control","Paclitaxel Coated Balloon, Standard Endoscopic...",INTERVENTIONAL,PHASE3,Treatment Success Primary Safety Outcome
46044,2024-07-01,Lupin Ltd.,INDUSTRY,SPONSOR,Study to Investigate the Efficacy and Safety o...,"A Randomized, Double-blind, Placebo-controlled...",WITHDRAWN,2021-09-03,ADULT OLDER_ADULT,Myotonic Dystrophy Type 1 and Type 2,TREATMENT,"Mexiletine 167 mg, Placebo",Mexiletine 167 mg (equivalent to mexiletine HC...,INTERVENTIONAL,PHASE3,Assess the efficacy and safety of mexiletine f...
231804,2024-07,"AMAG Pharmaceuticals, Inc.",INDUSTRY,SPONSOR,A Study to Evaluate Ferumoxytol for the Treatm...,"A Phase 3, Randomized, Open-Label, Multicenter...",RECRUITING,2019-09-18,CHILD,Iron Deficiency Anemia,TREATMENT,"ferumoxytol, Iron sucrose",Each 20 mL single-use vial contains 17 mL of f...,INTERVENTIONAL,PHASE3,Change in Hemoglobin from Baseline to Week 5
182054,2024-07,"Ionis Pharmaceuticals, Inc.",INDUSTRY,SPONSOR,A Study to Assess the Long-Term Safety and Eff...,"An Open-Label, Extension Study to Assess the L...",RECRUITING,2022-01-04,ADULT OLDER_ADULT,Hereditary Transthyretin-Mediated Amyloid Poly...,TREATMENT,Eplontersen,Eplontersen will be administered by SC injection.,INTERVENTIONAL,PHASE3,Change From Baseline in Platelet Count Number ...
220763,2024-07,ACELYRIN Inc.,INDUSTRY,SPONSOR,Hidradenitis Suppurativa Study of Izokibep,"A Randomized, Double-blind, Placebo-controlled...",ACTIVE_NOT_RECRUITING,2023-06-22,ADULT OLDER_ADULT,Hidradenitis Suppurativa,TREATMENT,"Placebo, Izokibep","Solution for injection, Solution for injection",INTERVENTIONAL,PHASE3,Percentage of Participants Achieving HiSCR75
453973,2024-07,Everstar Therapeutics Limited,INDUSTRY,SPONSOR,A Phase 3 Study of Etrasimod in Subjects With ...,"A Phase 3, Randomized, Placebo-Controlled, Dou...",ACTIVE_NOT_RECRUITING,2019-09-10,ADULT OLDER_ADULT,Moderately to Severely Active Ulcerative Colitis,TREATMENT,"Etrasimod, Placebo","Drug:Etrasimod Tablet other name:APD334, Drug:...",INTERVENTIONAL,PHASE3,Proportion of Subjects With Clinical Remission...


In [18]:
# NOTE(review): ad-hoc arithmetic on undocumented constants; it uses no variable
# from the cells above. Confirm what these numbers represent or remove this cell.
(43.83 - 30.63) * (5000/30.63)

2154.750244857982

dfddsdfddfddfdddff