<a href="https://colab.research.google.com/github/Noob1701/Clin_Trials_Clustering/blob/main/Data_Collect2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import warnings
import pandas as pd
import requests
import csv


In [10]:
# Initial URL for the first API call
base_url = "https://clinicaltrials.gov/api/v2/studies"

# Initialize an empty list to store the data
data_list = []
params = {
}

In [11]:
def write_to_csv(data_list, output_file):
    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        headers = list(data_list[0].keys())
        writer = csv.DictWriter(f, fieldnames=headers)
        # Write headers only if file is empty
        if f.tell() == 0:
            writer.writeheader()

        for data in data_list:
            writer.writerow(data)

In [12]:
# Loop until there is no nextPageToken
while True:
    # Print the current URL (for debugging purposes)
    # print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

    # Send a GET request to the API
    response = requests.get(base_url, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()  # Parse JSON response
        studies = data.get('studies', [])  # Extract the list of studies

        # Loop through each study and extract specific information
        for study in studies:
            # Safely access nested keys
                responsibleParty = study['protocolSection']['sponsorCollaboratorsModule'].get('responsibleParty', {}).get('type', 'Unknown')
                orgFullName = study['protocolSection']['identificationModule'].get('organization', {}).get('fullName', 'Unknown')
                orgClass = study['protocolSection']['identificationModule'].get('organization',{}).get('class', 'Unknown')
                overallStatus = study['protocolSection']['statusModule'].get('overallStatus', 'Unknown')

                startDate = study['protocolSection']['statusModule'].get('startDateStruct', {}).get('date', 'Unknown')
                if 'conditionsModule' in study['protocolSection']:
                  conditions = ', '.join(study['protocolSection']['conditionsModule'].get('conditions', ['Unknown']))
                else:
                  conditions = 'No conditions listed'
                briefTitle = study['protocolSection']['identificationModule'].get('briefTitle', 'Unknown')
                fullTitle = study['protocolSection']['identificationModule'].get('officialTitle', 'Unknown')
            # Extract interventions safely
                interventions_list = study['protocolSection'].get('armsInterventionsModule', {}).get('interventions', [])
                interventions = ', '.join([intervention.get('name', 'Unknown') for intervention in interventions_list]) if interventions_list else "Unknown"
                interventionDesc_list = study['protocolSection'].get('armsInterventionsModule', {}).get('interventions',[])
                interventionDesc = ', '.join([interventionDesc.get('description', 'Unknown') for interventionDesc in interventionDesc_list]) if interventionDesc_list else "Unknown"

            # Extract dates and phases
                if 'designModule' in study['protocolSection']:
                  studyType = study['protocolSection']['designModule'].get('studyType', 'Unknown')
                else:
                  studyType = 'Unknown'
                if 'designModule' in study['protocolSection']:
                  phases = ', '.join(study['protocolSection']['designModule'].get('phases', ['Unknown']))
                else:
                  phases = 'No phases listed'
                outcome_list = study.get('protocolSection', {}).get('outcomesModule', {}).get('primaryOutcomes', [])
                if outcome_list:
                  outcomes = ' '.join([outcome.get('measure', 'Unknown') for outcome in outcome_list])
                else:
                  outcomes = "Unknown"
                standardAge_list = study['protocolSection'].get('eligibilityModule', {}).get('stdAges',[])
                standardAge = ' '.join(standardAge_list)
                if 'designModule' in study['protocolSection']:
                  if 'designInfo' in study['protocolSection']['designModule']:
                    primaryPurpose = study['protocolSection']['designModule']['designInfo'].get('primaryPurpose', 'Unknown')
                  else:
                    primaryPurpose = 'Unknown'
                else:
                    primaryPurpose = 'Unknown'
                completion_date_list = startDate = study['protocolSection']['statusModule'].get('primaryCompletionDateStruct', {}).get('date', 'Unknown')
            # Append the data to the list as a dictionary
                data_list.append({
                    "Organization Full Name": orgFullName,
                    "Organization Class": orgClass,
                    "Responsible Party": responsibleParty,
                    "Brief Title": briefTitle,
                    "Full Title": fullTitle,
                    "Overall Status": overallStatus,
                    "Start Date": startDate,
                    "Standard Age": standardAge,
                    "Conditions": conditions,
                    "Primary Purpose": primaryPurpose,
                    "Interventions": interventions,
                    "Intervention Description": interventionDesc,
                    "Study Type": studyType,
                    "Phases": phases,
                    "Outcome Measure": outcomes,
                    "Completion Date": completion_date_list
                })

        # Check for nextPageToken and update the params or break the loop
        nextPageToken = data.get('nextPageToken')
        if nextPageToken:
            params['pageToken'] = nextPageToken  # Set the pageToken for the next request
        else:
            break  # Exit the loop if no nextPageToken is present
        if len(data_list) >= 1000:
            write_to_csv(data_list, 'clin_trials.csv')
            data_list = []  # Clear the data list after writing
    else:
        print("Failed to fetch data. Status code:", response.status_code)
        break


In [4]:
clin_trials = pd.read_csv('clin_trials.csv')

In [5]:
clin_trials.head(1)

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Completion Date
0,"University Hospital, Angers",OTHER_GOV,SPONSOR,Reproducibility of Ankle Brachial Index After ...,Reproducibility of the Ankle-Brachial Index Me...,COMPLETED,2014-04,ADULT,Normal Subjects,Unknown,Unknown,Unknown,OBSERVATIONAL,Unknown,Test-retest difference in ankle to brachial pr...,2014-04


In [6]:
clin_trials_wd = clin_trials[clin_trials['Completion Date'] != 'Unknown']

In [7]:
clin_trials_wd = clin_trials_wd[clin_trials_wd['Responsible Party'] == 'SPONSOR']

In [10]:
clin_trials_wd = clin_trials_wd[clin_trials_wd['Organization Class'] == 'INDUSTRY']

In [14]:
clin_trials_wd = clin_trials_wd[(clin_trials_wd['Completion Date'] < '2024-07-05') & (clin_trials_wd['Completion Date'] > '2022-01-01')]


In [16]:
clin_trials_wd = clin_trials_wd[clin_trials_wd['Phases'] == 'PHASE3']

In [18]:
column_to_move = 'Completion Date'
cols = [column_to_move] + [col for col in clin_trials_wd.columns if col != column_to_move]
clin_trials_wd = clin_trials_wd[cols]

In [19]:
clin_trials_wd.head(20)

Unnamed: 0,Completion Date,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure
66,2022-05-20,Shanghai Institute Of Biological Products,INDUSTRY,SPONSOR,A Study Comparing SIBP-02 and Rituximab Combin...,"A Phase III, Multicenter, Randomized, Double-b...",COMPLETED,2022-05-20,ADULT OLDER_ADULT,Diffuse Large B-Cell Lymphoma,TREATMENT,"SIBP-02, Rituximab","Injection，100mg/10ml, Injection，100mg/10ml",INTERVENTIONAL,PHASE3,Overall Response Rate (ORR)
196,2023-11-15,Mikrobiomik Healthcare Company S.L.,INDUSTRY,SPONSOR,Primary or Recurrent Clostridioides Difficile ...,"A Randomised, Controlled, Open-label Phase III...",COMPLETED,2023-11-15,ADULT OLDER_ADULT,"Recurrent Clostridium Difficile Infection, Pri...",TREATMENT,"MBK-01, Dificlir",A single dose of 4 capsules of MBK-01 (heterol...,INTERVENTIONAL,PHASE3,Global Absence of diarrhea: Number of episodes...
745,2024-04-11,"Agios Pharmaceuticals, Inc.",INDUSTRY,SPONSOR,A Study Evaluating the Efficacy and Safety of ...,"A Phase 3, Double-Blind, Randomized, Placebo-C...",ACTIVE_NOT_RECRUITING,2024-04-11,ADULT OLDER_ADULT,"Transfusion-dependent Alpha-Thalassemia, Trans...",TREATMENT,"Placebo Matching Mitapivat, Mitapivat","Tablets, Tablets",INTERVENTIONAL,PHASE3,Percentage of Participants With Transfusion Re...
836,2023-06-23,"EuBiologics Co.,Ltd",INDUSTRY,SPONSOR,A Phase III Study of COVID-19 Vaccine EuCorVac...,"A Phase III, Randomized, Observer-blind, Activ...",COMPLETED,2023-06-23,ADULT OLDER_ADULT,COVID-19,PREVENTION,"EuCorVac-19, ChAdOx1 nCoV-19","COVID-19 vaccine, COVID-19 vaccine",INTERVENTIONAL,PHASE3,The proportion of GMT of neutralizing antibody...
998,2023-12-31,"Supernus Pharmaceuticals, Inc.",INDUSTRY,SPONSOR,Efficacy and Safety Study of MYOBLOC® in the T...,"A Phase 3 Multicenter, Randomized, Double-Blin...",NOT_YET_RECRUITING,2023-12-31,CHILD,Sialorrhea,TREATMENT,"MYOBLOC Low Dose, MYOBLOC High Dose, Placebo",Weight-based dose; 5.0 units/kg for submandibu...,INTERVENTIONAL,PHASE3,Effect of MYOBLOC on Unstimulated Saliva Flow ...
1192,2023-07-26,Bristol-Myers Squibb,INDUSTRY,SPONSOR,A Study of Neoadjuvant Chemotherapy Plus Nivol...,"A Phase 3, Randomized, Double-blind Study of N...",ACTIVE_NOT_RECRUITING,2023-07-26,ADULT OLDER_ADULT,"Carcinoma, Non-Small-Cell Lung",TREATMENT,"Nivolumab, Carboplatin, Cisplatin, Paclitaxel,...","Specified dose on specified days, Specified do...",INTERVENTIONAL,PHASE3,Event-Free Survival (EFS) as Assessed by Blind...
1460,2023-07,Galderma R&D,INDUSTRY,SPONSOR,Efficacy and Safety of Nemolizumab in Subjects...,"A Randomized, Double-Blind, Placebo-Controlled...",WITHDRAWN,2023-07,ADULT OLDER_ADULT,Moderate-to-severe Atopic Dermatitis,TREATMENT,"Nemolizumab, CD14152 placebo",Participants will receive loading dose of 60 m...,INTERVENTIONAL,PHASE3,Proportion of Participants with Eczema Area an...
1522,2023-05-30,Medy-Tox,INDUSTRY,SPONSOR,Long-term Extension Study to Evaluate MBA-P01 ...,"Open-label, Single Group, Multi-center, Repeat...",COMPLETED,2023-05-30,ADULT OLDER_ADULT,Glabellar Frown Lines,TREATMENT,MBA-P01,MBA-P01 will be injected into the Glabellar line.,INTERVENTIONAL,PHASE3,Incidence rate of treatment-emergent adverse e...
1650,2024-06-30,RDC Clinical Pty Ltd,INDUSTRY,SPONSOR,Effect of Testofen on Erectile Function in an ...,Effect of Testofen on Erectile Function in an ...,RECRUITING,2024-06-30,ADULT OLDER_ADULT,Erectile Dysfunction,TREATMENT,"Testofen 300mg, Testofen 600mg, Placebo compar...",Testofen in capsule form - To be taken as a 30...,INTERVENTIONAL,PHASE3,Change in International Index of Erectile Func...
2037,2022-06-10,Ipsen,INDUSTRY,SPONSOR,Study to Assess the Efficacy and Safety of Lan...,"A Phase 3, Single-arm, Open-label, Multicentre...",COMPLETED,2022-06-10,ADULT OLDER_ADULT,Gastroenteropancreatic Neuroendocrine Tumor,TREATMENT,Lanreotide autogel,Administered as deep subcutaneous (SC) injections,INTERVENTIONAL,PHASE3,Clinical Benefit Rate (CBR) of tumour response...


In [22]:
(43.83 - 30.63) * (5000/30.63)

2154.750244857982

dfddsdfddfddfdddff