In [54]:
import pandas as pd
import requests
import urllib
import re
from tqdm import trange
import time

## Importation des données

### Dans un premier temps, on importe la liste de toutes les variables à récupérer

In [9]:
# Address of the classic ClinicalTrials.gov API page that lists the study fields.
url = 'https://classic.clinicaltrials.gov/api/info/study_fields_list'

In [12]:
# Download the field listing from the endpoint stored in `url` (defined in the previous cell).
resp = requests.get(url, timeout=30)
resp.raise_for_status()  # fail loudly instead of parsing an error page into an empty list
texte = resp.text

# Every field is declared as <Field Name="..."/>; the capture group returns the
# name itself, so no hand-counted slice offsets are needed.
list_fields = re.findall(r'<Field Name="(.*?)"/>', texte)

In [14]:
# Preview the first ten field names.
list_fields[:10]

['Acronym',
 'AgreementOtherDetails',
 'AgreementPISponsorEmployee',
 'AgreementRestrictionType',
 'AgreementRestrictiveAgreement',
 'ArmGroupDescription',
 'ArmGroupInterventionName',
 'ArmGroupLabel',
 'ArmGroupType',
 'AvailIPDComment']

### Maintenant, on importe les données depuis l'API de Clinical Trials selon la liste de variables souhaitées

In [47]:
def treat_list(x):
    """
    Collapse a list-valued cell to a single value.

    Parameters
    ----------
    x : object
        Any cell value.

    Returns
    -------
    object
        The first element when ``x`` is a non-empty list, ``None`` when ``x``
        is an empty list, and ``x`` itself for every non-list value.
    """
    if isinstance(x, list):
        # Only the first element is kept; an empty list explicitly maps to None.
        return x[0] if x else None
    return x

In [78]:
def get_data(taille=10,list_fields=['NCTId','StartDate','LastUpdatePostDate'],keyword=''):
    """
    Download clinical-trial records from the classic ClinicalTrials.gov
    ``study_fields`` API and return them as one DataFrame.

    Studies are requested page by page, 1000 ranks per page. The fields are
    requested 20 at a time, so ``list_fields`` is split into chunks of 20 and
    the frames obtained for one page are joined side by side before the pages
    are stacked vertically.

    Parameters
    ----------
    taille : int
        Number of pages to fetch, i.e. the sample size in thousands of studies.
    list_fields : list of str
        Names of the fields to retrieve. The list is only read, never mutated.
    keyword : str
        Search expression, inserted verbatim into the ``expr`` query parameter.
        NOTE(review): it is not URL-encoded here, so callers must pass a value
        that is already safe to embed in a query string.

    Returns
    -------
    pandas.DataFrame
        One row per returned study with a fresh 0..n-1 index. Columns are the
        keys of the returned records except ``Rank``; every cell has been
        passed through ``treat_list``. The frame is empty when nothing was
        fetched.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    page_size = 1000   # ranks requested per page
    chunk_size = 20    # number of fields sent in a single request
    base_url = 'https://classic.clinicaltrials.gov/api/query/study_fields'

    # Stepping through the list (instead of len // 20 + 1 iterations) avoids a
    # trailing empty chunk -- and a wasted request -- when the number of fields
    # is an exact multiple of 20.
    field_chunks = [list_fields[start:start + chunk_size]
                    for start in range(0, len(list_fields), chunk_size)]

    http_time = 0
    pandas_time = 0
    page_frames = []  # one frame per page, concatenated once at the end

    for j in trange(taille):
        chunk_frames = []
        exhausted = False
        for chunk in field_chunks:
            # Comma-separated (percent-encoded) field names plus the rank window of this page.
            url_temp = (f'{base_url}?expr={keyword}&fields=' + '%2C'.join(chunk)
                        + f'&min_rnk={j*page_size+1}&max_rnk={(j+1)*page_size}&fmt=json')

            st = time.perf_counter()
            req_temp = requests.get(url_temp, timeout=60)
            req_temp.raise_for_status()
            data_temp = req_temp.json()['StudyFieldsResponse'].get('StudyFields') or []
            http_time += time.perf_counter() - st

            if not data_temp:
                # No study in this rank window. Presumably we are past the last
                # match (TODO confirm against the API), so stop paging instead
                # of failing on the missing 'Rank' column.
                exhausted = True
                break

            st = time.perf_counter()
            chunk_frames.append(pd.DataFrame(data_temp).drop('Rank', axis=1))
            pandas_time += time.perf_counter() - st

        if exhausted:
            break

        st = time.perf_counter()
        if chunk_frames:
            # Same studies, different fields: join the chunks HORIZONTALLY.
            page_frames.append(pd.concat(chunk_frames, axis=1))
        pandas_time += time.perf_counter() - st

    st = time.perf_counter()
    if page_frames:
        # Different studies, same fields: stack the pages VERTICALLY and renumber the rows.
        full_df = pd.concat(page_frames, axis=0, ignore_index=True)
    else:
        full_df = pd.DataFrame()
    pandas_time += time.perf_counter() - st

    # DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map;
    # fall back to applymap on versions that do not provide map yet.
    cell_map = full_df.map if hasattr(pd.DataFrame, 'map') else full_df.applymap
    full_df = cell_map(treat_list)

    print("Temps requete API:" , '-'*20, http_time, '\n\n', 'Temps Pandas:', '-'*20, pandas_time)
    return full_df

In [79]:
# Fetch 10 pages of 1000 ranks each for the last 40 entries of list_fields;
# the returned DataFrame is displayed as this cell's output.
get_data(taille=10,list_fields=list_fields[-40:])

100%|██████████| 10/10 [00:54<00:00,  5.41s/it]
  full_df=full_df.applymap(treat_list)


Temps requete API: -------------------- 53.88085722923279 

 Temps Pandas: -------------------- 0.23021984100341797


Unnamed: 0,RetractionSource,SamplingMethod,SecondaryId,SecondaryIdDomain,SecondaryIdLink,SecondaryIdType,SecondaryOutcomeDescription,SecondaryOutcomeMeasure,SecondaryOutcomeTimeFrame,SeeAlsoLinkLabel,...,SubmissionMCPReleaseN,SubmissionReleaseDate,SubmissionResetDate,SubmissionUnreleaseDate,TargetDuration,UnpostedEventDate,UnpostedEventType,UnpostedResponsibleParty,VersionHolder,WhyStopped
0,,,,,,,Participants respond on a 4-point likert scale...,Change in body appreciation,Baseline; post intervention (immediate post); ...,,...,,,,,,,,,"November 24, 2023",
1,,,,,,,,,,,...,,,,,,,,,"November 24, 2023",
2,,,,,,,Evaluate an individual's level of material har...,Number of participants living with material ha...,0-12 months after intervention,,...,,,,,,,,,"November 24, 2023",
3,,Non-Probability Sample,2023-67017-39059,United States Department of Agriculture,,Other Grant/Funding Number,,,,,...,,,,,,,,,"November 24, 2023",
4,,Non-Probability Sample,,,,,the child will be subjected to Child behavior ...,Detect comorbid psychiatric disorder among ADH...,Baseline,,...,,,,,2 Years,,,,"November 24, 2023",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,Non-Probability Sample,,,,,"Days hospitalized, days in Intensive care unit...",Assessment of Participants HRU: Inpatient setting,Within the 30-day period following nirmatrelvi...,To obtain contact information for a study cent...,...,,,,,,,,,"November 24, 2023",
9996,,Non-Probability Sample,,,,,Measured using the Rivermead Post Concussion S...,Neurocognitive and psychological symptoms,3 days after every other OMT session for up to...,,...,,,,,,,,,"November 24, 2023",
9997,,,,,,,Metabolic measurements will be conducted using...,Metabolic Processes,Changes will be measured throughout one day be...,,...,,,,,,,,,"November 24, 2023",
9998,,Non-Probability Sample,,,,,"Assessment of a) function, b) pain, and c) pat...",Change in The Routine Assessment of Patient In...,"Baseline and at week 4, 6, 8, 10 and 12",,...,,,,,,,,,"November 24, 2023",
