Scraping the EU clinical trial dataset
This notebook shows how to scrape the protocols for a single disease area: ICD-10 "Diseases of the nervous system" (excluding inflammatory diseases of the CNS), which is my research focus.
The HTML links for all disease areas are available as a .csv file on GitLab, and the principle is the same: change the initial .csv to whichever disease area(s) you would like the EU Clinical Trials Register protocols for.

Packages and libraries

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import string

from statsmodels.stats.proportion import proportion_confint

Import CSV files

In [None]:
# Path to the disease-area .csv; it must contain an 'EudraCT_No' column, which the next cells read.
# TODO(review): placeholder file name -- point this at your local copy of the .csv.
# (pd.read_csv() without a path raises a TypeError, so the path has to be given explicitly.)
NS_CSV_PATH = 'nervous_system_trials.csv'
ns_df = pd.read_csv(NS_CSV_PATH)

In [None]:
# Preview the first rows to check that the file loaded as expected.
ns_df.head()

Get the EudraCT numbers from the EudraCT_No column in the .csv

In [None]:
# Keep only the rows that actually carry a EudraCT number ...
ns_df = ns_df.dropna(subset=['EudraCT_No'])

# ... and collect those numbers, in row order, as a plain Python list.
eudract_list = ns_df['EudraCT_No'].to_list()

In [None]:
# String form of the WHOLE list (brackets, quotes and commas included), not of a single number.
str_eudract = str(eudract_list)
# NOTE(review): this is the character count of that string, not the number of trials
# (that would be len(eudract_list)).
len(str_eudract)

Add each EudraCT number to the URL to identify the individual protocol

In [None]:
# Build one results-page URL per trial by inserting that trial's own EudraCT number.
# The loop variable must be used here: inserting the string form of the whole list
# would give every trial the same, invalid URL.
urlList = [
    f"https://www.clinicaltrialsregister.eu/ctr-search/trial/{str(eudract_no).strip()}/results"
    for eudract_no in eudract_list
]

In [None]:
# Print every generated URL for a visual check.
# NOTE(review): this prints the entire list; a slice such as urlList[:5] keeps the output short.
print(urlList)

Characteristics dataframe

In [None]:
# Empty results table: one column per trial characteristic that will be scraped.
characteristic_columns = [
    'EudraCT_No',
    'Title',
    'Phase',
    'Objective',
    'End_date',
    'Sample_size',
    '1ry_endpoint',
    'endpoint_description',
    'Treatment',
    'LT_followup',
]
characteristics_df = pd.DataFrame(columns=characteristic_columns)

In [None]:
# One row per trial, with only the EudraCT number filled in.
eudract_add = pd.DataFrame({'EudraCT_No': eudract_list})

# Attach the remaining (still empty) characteristic columns.
# `.iloc[0:0]` keeps just the column schema of characteristics_df, so re-running this cell
# rebuilds the same skeleton instead of appending the EudraCT rows a second time.
characteristics_df = pd.concat([eudract_add, characteristics_df.iloc[0:0]], ignore_index=True)

In [None]:
# Show the table built so far (EudraCT numbers plus the characteristic columns to be filled).
display(characteristics_df)

Add characteristics into the df

In [None]:
def _cell_text(value_cell):
    """Return the stripped text of a value cell, reading its inner <div> when it has one."""
    container = value_cell.div if value_cell.div is not None else value_cell
    return container.get_text(strip=True)


def _label_values(soup, label, *, label_class='labelColumn', sibling_only=False):
    """Return the text of the value cell paired with each label cell that mentions ``label``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed trial page.
    label : str
        Text that must occur inside the label cell.
    label_class : str or None
        CSS class the label <td> must carry; ``None`` searches every <td>.
    sibling_only : bool
        If True the value cell must be a following sibling of the label cell
        (``find_next_sibling``); otherwise the next 'valueColumn' cell anywhere
        later in the document is used (``find_next``).

    Returns
    -------
    list of str
        One entry per matching label cell that has a value cell, in document order.
    """
    class_filter = {'class_': label_class} if label_class else {}
    values = []
    for label_cell in soup.find_all('td', **class_filter):
        if label not in label_cell.text:
            continue
        if sibling_only:
            value_cell = label_cell.find_next_sibling('td', class_='valueColumn')
        else:
            value_cell = label_cell.find_next('td', class_='valueColumn')
        if value_cell is not None:
            values.append(_cell_text(value_cell))
    return values


def _extract_phase(title):
    """Return the word that follows 'phase' in ``title`` (punctuation removed), or 0 if there is none."""
    after_phase = title.lower().split('phase')
    if len(after_phase) < 2:
        return 0
    tokens = after_phase[1].split(' ')
    if len(tokens) < 2:
        # 'phase' is the last word of the title: nothing follows it.
        return 0
    return tokens[1].translate(str.maketrans('', '', string.punctuation))


def _extract_treatment(soup):
    """Return the first product name listed under an arm whose type is not 'Active comparator'.

    Returns None when no such product name is found.
    """
    in_candidate_arm = False
    for label_cell in soup.find_all('td', class_='labelColumn'):
        if 'Arm type' in label_cell.text:
            arm_type = label_cell.find_next('td', class_='valueColumn')
            if arm_type is not None:
                in_candidate_arm = 'Active comparator' not in arm_type.text
        if in_candidate_arm and 'Investigational medicinal product name' in label_cell.text:
            product = label_cell.find_next('td', class_='valueColumn')
            if product is not None:
                return product.get_text(strip=True)
    return None


def _parse_trial_page(soup):
    """Extract the trial characteristics from one parsed page.

    Returns a dict mapping column names of the characteristics table to the scraped
    values. A column is left out when nothing was found for it, except 'Objective'
    (0 unless exactly one objective is found) and 'Phase' (0 when the title names no phase).
    """
    fields = {}

    # Title and phase: the <h1> text after 'Results:'. If several headings match, the last one is used.
    result_titles = [
        heading.text.partition('Results:')[2].strip()
        for heading in soup.find_all('h1')
        if 'Results:' in heading.text
    ]
    if result_titles:
        title = result_titles[-1]
        if title:
            fields['Title'] = title
        fields['Phase'] = _extract_phase(title)

    # Trial objective: only accepted when exactly one is found, otherwise recorded as 0.
    objectives = _label_values(soup, 'Main objective of the trial')
    fields['Objective'] = objectives[0] if len(objectives) == 1 else 0

    # Fields where the first value found on the page is used.
    first_value_fields = (
        ('End_date', 'Global end of trial date', {'sibling_only': True}),
        ('Sample_size', 'Worldwide total number of subjects', {}),
        ('endpoint_description', 'End point description', {}),
        ('LT_followup', 'Long term follow-up planned', {'label_class': None}),
    )
    for column, label, options in first_value_fields:
        values = _label_values(soup, label, **options)
        if values:
            fields[column] = values[0]

    # Primary endpoint: text of the first <h3> heading that mentions 'Primary: '.
    # NOTE(review): assumes primary endpoints are announced in <h3> headings -- confirm against a live page.
    for heading in soup.find_all('h3'):
        if 'Primary: ' in heading.text:
            fields['1ry_endpoint'] = heading.get_text(strip=True)
            break

    treatment = _extract_treatment(soup)
    if treatment is not None:
        fields['Treatment'] = treatment

    return fields


# Work on a copy so the skeleton table stays untouched and this cell can be re-run safely.
# Object dtype lets each column hold the mix of strings and the 0 placeholders written below.
all_characteristics_df = characteristics_df.copy().astype(object)

for z, url in enumerate(urlList):
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        # Leave this trial's row empty rather than aborting the whole scrape.
        print(f"Skipping {url}: {err}")
        continue

    surrogate_soup = BeautifulSoup(page.content, "html.parser")

    # Write by column name so each value lands in the column it belongs to, in row z.
    for column, value in _parse_trial_page(surrogate_soup).items():
        all_characteristics_df.iat[z, all_characteristics_df.columns.get_loc(column)] = value


display(all_characteristics_df)

all_characteristics_df.to_csv('ns_protocols', index=False)

Check success of extraction

In [None]:
# Reload the scraped characteristics saved by the scraping cell (file 'ns_protocols', written without an extension).
ns_protocols = pd.read_csv('ns_protocols')
ns_protocols.sample(40, random_state=4).to_csv('success_check_euctns.csv', index=False) # 20% of the dataframe 
# Manually checked each scraped section in the dataframe against the protocols on the EU clinical trials website, added new column 'is_correct',
# if only one characteristic was scraped incorrectly, whole scrape for that particular protocol was labelled false

In [None]:
# Load the manually annotated validation sample, i.e. the 'success_check_euctns.csv' export
# with the hand-filled 'is_correct' column added.
# TODO(review): placeholder file name -- point this at wherever the annotated copy was saved.
# It is deliberately different from 'success_check_euctns.csv', because the sampling cell
# rewrites that file on every run and would erase the manual annotations.
SUCCESS_CHECK_CSV = 'success_check_euctns_annotated.csv'
success_check = pd.read_csv(SUCCESS_CHECK_CSV)

In [None]:
# Inspect the loaded validation sample.
display(success_check)

In [None]:
# Point estimate of scraping accuracy: the share of sampled protocols marked correct.
success_rate = success_check['is_correct'].mean()
# Print the variable computed above (the name `accuracy_rate` was never defined).
print(f"Success Rate: {success_rate:.2%}")

In [None]:
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.proportion import proportions_ztest

In [None]:
# Number of protocols in the validation sample and how many of them were marked correct.
n = success_check.shape[0]
successes = success_check['is_correct'].sum()

# 95% Wilson score interval for the scraping accuracy (Wilson chosen because the sample is small).
ci_low, ci_upp = proportion_confint(count=successes, nobs=n, alpha=0.05, method='wilson')
print(f"95% Confidence Interval: [{ci_low:.2%}, {ci_upp:.2%}]")