In [1]:
# Mount Google Drive so the notebook can read the project data stored there
# (necessary default setup when running on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/Projects/case_study_mT/') # Add data folder to path; adapt this for your own use
import json
import pandas as pd
import requests
from collections import Counter

sys.path.append('/content/drive/MyDrive/Projects/case_study_mT/src/') # Add the src folder so the local analyse_utils module can be imported
# Development aid: uncomment the three lines below to reload analyse_utils
# after editing it, without having to restart the kernel.
#import importlib
#import analyse_utils
#importlib.reload(analyse_utils)
from analyse_utils import get_enrollment, compute_average_enrollment

In [3]:
# Read in payloads data (JSON Lines file: one JSON object per line)
records = []
file_path = "/content/drive/MyDrive/Projects/case_study_mT/payloads.jsonl"
# Decode explicitly as UTF-8 rather than relying on the platform default
# encoding; the payloads contain non-ASCII characters (e.g. "≥").
with open(file_path, "r", encoding="utf-8") as f:
  for line in f:
    line = line.strip()
    # Skip blank lines (e.g. an empty line at the end of the file),
    # which json.loads would otherwise reject with a JSONDecodeError.
    if not line:
      continue
    records.append(json.loads(line))

In [4]:
# Convert the list of parsed records to a DataFrame for easier exploration
df_rec = pd.DataFrame(records)
# Display the first rows as an example (head() shows five rows by default)
df_rec.head()

Unnamed: 0,utn,phase,title,gender,language,studies_id,study_type,maximum_age,minimum_age,interventions,...,detailed_description,countries_and_sites,publications,eligibility,additional_info,references,location,ctgov_location,duplicate_info,mesh_terms
0,NCT05680818,[Phase 3],"CALIBRATE: A Phase 3, Randomized, Open-Label S...",All,,,,,16 Years,"[{'type': 'DRUG', 'name': 'Encaleret', 'descri...",...,ADH1 is a rare genetic form of hypoparathyroid...,"[{'country_start': None, 'country_end': None, ...",[],{'inclusion': 'Key Inclusion Criteria: 1. Par...,{'sponsors': [{'name': 'Calcilytix Therapeutic...,[{'url': 'https://clinicaltrials.gov/study/NCT...,[{'facility': 'UCSF Benioff Children's Hospita...,[{'facility': 'UCSF Benioff Children's Hospita...,"{'nctId': 'NCT05680818', 'orgStudyIdInfo': {'i...","[{'id': 'D000001176', 'term': 'Arthrogryposis'..."
1,NCT06061146,[Phase 2],Tislelizumab Combined With Concurrent Chemorad...,All,,,,90 Years,70 Years,"[{'type': 'DRUG', 'name': 'Arm A', 'descriptio...",...,,"[{'country_start': None, 'country_end': None, ...",[],{'inclusion': 'Inclusion Criteria: 1. Volunte...,{'sponsors': [{'name': 'Tianjin Medical Univer...,[{'url': 'https://clinicaltrials.gov/study/NCT...,"[{'facility': 'Tianjin Cancer Hospital', 'stat...","[{'facility': 'Tianjin Cancer Hospital', 'stat...","{'nctId': 'NCT06061146', 'orgStudyIdInfo': {'i...","[{'id': 'D000004938', 'term': 'Esophageal Neop..."
2,NCT02935257,[Phase 1],Immunotherapy for High Risk/Relapsed CD19+ Acu...,All,,,,,16 Years,"[{'type': 'BIOLOGICAL', 'name': 'CD19CAT-41BBZ...",...,"This is a multi-centre, non-randomised, open l...","[{'country_start': None, 'country_end': None, ...",[],{'inclusion': 'Inclusion Criteria: 1. Age ≥16...,"{'sponsors': [{'name': 'University College, Lo...",[{'url': 'https://clinicaltrials.gov/study/NCT...,[{'facility': 'University College London Hospi...,[{'facility': 'University College London Hospi...,"{'nctId': 'NCT02935257', 'orgStudyIdInfo': {'i...","[{'id': 'D000008223', 'term': 'Lymphoma'}, {'i..."
3,NCT04137653,[Phase 3],Treatment of Triple-negative Breast Cancer Wit...,Female,,,,70 Years,18 Years,"[{'type': 'DRUG', 'name': 'nab-Paclitaxel+carb...",...,Breast cancer has been one of the most common ...,"[{'country_start': None, 'country_end': None, ...",[],{'inclusion': 'Inclusion Criteria:  - breast ...,"{'sponsors': [{'name': 'Shengjing Hospital', '...",[{'url': 'https://clinicaltrials.gov/study/NCT...,[{'facility': 'Shengjing Hospital of China Med...,[{'facility': 'Shengjing Hospital of China Med...,"{'nctId': 'NCT04137653', 'orgStudyIdInfo': {'i...","[{'id': 'D000001943', 'term': 'Breast Neoplasm..."
4,NCT05809414,[Phase 3],Amantadine and Transcranial Magnetic Stimulati...,All,,,,,18 Years,"[{'type': 'DEVICE', 'name': 'Transcranial Magn...",...,Multiple Sclerosis (MS) is the most frequent c...,"[{'country_start': None, 'country_end': None, ...",[],{'inclusion': 'Inclusion Criteria: 1. Expande...,"{'sponsors': [{'name': 'Hospital San Carlos, M...",[{'url': 'https://clinicaltrials.gov/study/NCT...,"[{'facility': 'Hospital Puerta del Mar', 'stat...","[{'facility': 'Hospital Puerta del Mar', 'stat...","{'nctId': 'NCT05809414', 'orgStudyIdInfo': {'i...","[{'id': 'D000009103', 'term': 'Multiple Sclero..."


### Question 1: How many Phase 1, Phase 2, and Phase 3 trials are there?

In [5]:
# Tally how many trials carry each phase label.
# The labels are stored as lists (e.g. ['Phase 1', 'Phase 2']), so they are
# converted to their string form first and counted as plain strings.
phase_labels = df_rec['phase'].astype(str)
counts = (
    phase_labels
    .value_counts()             # number of trials per distinct label, most frequent first
    .rename_axis('Phase')       # name the label column
    .reset_index(name='Count')  # turn the tally into a two-column DataFrame
)
counts # display the resulting table

Unnamed: 0,Phase,Count
0,['Phase 2'],6299
1,['Phase 1'],3283
2,"['Phase 1', 'Phase 2']",2358
3,['Phase 3'],2267
4,['Not Applicable'],1330
5,['Early Phase 1'],509
6,"['Phase 2', 'Phase 3']",472
7,['Phase 4'],403
8,,214


From the output table, we can see that there are **3283 Phase 1 trials**, **6299 Phase 2 trials**, and **2267 Phase 3 trials**, counting only the trials that are labelled with exactly one of these phases.

However, the data also reveal some nuances worth highlighting. Several records are assigned to **multiple phases simultaneously** (e.g. _["Phase 1", "Phase 2"]_ or _["Phase 2", "Phase 3"]_), which makes it unclear whether they should be counted fully in both categories or treated as hybrid phases. In addition, there are somewhat **inconsistent labels** such as _Early Phase 1_ or _Not Applicable_, which do not fit neatly into the standard Phase 1-4 progression.

For visualisation, I have left these categories separate. This leaves open for further discussion whether to integrate the hybrid and non-standard categories into existing phases (e.g. treating _Early Phase 1_ as _Phase 1_) or to keep them distinct to better reflect the complexity of clinical trial classification.

### Question 2: What is the average number of (Estimated) Enrollments for each phase?

In [6]:
# Sanity check: list all columns to see whether any of them provides enrollment info.
# None of the column names shown below refers to enrollment.
df_rec.columns

Index(['utn', 'phase', 'title', 'gender', 'language', 'studies_id',
       'study_type', 'maximum_age', 'minimum_age', 'interventions',
       'protocol_type', 'overall_status', 'primary_purpose', 'conditions',
       'start_date', 'end_date', 'brief_summary', 'detailed_description',
       'countries_and_sites', 'publications', 'eligibility', 'additional_info',
       'references', 'location', 'ctgov_location', 'duplicate_info',
       'mesh_terms'],
      dtype='object')

As no enrollment-related information is available in the current dataset, we will do the following:
1. Use the clinicaltrials.gov API to obtain enrollment data using **NCT numbers**
2. Select the first 10 trials per phase to estimate the average (estimated and actual) enrollments

In [7]:
# Compute the average actual and estimated enrollments per phase.
# Only trials whose phase label is exactly ['Phase 1'], ['Phase 2'] or
# ['Phase 3'] are considered; multi-phase labels are not matched.
N_TRIALS_PER_PHASE = 10  # number of trials sampled per phase
phases = [['Phase '+str(i)] for i in range(1,4)]

sample_frames = []  # per-phase trial entries, kept for a later check if needed
avg_frames = []     # per-phase average enrollment tables
for phase in phases:
  # Select the first N trials of this phase. The explicit copy makes the
  # sample independent of df_rec, so adding a column below neither touches
  # df_rec nor triggers pandas' SettingWithCopyWarning.
  df_sample_phase = (
      df_rec[df_rec['phase'].apply(lambda x: x == phase)]
      .head(N_TRIALS_PER_PHASE)
      .copy()
  )
  # Add one column (enrollment_info) per row/entry, looked up from the trial id in 'utn'
  df_sample_phase['enrollment_info'] = df_sample_phase['utn'].apply(get_enrollment)
  # Compute the averages for this phase
  df_avg_phase = compute_average_enrollment(df_sample_phase)
  sample_frames.append(df_sample_phase)
  avg_frames.append(df_avg_phase)

# Concatenate once after the loop instead of growing the frames inside it
df_sample = pd.concat(sample_frames)
df_avg = pd.concat(avg_frames)

df_avg.columns = ['Phase', 'Enrollment Type', 'Average Enrollment', 'Number of Trials Used']
# reset_index() keeps each trial's original row label from df_rec in an 'index' column
df_sample = df_sample.reset_index()
# Display df_sample here to check which trials were used.

In [8]:
# Visualise the result: average enrollment per phase and enrollment type
df_avg

Unnamed: 0,Phase,Enrollment Type,Average Enrollment,Number of Trials Used
0,Phase 1,ACTUAL,31.5,6
1,Phase 1,ESTIMATED,45.0,4
0,Phase 2,ACTUAL,78.666667,3
1,Phase 2,ESTIMATED,134.428571,7
0,Phase 3,ACTUAL,274.666667,3
1,Phase 3,ESTIMATED,758.142857,7


Summary:

For Phase 1, the average enrollment is around 32 (actual) and 45 (estimated) participants, based on a small sample of 6 and 4 trials, respectively.

For Phase 2, the averages increase to about 79 (actual) and 134 (estimated) participants, based on 3 and 7 trials.

For Phase 3, enrollments are the largest, averaging approximately 275 (actual) and 758 (estimated) participants, based on 3 and 7 trials.

### Question 3: What are the top 10 most commonly studied conditions?
As the answers should be based only on the [MeSH](https://www.ncbi.nlm.nih.gov/mesh/) (Medical Subject Headings) data, we extract the **MeSH terms** of each trial for counting.

In [9]:
# Count how often each MeSH term occurs across all trials.
# Entries of 'mesh_terms' that are not lists are skipped, as are terms
# whose 'term' value is missing or empty.
condition_counter = Counter(
    name
    for mesh_list in df_rec['mesh_terms']
    if isinstance(mesh_list, list)
    for name in (entry.get('term') for entry in mesh_list)
    if name
)

# Report the 10 most frequent conditions, one per line
top_conditions = condition_counter.most_common(10)
for condition, count in top_conditions:
  print(f"{condition}: {count}")

Neoplasms: 1484
Carcinoma: 1408
Lymphoma: 944
Leukemia: 794
Breast Neoplasms: 793
Carcinoma, Non-Small-Cell Lung: 751
Lung Neoplasms: 713
Recurrence: 483
Syndrome: 477
Adenocarcinoma: 413


In [10]:
# Save the dataset (only trials labelled exactly Phase 1, Phase 2, or Phase 3) for further use
phases = [['Phase '+str(i)] for i in range(1,4)]

# Filter each phase separately and concatenate once after the loop, rather
# than growing a DataFrame (seeded with an empty frame) inside the loop.
# Rows stay grouped by phase, in the order given by `phases`.
phase_frames = []
for phase in phases:
  df_single_phase = df_rec[df_rec['phase'].apply(lambda x: x==phase)]
  phase_frames.append(df_single_phase)
df_rec_phases = pd.concat(phase_frames)

df_rec_phases.to_pickle("/content/drive/MyDrive/Projects/case_study_mT/data/df_rec_phases.pkl")