## Part 1 - Criteria generation setup sandbox

In [44]:
import openai
import getpass

# Prompt for the key interactively so it is never typed into the notebook source.
api_key = getpass.getpass("api key:")
# Also store the key on the imported `openai` module itself.
openai.api_key = api_key
# Print a fixed placeholder instead of echoing the key.
print('redacted')


api key:········
redacted


In [54]:
# openai.api_key = api_key
from openai import OpenAI

# Explicit client object; the key comes from the `api_key` variable read earlier.
client = OpenAI(api_key=api_key)

# Smoke test: one system message plus one user message sent to gpt-4o-mini.
haiku_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about recursion in programming."},
]
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=haiku_messages,
)

# Show the first choice's message object (not just its text).
print(completion.choices[0].message)

ChatCompletionMessage(content='Functions call themselves,  \nLayers of depth intertwine,  \nTruth in loops of code.', refusal=None, role='assistant', function_call=None, tool_calls=None)


In [59]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    Uses the notebook-level `client` to call the chat completions endpoint.

    Args:
        prompt: Text placed in the one user message of the request.
        model: Model name passed through to the API call.

    Returns:
        The `content` of the first choice's message.
    """
    request_kwargs = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,  # this is the degree of randomness of the model's output
    }
    reply = client.chat.completions.create(**request_kwargs)
    first_choice = reply.choices[0]
    return first_choice.message.content

In [61]:
# Free-text demo query sent as-is to the model (typo "diease" corrected so the
# model receives the intended condition name).
prompt = 'find me clinical trials for patients of age > 18 yo and with heart disease'

response = get_completion(prompt, 'gpt-4o-mini')
print(response)


To find clinical trials for patients over 18 years old with heart disease, you can use several resources:

1. **ClinicalTrials.gov**: This is a database of privately and publicly funded clinical studies conducted around the world. You can search for trials by entering keywords such as "heart disease" and applying filters for age and other criteria.

   - Go to [ClinicalTrials.gov](https://clinicaltrials.gov/)
   - Use the search bar to enter "heart disease."
   - Use the filters to select age (e.g., "18 years and older").

2. **World Health Organization (WHO) International Clinical Trials Registry Platform**: This platform provides access to clinical trial information from various countries.

   - Visit the [WHO ICTRP](https://www.who.int/clinical-trials-registry-platform)
   - Search for "heart disease" and apply relevant filters.

3. **European Union Clinical Trials Register**: If you are in Europe, this site allows you to search for protocol information on interventional clinical tr

In [None]:
# !pip install openai==0.27.6

In [48]:
!openai -V


openai 0.27.6


## Part 2 - Dataset generation sandbox - Test set: get some example clinical trials focusing on FHA

In [70]:
import requests

# https://clinicaltrials.gov/data-api/api
API_SERVER = "https://clinicaltrials.gov/api/v2"

In [87]:
# test connection

test_conn_endpoint = f"{API_SERVER}/version"

# requests has no default timeout, so set one to avoid hanging the kernel, and
# raise on a non-2xx status instead of parsing an error body as if it were data.
version_response = requests.get(test_conn_endpoint, timeout=30)
version_response.raise_for_status()
resp_v = version_response.json()
resp_v

{'apiVersion': '2.0.3', 'dataTimestamp': '2024-08-26T11:12:53'}

In [89]:
# test get by nctid NCT05410886

nctId = 'NCT05410886'
get_by_id_endpoint = f"{API_SERVER}/studies/{nctId}"

# Bounded wait plus explicit status check: an unknown NCT id or a server error
# surfaces as an HTTPError here rather than as a confusing payload downstream.
study_response = requests.get(get_by_id_endpoint, timeout=30)
study_response.raise_for_status()
resp_id = study_response.json()
resp_id

{'protocolSection': {'identificationModule': {'nctId': 'NCT05410886',
   'orgStudyIdInfo': {'id': '20HH6115'},
   'organization': {'fullName': 'Imperial College London', 'class': 'OTHER'},
   'briefTitle': 'Screening Women for Functional Hypothalamic Amenorrhea (FHA)',
   'officialTitle': 'A Questionnaire-based Study to Improve the Diagnosis of Functional Hypothalamic Amenorrhea (FHA) in Women With Secondary Amenorrhea Attending Hospital'},
  'statusModule': {'statusVerifiedDate': '2024-04',
   'overallStatus': 'COMPLETED',
   'expandedAccessInfo': {'hasExpandedAccess': False},
   'startDateStruct': {'date': '2021-02-11', 'type': 'ACTUAL'},
   'primaryCompletionDateStruct': {'date': '2024-02-02', 'type': 'ACTUAL'},
   'completionDateStruct': {'date': '2024-02-02', 'type': 'ACTUAL'},
   'studyFirstSubmitDate': '2022-06-06',
   'studyFirstSubmitQcDate': '2022-06-06',
   'studyFirstPostDateStruct': {'date': '2022-06-08', 'type': 'ACTUAL'},
   'lastUpdateSubmitDate': '2024-04-26',
   'last

In [90]:
# test get by search condition FHA
cond = 'FHA'

# curl -X GET "https://clinicaltrials.gov/api/v2/studies?query.cond=FHA" -H "accept: application/json"
get_by_cond_endpoint = f"{API_SERVER}/studies?query.cond={cond}"

# Bounded wait plus explicit status check so a failed search cannot silently
# produce a dict without the expected 'studies' key.
cond_response = requests.get(get_by_cond_endpoint, timeout=30)
cond_response.raise_for_status()
resp_cond = cond_response.json()
# resp_cond

In [92]:
len(resp_cond['studies'])

5

In [93]:
#     params = {
#         "query.cond": "FHA",
#         "pageSize": 20,
# #        "query.titles":
#     }
resp_cond

{'studies': [{'protocolSection': {'identificationModule': {'nctId': 'NCT06533865',
     'orgStudyIdInfo': {'id': '2024P000273'},
     'organization': {'fullName': 'Massachusetts General Hospital',
      'class': 'OTHER'},
     'briefTitle': 'Romosozumab as an Adjunct to Physiologic Estrogen Replacement in Functional Hypothalamic Amenorrhea',
     'officialTitle': 'Romosozumab as an Adjunct to Physiologic Estrogen Replacement in Adolescents and Young Adults With Functional Hypothalamic Amenorrhea'},
    'statusModule': {'statusVerifiedDate': '2024-08',
     'overallStatus': 'NOT_YET_RECRUITING',
     'expandedAccessInfo': {'hasExpandedAccess': False},
     'startDateStruct': {'date': '2024-10-01', 'type': 'ESTIMATED'},
     'primaryCompletionDateStruct': {'date': '2028-12-01',
      'type': 'ESTIMATED'},
     'completionDateStruct': {'date': '2029-06-01', 'type': 'ESTIMATED'},
     'studyFirstSubmitDate': '2024-07-30',
     'studyFirstSubmitQcDate': '2024-07-30',
     'studyFirstPostDat

## Part 3 - API payload schema parsing sandbox - parse those criteria into some structured format

In [None]:
# parse those criteria into some structured format: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10797270/ 

# OMOP format: https://www.ohdsi.org/data-standardization/ 

In [None]:
# https://github.com/OHDSI/

In [10]:
import numpy as np
import pandas as pd

In [105]:
df = pd.read_csv('FHA_trials_eligibility_criteria_muti_query.csv')

In [12]:
df.head()

Unnamed: 0,nctId,eligibilityCriteria,healthyVolunteers,sex,genderBased,minimumAge,maximumAge,stdAges,studyPopulation,samplingMethod
0,NCT06533865,Inclusion Criteria:\n\nFor functional hypothal...,True,FEMALE,,14 Years,25 Years,"['CHILD', 'ADULT']",,
1,NCT05410886,Inclusion Criteria:\n\n* Any woman 18-58 years...,False,FEMALE,,18 Years,58 Years,['ADULT'],All women referred by their GP with amenorrhea...,NON_PROBABILITY_SAMPLE
2,NCT02697136,Main Inclusion Criteria:\n\n* Male and female ...,False,ALL,,18 Years,,"['ADULT', 'OLDER_ADULT']",,
3,NCT00453219,Inclusion Criteria:\n\n* Inclusion criteria fo...,True,FEMALE,,18 Years,35 Years,['ADULT'],Women ages 18-35,NON_PROBABILITY_SAMPLE
4,NCT00870350,Inclusion Criteria:\n\n* healthy subject\n* 14...,True,ALL,,14 Years,15 Years,['CHILD'],,


In [27]:
df.eligibilityCriteria[0]

"Inclusion Criteria:\n\nFor functional hypothalamic amenorrhea and controls:\n\n* Female, age 14-25 years, skeletally mature with bone age ≥ 14 years (only 2% of growth left)\n* For women of reproductive age, agree to use an effective non-hormonal contraceptive method or a progestin releasing intrauterine device (no evidence of systemic skeletal effects) for the study duration\n* Negative βHCG (pregnancy test)\n* TSH, prolactin, potassium, magnesium within the normal range\n* Serum ALT ≤ 3 times upper limit of normal, LDL ≤ 190 mg/dl\n* eGFR ≥ 30ml/minute\n\nAdditional inclusion criteria for functional hypothalamic amenorrhea:\n\n* Less than 3 menses in the preceding 6 months\n* BMD Z-score \\< -1.0 at ≥ 1 skeletal site\n* Dental check-up within the past year\n\nExclusion Criteria:\n\nFor functional hypothalamic amenorrhea and controls\n\n* Disease other than FHA known to affect bone, including untreated thyroid dysfunction, Cushing's disease, renal failure, diabetes mellitus\n* Use of

In [13]:
df.dtypes

nctId                   object
eligibilityCriteria     object
healthyVolunteers       object
sex                     object
genderBased            float64
minimumAge              object
maximumAge              object
stdAges                 object
studyPopulation         object
samplingMethod          object
dtype: object

In [None]:
# seven semantic entities:
# • 4 entity classes: conditions, observations, procedure/device, and drug/substance
# • 3 concept attributes: qualifiers, measurement, and temporal constraints
#

In [14]:
# Entity classes used when annotating eligibility criteria.
entity_cate = [
    "Condition",  # a disease or a medical condition determined by a provider or reported by a patient
    "Observation",  # any clinical fact about a patient obtained in the context of examination, questioning or a procedure
    "Procedure/Device",
    "Drug/Substance"
]
# Attribute classes that qualify the entities above.
attribute_cate = [
    "Measurement",
    "Temporal constraints",  # was "Temporal$coAnstraints" (copy/paste garbling)
    "Qualifiers/Qualifiers",  # NOTE(review): label repeats itself - possibly meant "Qualifiers/Modifiers"; confirm against the guideline
    "Anatomic location"  # was "Anatomic$location" (copy/paste garbling)
]


In [25]:
# !pip install spacy

In [24]:
# !pip install stanza

In [22]:
import stanza

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
# Download and set up the clinical model in Stanza
# Both calls request the English 'mimic' package with the 'i2b2' model for the NER processor.
stanza.download('en', package='mimic', processors={'ner': 'i2b2'})
nlp = stanza.Pipeline(lang='en', package='mimic', processors={'ner': 'i2b2'})

# test clinical text
clinical_text = "The patient has a history of chronic obstructive pulmonary disease and was given albuterol."

# Run the pipeline on the sample sentence and print every entity's text and type, sentence by sentence.
doc = nlp(clinical_text)
for sentence in doc.sentences:
    for entity in sentence.ents:
        print(f"Entity: {entity.text}, Type: {entity.type}")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 386kB [00:00, 20.0MB/s]
2024-08-30 23:06:57 INFO: Downloaded file to /Users/qinxi/stanza_resources/resources.json
2024-08-30 23:06:57 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| pretrain        | mimic          |
| forward_charlm  | mimic          |
| backward_charlm | mimic          |

2024-08-30 23:06:57 INFO: File exists: /Users/qinxi/stanza_resources/en/tokenize/mimic.pt
2024-08-30 23:06:57 INFO: File exists: /Users/qinxi/stanza_resources/en/pos/mimic_charlm.pt
2024-08-30 23:06:57 INFO: File exists: /Users/qinxi/stanza_resources/en/lemma/mimic_nocharlm.pt
2024-08-30 23:06:57 INFO: File exists: /U

Entity: chronic obstructive pulmonary disease, Type: PROBLEM
Entity: albuterol, Type: TREATMENT


In [28]:
import warnings
warnings.filterwarnings("ignore")

In [41]:
# Run the NER pipeline on the first trial's raw eligibility text.
clinical_text = df.eligibilityCriteria[0]
doc = nlp(clinical_text)

# Gather (text, type) pairs for every entity, in sentence order.
entities = [
    (entity.text, entity.type)
    for sentence in doc.sentences
    for entity in sentence.ents
]

# test print detected entities
for entity_text, entity_type in entities:
    print(f"Entity: {entity_text}, Type: {entity_type}")
print("Detected Entities:", len(entities))

Entity: functional hypothalamic amenorrhea, Type: PROBLEM
Entity: an effective non-hormonal contraceptive method, Type: TREATMENT
Entity: a progestin releasing intrauterine device, Type: TREATMENT
Entity: systemic skeletal effects, Type: PROBLEM
Entity: the study duration, Type: TEST
Entity: βHCG (pregnancy test, Type: TEST
Entity: TSH, Type: TEST
Entity: prolactin, Type: TEST
Entity: potassium, Type: TEST
Entity: magnesium, Type: TEST
Entity: Serum ALT ≤, Type: TEST
Entity: upper limit, Type: TEST
Entity: LDL, Type: TEST
Entity: eGFR, Type: TEST
Entity: Additional inclusion criteria, Type: TEST
Entity: functional hypothalamic amenorrhea, Type: PROBLEM
Entity: BMD Z-score, Type: TEST
Entity: Dental check, Type: TEST
Entity: functional hypothalamic amenorrhea, Type: PROBLEM
Entity: Disease, Type: PROBLEM
Entity: FHA, Type: PROBLEM
Entity: affect bone, Type: PROBLEM
Entity: untreated thyroid dysfunction, Type: PROBLEM
Entity: Cushing's disease, Type: PROBLEM
Entity: renal failure, Type: 

In [45]:
# test with cleaned text
df_cleaned = pd.read_csv('FHA_trials_eligibility_criteria_muti_query-cleaned.csv')

In [46]:
df_cleaned.head()

Unnamed: 0,nctId,healthyVolunteers,sex,genderBased,minimumAge,maximumAge,stdAges,samplingMethod,eligibilityCriteria_cleaned,studyPopulation_cleaned
0,NCT06533865,True,FEMALE,,14 Years,25 Years,"['CHILD', 'ADULT']",,inclusion criteria for functional hypothalam...,
1,NCT05410886,False,FEMALE,,18 Years,58 Years,['ADULT'],NON_PROBABILITY_SAMPLE,inclusion criteria any woman 18 58 years o...,all women referred by their gp with amenorrhea...
2,NCT02697136,False,ALL,,18 Years,,"['ADULT', 'OLDER_ADULT']",,main inclusion criteria male and female pa...,
3,NCT00453219,True,FEMALE,,18 Years,35 Years,['ADULT'],NON_PROBABILITY_SAMPLE,inclusion criteria inclusion criteria for ...,women ages 18 35
4,NCT00870350,True,ALL,,14 Years,15 Years,['CHILD'],,inclusion criteria healthy subject 14 15...,


In [48]:
# Same NER check as on the raw text, this time on the cleaned eligibility text of the first row.
clinical_text = df_cleaned.eligibilityCriteria_cleaned[0]
doc = nlp(clinical_text)

# Gather (text, type) pairs for every entity, in sentence order.
entities = [
    (entity.text, entity.type)
    for sentence in doc.sentences
    for entity in sentence.ents
]

# test print detected entities
for entity_text, entity_type in entities:
    print(f"Entity: {entity_text}, Type: {entity_type}")
print("Detected Entities:", len(entities))

Entity: functional hypothalamic amenorrhea, Type: PROBLEM
Entity: an effective non hormonal contraceptive method, Type: TREATMENT
Entity: a progestin releasing intrauterine device, Type: TREATMENT
Entity: systemic skeletal effects, Type: PROBLEM
Entity: the study duration, Type: TEST
Entity: βhcg  pregnancy, Type: PROBLEM
Entity: test, Type: TEST
Entity: tsh, Type: TEST
Entity: prolactin, Type: TEST
Entity: potassium, Type: TEST
Entity: magnesium, Type: TEST
Entity: serum alt, Type: TEST
Entity: ldl, Type: TEST
Entity: egfr, Type: TEST
Entity: additional inclusion criteria, Type: TEST
Entity: functional hypothalamic amenorrhea, Type: PROBLEM
Entity: bmd z score, Type: TEST
Entity: dental check, Type: TEST
Entity: functional hypothalamic amenorrhea, Type: PROBLEM
Entity: controls    disease, Type: PROBLEM
Entity: affect bone, Type: PROBLEM
Entity: untreated thyroid dysfunction  cushing s disease, Type: PROBLEM
Entity: renal failure  diabetes mellitus, Type: PROBLEM
Entity: bisphosphonat

In [32]:
# !pip install pymetamap

In [37]:
# TODO: how to get to an OMOP mapping without an annotator?
# not enough data to train
# closest I found was https://github.com/AnthonyMRios/pymetamap
# saw some discussion about UMLS -> OMOP; not sure if that will work

In [49]:
### New Attempt, use GPT

In [54]:
def get_completion(prompt, model="gpt-4o-mini", temp=0):  #"gpt-3.5-turbo"):
    """Ask the chat model to act as a clinical trial annotator and return its reply text.

    Uses the notebook-level `client`. The request carries a fixed system message
    followed by `prompt` as the user message.

    Args:
        prompt: Text placed in the user message.
        model: Model name passed through to the API call.
        temp: Sampling temperature passed through to the API call.

    Returns:
        The `content` of the first choice's message.
    """
    messages = [
        # "trail" -> "trial": the system role previously contained a typo.
        {"role": "system", "content": "You are a helpful clinical trial annotator."},
        {"role": "user", "content": prompt}
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temp, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [50]:
eligibility_cr = df_cleaned.eligibilityCriteria_cleaned[0]

In [None]:
# Entity classes used when annotating eligibility criteria.
entity_cate = [
    "Condition",
    "Observation",
    "Procedure/Device",
    "Drug/Substance"
]
# Attribute classes that qualify the entities above.
attribute_cate = [
    "Measurement",
    "Temporal constraints",  # was "Temporal Anstraints" (copy/paste garbling)
    "Qualifiers/Qualifiers",  # NOTE(review): label repeats itself - possibly meant "Qualifiers/Modifiers"; confirm against the guideline
    "Anatomic location"
]

# Planned output shape (previously bare prose, which made this cell a syntax error):
# input sentence -> list of dicts
# each dict represents one entity





In [64]:
# Single-trial dry run: embed the current `eligibility_cr` text in the annotation prompt.
prompt = f"""
Your task is to extract and annotate input Eligibility Criteria into this OMOP Common Data Model (CDM) v5 format then in the "The Annotation Guidelines for Concept in Eligibility Criteria according to OMOP CDM v5" paper.

output structured information for entities and relations in JSON format.  Only output a list of the entities that can be parsed by JSON (Note: double quotes). 

eligibility criteria: {eligibility_cr}
"""

# Send the prompt with the model overridden to gpt-4o and print the raw reply text.
response = get_completion(prompt, model="gpt-4o")
print(response)


```json
[
  {
    "criterion": "inclusion",
    "condition": "functional hypothalamic amenorrhea",
    "gender": "female",
    "age_range": "14-25 years",
    "bone_age": "14 years",
    "growth_left": "2 years",
    "reproductive_age": true,
    "contraceptive_method": "effective non-hormonal or progestin-releasing intrauterine device",
    "systemic_skeletal_effects": "none",
    "pregnancy_test": "negative βhcg",
    "tsh": "within normal range",
    "prolactin": "within normal range",
    "potassium": "within normal range",
    "magnesium": "within normal range",
    "serum_alt": "< 3 times upper limit of normal",
    "ldl": "< 190 mg/dl",
    "egfr": "> 30 ml/min",
    "additional_criteria": {
      "menses": "< 3 in preceding 6 months",
      "bmd_z_score": ">= -1.0 at 1 skeletal site",
      "dental_check_up": "within past year"
    }
  },
  {
    "criterion": "exclusion",
    "condition": "functional hypothalamic amenorrhea",
    "disease_affecting_bone": [
      "untreated thy

In [66]:
# Annotate every trial's cleaned eligibility criteria with GPT-4o.
#
# Responses are collected per row and written to the frame once, after the loop.
# Assigning `df_cleaned['ec_parsed'] = response` inside the loop broadcasts one
# string to the whole column, so every row would end up holding the annotation
# of the last trial processed.
ec_parsed = []
for eligibility_cr in df_cleaned.eligibilityCriteria_cleaned:
    prompt = f"""
        Your task is to extract and annotate input Eligibility Criteria into this OMOP Common Data Model (CDM) v5 format then in the "The Annotation Guidelines for Concept in Eligibility Criteria according to OMOP CDM v5" paper.

        output structured information for entities and relations in JSON format.  Only output a list of the entities that can be parsed by JSON (Note: double quotes). 

        eligibility criteria: {eligibility_cr}
    """
    response = get_completion(prompt, model="gpt-4o")
    ec_parsed.append(response)

# A list is assigned positionally: one response per row, in iteration order.
df_cleaned['ec_parsed'] = ec_parsed

In [68]:
df_cleaned.to_csv("FHA_trails_EC_anno_parsed.csv", index=False)

In [70]:
df_cleaned.ec_parsed

0     ```json\n[\n    {\n        "criteria_type": "i...
1     ```json\n[\n    {\n        "criteria_type": "i...
2     ```json\n[\n    {\n        "criteria_type": "i...
3     ```json\n[\n    {\n        "criteria_type": "i...
4     ```json\n[\n    {\n        "criteria_type": "i...
5     ```json\n[\n    {\n        "criteria_type": "i...
6     ```json\n[\n    {\n        "criteria_type": "i...
7     ```json\n[\n    {\n        "criteria_type": "i...
8     ```json\n[\n    {\n        "criteria_type": "i...
9     ```json\n[\n    {\n        "criteria_type": "i...
10    ```json\n[\n    {\n        "criteria_type": "i...
11    ```json\n[\n    {\n        "criteria_type": "i...
12    ```json\n[\n    {\n        "criteria_type": "i...
13    ```json\n[\n    {\n        "criteria_type": "i...
14    ```json\n[\n    {\n        "criteria_type": "i...
15    ```json\n[\n    {\n        "criteria_type": "i...
16    ```json\n[\n    {\n        "criteria_type": "i...
17    ```json\n[\n    {\n        "criteria_type"

In [None]:
## Round 2

In [71]:
# test with cleaned text
df_cleaned = pd.read_csv('FHA_trials_eligibility_criteria_muti_query-cleaned.csv')

In [72]:
df_cleaned.head()

Unnamed: 0,nctId,briefTitle,healthyVolunteers,sex,genderBased,minimumAge,maximumAge,stdAges,samplingMethod,eligibilityCriteria_cleaned,studyPopulation_cleaned
0,NCT05410886,Screening Women for Functional Hypothalamic Am...,False,FEMALE,,18 Years,58 Years,['ADULT'],NON_PROBABILITY_SAMPLE,inclusion criteria any woman 18 58 years o...,all women referred by their gp with amenorrhea...
1,NCT06533865,Romosozumab as an Adjunct to Physiologic Estro...,True,FEMALE,,14 Years,25 Years,"['CHILD', 'ADULT']",,inclusion criteria for functional hypothalam...,
2,NCT00453219,"FHA: Characterization of Metabolic Status, Bra...",True,FEMALE,,18 Years,35 Years,['ADULT'],NON_PROBABILITY_SAMPLE,inclusion criteria inclusion criteria for ...,women ages 18 35
3,NCT02697136,CER-001 Therapy as a Novel Approach to Treat G...,False,ALL,,18 Years,,"['ADULT', 'OLDER_ADULT']",,main inclusion criteria male and female pa...,
4,NCT00870350,"An Immunogenicity and Safety Study of Tetanus,...",True,ALL,,14 Years,15 Years,['CHILD'],,inclusion criteria healthy subject 14 15...,


In [75]:
# Round 2: annotate each trial, asking for entities/relations per single criterion.
#
# Responses are collected per row and written to the frame once, after the loop.
# Assigning `df_cleaned['ec_parsed'] = response` inside the loop broadcasts one
# string to the whole column, so every row would end up holding the annotation
# of the last trial processed.
total_records = len(df_cleaned.eligibilityCriteria_cleaned)
ec_parsed = []
for i, eligibility_cr in enumerate(df_cleaned.eligibilityCriteria_cleaned, 1):
    # Lightweight progress report every 10 records.
    if i % 10 == 0:
        print(f'processing {i} out of {total_records} records')
    prompt = f"""
        Your task is to extract and annotate input Eligibility Criteria into this OMOP Common Data Model (CDM) v5 format then in the "The Annotation Guidelines for Concept in Eligibility Criteria according to OMOP CDM v5" paper.

        output structured information for entities and relations in JSON format.  Only output a list of the entities that can be parsed by JSON (Note: double quotes). We need the extracted NER and relations for each single criterion instead of for the entire list of criteria for a trial. The results should be in a structure, for a trial, we have a list of criteria, for each criterion, we have the list of extracted entities and extracted relations.

        eligibility criteria: {eligibility_cr}
    """
    response = get_completion(prompt, model="gpt-4o")
    ec_parsed.append(response)

# A list is assigned positionally: one response per row, in iteration order.
df_cleaned['ec_parsed'] = ec_parsed

processing 10 out of 113 records
processing 20 out of 113 records
processing 30 out of 113 records
processing 40 out of 113 records
processing 50 out of 113 records
processing 60 out of 113 records
processing 70 out of 113 records
processing 80 out of 113 records
processing 90 out of 113 records
processing 100 out of 113 records
processing 110 out of 113 records


In [76]:
df_cleaned.to_csv("FHA_trails_EC_anno_parsed_2.csv", index=False)

In [None]:
# Round 3: annotate each trial with an explicit per-criterion output structure,
# then save the annotated frame.
#
# Two fixes compared with the earlier draft of this cell:
#   * the prompt is a triple-quoted f-string; a single-quoted f-string cannot
#     span several lines and made the cell a syntax error;
#   * responses are collected per row and assigned once after the loop, because
#     `df_cleaned['ec_parsed'] = response` inside the loop broadcasts one string
#     to every row.
total_records = len(df_cleaned.eligibilityCriteria_cleaned)
ec_parsed = []
for i, eligibility_cr in enumerate(df_cleaned.eligibilityCriteria_cleaned, 1):
    # Lightweight progress report every 10 records.
    if i % 10 == 0:
        print(f'processing {i} out of {total_records} records')
    prompt = f"""Given the following clinical trial eligibility criteria, extract the Named Entities (NER) and the relations for each criterion individually. For each criterion, identify the entities (e.g., age, gender, condition, measurement, test, timeframe, range, etc.) and specify the relations between these entities. Return the output in a structured format where each criterion contains a list of the extracted entities and a list of relations between those entities. Use the following structure:

        criterion: The original trial criterion text.
        entities: A list of extracted entities, each defined by:
            entity_type: The type of entity (e.g., Age, Gender, Condition, Measurement, etc.).
            value: The extracted value of the entity (e.g., '14-25 years', 'Female', 'LDL', etc.).
        relations: A list of relations between entities, each defined by:
            relation_type: The type of relation (e.g., 'Age Range', 'Condition Timeframe', 'Measurement Range', etc.).
            source_entity: The entity that initiates the relation.
            target_entity: The entity that is related to the source entity.

    eligibility criteria: {eligibility_cr}"""
    response = get_completion(prompt, model="gpt-4o")
    ec_parsed.append(response)

# A list is assigned positionally: one response per row, in iteration order.
df_cleaned['ec_parsed'] = ec_parsed
df_cleaned.to_csv("FHA_trails_EC_anno_parsed_3.csv", index=False)

In [77]:
eligibility_cr

'inclusion criteria     patients   18 year old   diagnosis of nfpas confirmed with hormonal and histological analysis   patients who underwent surgery in neurosurgery unit of the reims university hospital between 01 01 1991 and 31 12 2004  exclusion criteria'

In [None]:
# notes:
# parse the criteria one by one
# range relation should be one constraint
# eval of NER / info-extraction metrics -> F1? how to do it without labeling though?
      # manual check: by correcting its results and overriding them
# prompt engineering / optimization
# human annotation / correction

In [78]:
df.head()

Unnamed: 0,nctId,eligibilityCriteria,healthyVolunteers,sex,genderBased,minimumAge,maximumAge,stdAges,studyPopulation,samplingMethod
0,NCT06533865,Inclusion Criteria:\n\nFor functional hypothal...,True,FEMALE,,14 Years,25 Years,"['CHILD', 'ADULT']",,
1,NCT05410886,Inclusion Criteria:\n\n* Any woman 18-58 years...,False,FEMALE,,18 Years,58 Years,['ADULT'],All women referred by their GP with amenorrhea...,NON_PROBABILITY_SAMPLE
2,NCT02697136,Main Inclusion Criteria:\n\n* Male and female ...,False,ALL,,18 Years,,"['ADULT', 'OLDER_ADULT']",,
3,NCT00453219,Inclusion Criteria:\n\n* Inclusion criteria fo...,True,FEMALE,,18 Years,35 Years,['ADULT'],Women ages 18-35,NON_PROBABILITY_SAMPLE
4,NCT00870350,Inclusion Criteria:\n\n* healthy subject\n* 14...,True,ALL,,14 Years,15 Years,['CHILD'],,


In [106]:
df.shape

(113, 11)

In [108]:
df_cleaned.loc[df_cleaned['nctId'] == 'NCT06533865']

Unnamed: 0,nctId,briefTitle,healthyVolunteers,sex,genderBased,minimumAge,maximumAge,stdAges,samplingMethod,eligibilityCriteria_cleaned,studyPopulation_cleaned,ec_parsed
1,NCT06533865,Romosozumab as an Adjunct to Physiologic Estro...,True,FEMALE,,14 Years,25 Years,"['CHILD', 'ADULT']",,inclusion criteria for functional hypothalam...,,"```json\n{\n ""criteria"": [\n {\n ""cri..."


In [110]:
df.iloc[1].eligibilityCriteria

"Inclusion Criteria:\n\nFor functional hypothalamic amenorrhea and controls:\n\n* Female, age 14-25 years, skeletally mature with bone age ≥ 14 years (only 2% of growth left)\n* For women of reproductive age, agree to use an effective non-hormonal contraceptive method or a progestin releasing intrauterine device (no evidence of systemic skeletal effects) for the study duration\n* Negative βHCG (pregnancy test)\n* TSH, prolactin, potassium, magnesium within the normal range\n* Serum ALT ≤ 3 times upper limit of normal, LDL ≤ 190 mg/dl\n* eGFR ≥ 30ml/minute\n\nAdditional inclusion criteria for functional hypothalamic amenorrhea:\n\n* Less than 3 menses in the preceding 6 months\n* BMD Z-score \\< -1.0 at ≥ 1 skeletal site\n* Dental check-up within the past year\n\nExclusion Criteria:\n\nFor functional hypothalamic amenorrhea and controls\n\n* Disease other than FHA known to affect bone, including untreated thyroid dysfunction, Cushing's disease, renal failure, diabetes mellitus\n* Use of

In [99]:
df_cleaned.loc[df_cleaned['nctId'] == 'NCT06533865']

Unnamed: 0,nctId,briefTitle,healthyVolunteers,sex,genderBased,minimumAge,maximumAge,stdAges,samplingMethod,eligibilityCriteria_cleaned,studyPopulation_cleaned,ec_parsed
1,NCT06533865,Romosozumab as an Adjunct to Physiologic Estro...,True,FEMALE,,14 Years,25 Years,"['CHILD', 'ADULT']",,inclusion criteria for functional hypothalam...,,"```json\n{\n ""criteria"": [\n {\n ""cri..."


In [101]:
df_cleaned.iloc[1].eligibilityCriteria_cleaned

'inclusion criteria   for functional hypothalamic amenorrhea and controls     female  age 14 25 years  skeletally mature with bone age   14 years  only 2  of growth left    for women of reproductive age  agree to use an effective non hormonal contraceptive method or a progestin releasing intrauterine device  no evidence of systemic skeletal effects  for the study duration   negative βhcg  pregnancy test    tsh  prolactin  potassium  magnesium within the normal range   serum alt   3 times upper limit of normal  ldl   190 mg dl   egfr   30ml minute  additional inclusion criteria for functional hypothalamic amenorrhea     less than 3 menses in the preceding 6 months   bmd z score     1 0 at   1 skeletal site   dental check up within the past year  exclusion criteria   for functional hypothalamic amenorrhea and controls    disease other than fha known to affect bone  including untreated thyroid dysfunction  cushing s disease  renal failure  diabetes mellitus   use of bisphosphonates   use 

In [112]:
import re
text = df.iloc[1].eligibilityCriteria #df.eligibilityCriteria[0]

def extract_criteria(text):
    """Split eligibility text into chunks at each newline followed by a '* ' bullet.

    Args:
        text: Raw eligibility criteria string.

    Returns:
        The chunks between bullet markers, whitespace-stripped, with empty
        chunks dropped. Text that is not preceded by a bullet marker (such as
        section headers) stays attached to the neighbouring chunk.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r'\n\* ', text))
    return [chunk for chunk in stripped_chunks if chunk]

# Extract criteria
criteria_list = extract_criteria(text)

# Print each criterion, prefixed with its 1-based position.
# The counter is now interpolated; the format string previously printed a literal "i".
for i, criterion in enumerate(criteria_list, 1):
    print(f"--------- {i} : {criterion}\n")

--------- i : Inclusion Criteria:

For functional hypothalamic amenorrhea and controls:

--------- i : Female, age 14-25 years, skeletally mature with bone age ≥ 14 years (only 2% of growth left)

--------- i : For women of reproductive age, agree to use an effective non-hormonal contraceptive method or a progestin releasing intrauterine device (no evidence of systemic skeletal effects) for the study duration

--------- i : Negative βHCG (pregnancy test)

--------- i : TSH, prolactin, potassium, magnesium within the normal range

--------- i : Serum ALT ≤ 3 times upper limit of normal, LDL ≤ 190 mg/dl

--------- i : eGFR ≥ 30ml/minute

Additional inclusion criteria for functional hypothalamic amenorrhea:

--------- i : Less than 3 menses in the preceding 6 months

--------- i : BMD Z-score \< -1.0 at ≥ 1 skeletal site

--------- i : Dental check-up within the past year

Exclusion Criteria:

For functional hypothalamic amenorrhea and controls

--------- i : Disease other than FHA known 

In [114]:
def extract_criteria_with_labels(text):
    """Return bulleted criteria prefixed with 'Inclusion: ' or 'Exclusion: '.

    The text is split on blank lines. A chunk containing 'Inclusion Criteria:'
    or 'Exclusion Criteria:' only switches the active label (any other content
    in that chunk is ignored). In every other chunk, each '*' or '•' marker
    followed by whitespace yields one criterion. Chunks seen before the first
    header are skipped.

    Args:
        text: Raw eligibility criteria string.

    Returns:
        A list of strings of the form '<label>: <criterion>'.
    """
    # Checked in this order, so the inclusion header wins if both appear in one chunk.
    header_labels = (
        ('Inclusion Criteria:', 'Inclusion'),
        ('Exclusion Criteria:', 'Exclusion'),
    )

    labeled = []
    active_label = None

    for chunk in re.split(r'\n\n', text):
        header_label = next(
            (label for marker, label in header_labels if marker in chunk),
            None,
        )
        if header_label is not None:
            active_label = header_label
            continue

        # Nothing to attribute criteria to until a header has been seen.
        if not active_label:
            continue

        for raw_item in re.findall(r'[\*\•]\s+(.*)', chunk):
            cleaned = raw_item.strip()
            if cleaned:
                labeled.append(f"{active_label}: {cleaned}")

    return labeled

# Extract criteria with labels
labeled_criteria = extract_criteria_with_labels(text)

# Print each labeled criterion
for i, criterion in enumerate(labeled_criteria, 1):
    print(f" {i}: {criterion}\n")


 1: Inclusion: Female, age 14-25 years, skeletally mature with bone age ≥ 14 years (only 2% of growth left)

 2: Inclusion: For women of reproductive age, agree to use an effective non-hormonal contraceptive method or a progestin releasing intrauterine device (no evidence of systemic skeletal effects) for the study duration

 3: Inclusion: Negative βHCG (pregnancy test)

 4: Inclusion: TSH, prolactin, potassium, magnesium within the normal range

 5: Inclusion: Serum ALT ≤ 3 times upper limit of normal, LDL ≤ 190 mg/dl

 6: Inclusion: eGFR ≥ 30ml/minute

 7: Inclusion: Less than 3 menses in the preceding 6 months

 8: Inclusion: BMD Z-score \< -1.0 at ≥ 1 skeletal site

 9: Inclusion: Dental check-up within the past year

 10: Exclusion: Disease other than FHA known to affect bone, including untreated thyroid dysfunction, Cushing's disease, renal failure, diabetes mellitus

 11: Exclusion: Use of bisphosphonates

 12: Exclusion: Use of other medications known to affect bone metabolism 

In [117]:
df.iloc[2].nctId, df.iloc[2].eligibilityCriteria 

('NCT00453219',
 'Inclusion Criteria:\n\n* Inclusion criteria for participation are a gynecological age (age since menarche) \\> 5 and \\< 25 years, and chronological age \\> 18 years, within 90-110% of ideal body weight as determined by the 1983 Metropolitan height and weight table for women, and exercise \\< 10 h/wk and run \\< 10 mi/wk, day-awake/night-asleep schedule.\n* Women in the FHA and PCOS groups have to fulfill the diagnostic criteria of FHA or PCOS and to have all other causes of amenorrhea and anovulation excluded.\n\nExclusion Criteria:\n\n* Exclusion criteria are smoking, medications, including psychotropic or illicit drugs, medical, neurological, or ophthalmologic disease except acuity problems, a weight loss or gain of \\> 10 lb within a year preceding or since the onset of amenorrhea, a major Axis I disorder other than depression, parturition in the last 12 months and/or lactating in the last 6 months.')

In [118]:
text2 = df.iloc[2].eligibilityCriteria 
labeled_criteria = extract_criteria_with_labels(text2)

# Print each labeled criterion
for i, criterion in enumerate(labeled_criteria, 1):
    print(f" {i}: {criterion}\n")

 1: Inclusion: Inclusion criteria for participation are a gynecological age (age since menarche) \> 5 and \< 25 years, and chronological age \> 18 years, within 90-110% of ideal body weight as determined by the 1983 Metropolitan height and weight table for women, and exercise \< 10 h/wk and run \< 10 mi/wk, day-awake/night-asleep schedule.

 2: Inclusion: Women in the FHA and PCOS groups have to fulfill the diagnostic criteria of FHA or PCOS and to have all other causes of amenorrhea and anovulation excluded.

 3: Exclusion: Exclusion criteria are smoking, medications, including psychotropic or illicit drugs, medical, neurological, or ophthalmologic disease except acuity problems, a weight loss or gain of \> 10 lb within a year preceding or since the onset of amenorrhea, a major Axis I disorder other than depression, parturition in the last 12 months and/or lactating in the last 6 months.



In [120]:
df.iloc[12].nctId, df.iloc[12].eligibilityCriteria 

('NCT02871986',
 "Inclusion Criteria:\n\n1. Have a diagnosis of hypogonadism (Turner's syndrome, hypogonadotrophic hypogonadism, primary ovarian insufficiency, hypopituitarism, hypothalamic amenorrhoea, transgender)\n2. ≥ 10 years of age\n3. Oestrogen naïve i.e. no prior commencement of oestrogen treatment\n4. Breast Tanner stage ≤ than 2\n\nExclusion Criteria:\n\n1. Previous oncology treatment\n2. Primary amenorrhoea secondary to chronic medical comorbidity\n3. PCOS diagnosis")

In [125]:
def extract_criteria_with_labels(text):
    """Split an eligibility-criteria blob into labelled list items.

    The text is scanned line by line. A line containing ``'Inclusion Criteria:'``
    or ``'Exclusion Criteria:'`` switches the current label and is itself
    discarded. Every later line that starts with ``"<digits>. "``, ``"* "`` or
    ``"• "`` is emitted as ``"<label>: <item text>"``.

    Working per line (instead of per blank-line-separated paragraph) means that
    items are kept even when they sit directly under the header without a blank
    line, numbered and bulleted items keep their original relative order, and an
    empty item cannot swallow the line that follows it.

    Args:
        text: Raw eligibility-criteria string. Any non-string value (for
            example ``None`` or a NaN from a missing cell) yields an empty list.

    Returns:
        list[str]: Labelled criteria in the order they appear in ``text``.
        Lines before the first header, lines that are not list items, and
        items whose text is empty are ignored.
    """
    # Header markers that switch the active label
    inclusion_section = 'Inclusion Criteria:'
    exclusion_section = 'Exclusion Criteria:'

    # Missing values have no criteria to extract.
    if not isinstance(text, str):
        return []

    # A list item: "<digits>." or a "*" / "•" bullet, then whitespace, then the text.
    # Anchored at the start of the line through ``match``.
    item_pattern = re.compile(r'(?:\d+\.|[*•])\s+(.*)')

    criteria = []
    current_label = None

    for line in text.split('\n'):
        if inclusion_section in line:
            current_label = 'Inclusion'
            continue
        if exclusion_section in line:
            current_label = 'Exclusion'
            continue

        # Nothing is collected until a header has been seen.
        if current_label is None:
            continue

        found = item_pattern.match(line)
        if found is None:
            continue

        # strip() also removes a trailing '\r' left over from CRLF line endings.
        item = found.group(1).strip()
        if item:  # Skip empty items
            criteria.append(f"{current_label}: {item}")

    return criteria

# Label every criterion found in `text` as Inclusion or Exclusion.
labeled_criteria = extract_criteria_with_labels(text)

# Numbered listing (1-based), one criterion per line plus a blank line.
for i, criterion in enumerate(labeled_criteria, start=1):
    print(f" {i}: {criterion}\n")


 1: Inclusion: Female, age 14-25 years, skeletally mature with bone age ≥ 14 years (only 2% of growth left)

 2: Inclusion: For women of reproductive age, agree to use an effective non-hormonal contraceptive method or a progestin releasing intrauterine device (no evidence of systemic skeletal effects) for the study duration

 3: Inclusion: Negative βHCG (pregnancy test)

 4: Inclusion: TSH, prolactin, potassium, magnesium within the normal range

 5: Inclusion: Serum ALT ≤ 3 times upper limit of normal, LDL ≤ 190 mg/dl

 6: Inclusion: eGFR ≥ 30ml/minute

 7: Inclusion: Less than 3 menses in the preceding 6 months

 8: Inclusion: BMD Z-score \< -1.0 at ≥ 1 skeletal site

 9: Inclusion: Dental check-up within the past year

 10: Exclusion: Disease other than FHA known to affect bone, including untreated thyroid dysfunction, Cushing's disease, renal failure, diabetes mellitus

 11: Exclusion: Use of bisphosphonates

 12: Exclusion: Use of other medications known to affect bone metabolism 

In [124]:
# Re-run the extractor on the second sample text.
labeled_criteria = extract_criteria_with_labels(text2)

# Numbered listing (1-based), one criterion per line plus a blank line.
for i, criterion in enumerate(labeled_criteria, start=1):
    print(f" {i}: {criterion}\n")

 1: Inclusion: Inclusion criteria for participation are a gynecological age (age since menarche) \> 5 and \< 25 years, and chronological age \> 18 years, within 90-110% of ideal body weight as determined by the 1983 Metropolitan height and weight table for women, and exercise \< 10 h/wk and run \< 10 mi/wk, day-awake/night-asleep schedule.

 2: Inclusion: Women in the FHA and PCOS groups have to fulfill the diagnostic criteria of FHA or PCOS and to have all other causes of amenorrhea and anovulation excluded.

 3: Exclusion: Exclusion criteria are smoking, medications, including psychotropic or illicit drugs, medical, neurological, or ophthalmologic disease except acuity problems, a weight loss or gain of \> 10 lb within a year preceding or since the onset of amenorrhea, a major Axis I disorder other than depression, parturition in the last 12 months and/or lactating in the last 6 months.



In [131]:
# Eligibility text of the trial at row position 12, split into labelled items.
text3 = df["eligibilityCriteria"].iloc[12]
labeled_criteria = extract_criteria_with_labels(text3)

# Numbered listing (1-based), one criterion per line plus a blank line.
for i, criterion in enumerate(labeled_criteria, start=1):
    print(f" {i}: {criterion}\n")

 1: Inclusion: Have a diagnosis of hypogonadism (Turner's syndrome, hypogonadotrophic hypogonadism, primary ovarian insufficiency, hypopituitarism, hypothalamic amenorrhoea, transgender)

 2: Inclusion: ≥ 10 years of age

 3: Inclusion: Oestrogen naïve i.e. no prior commencement of oestrogen treatment

 4: Inclusion: Breast Tanner stage ≤ than 2

 5: Exclusion: Previous oncology treatment

 6: Exclusion: Primary amenorrhoea secondary to chronic medical comorbidity

 7: Exclusion: PCOS diagnosis



In [140]:
# NER prompt: read the whole prompt template from disk into `prompt_ner`.
file_path = 'ner.py'

with open(file_path, mode='r') as prompt_file:
    prompt_ner = prompt_file.read()

In [139]:
# Relation prompt: read the whole prompt template from disk into `prompt_re`.
file_path = 're.py'

with open(file_path, mode='r') as prompt_file:
    prompt_re = prompt_file.read()

In [132]:
# Eligibility text of the trial at row position 12, split into labelled items.
text3 = df["eligibilityCriteria"].iloc[12]
labeled_criteria = extract_criteria_with_labels(text3)

# Numbered listing (1-based), one criterion per line plus a blank line.
for i, criterion in enumerate(labeled_criteria, start=1):
    print(f" {i}: {criterion}\n")

 1: Inclusion: Have a diagnosis of hypogonadism (Turner's syndrome, hypogonadotrophic hypogonadism, primary ovarian insufficiency, hypopituitarism, hypothalamic amenorrhoea, transgender)

 2: Inclusion: ≥ 10 years of age

 3: Inclusion: Oestrogen naïve i.e. no prior commencement of oestrogen treatment

 4: Inclusion: Breast Tanner stage ≤ than 2

 5: Exclusion: Previous oncology treatment

 6: Exclusion: Primary amenorrhoea secondary to chronic medical comorbidity

 7: Exclusion: PCOS diagnosis



In [138]:
# Send each labelled criterion through the NER prompt and print the criterion
# followed by the model's reply.
for i, criterion in enumerate(labeled_criteria, start=1):
    print(f" {i}: {criterion}\n")
    # Prompt = NER template, then ": ", then the criterion text.
    prompt = "".join((prompt_ner, ": ", criterion))
    response = get_completion(prompt, model="gpt-4o")
    print(f" {i} parsed:  {response}\n\n")

 1: Inclusion: Have a diagnosis of hypogonadism (Turner's syndrome, hypogonadotrophic hypogonadism, primary ovarian insufficiency, hypopituitarism, hypothalamic amenorrhoea, transgender)

 1 parsed:  ```json
[
    {
        "entity": "diagnosis",
        "category": "Observation"
    },
    {
        "entity": "hypogonadism",
        "category": "Condition"
    },
    {
        "entity": "Turner's syndrome",
        "category": "Condition"
    },
    {
        "entity": "hypogonadotrophic hypogonadism",
        "category": "Condition"
    },
    {
        "entity": "primary ovarian insufficiency",
        "category": "Condition"
    },
    {
        "entity": "hypopituitarism",
        "category": "Condition"
    },
    {
        "entity": "hypothalamic amenorrhoea",
        "category": "Condition"
    },
    {
        "entity": "transgender",
        "category": "Condition"
    }
]
```


 2: Inclusion: ≥ 10 years of age

 2 parsed:  ```json
[
    {
        "entity": "≥ 10 years",
    

In [142]:
# List the column labels of the trials DataFrame.
df.columns

Index(['nctId', 'briefTitle', 'eligibilityCriteria', 'healthyVolunteers',
       'sex', 'genderBased', 'minimumAge', 'maximumAge', 'stdAges',
       'studyPopulation', 'samplingMethod'],
      dtype='object')

In [148]:
# Start from an empty results frame with the three output columns.
result_columns = ['nctId', 'ec_single', 'relation']
sample_df = pd.DataFrame(columns=result_columns)

In [None]:
# Run the NER -> relation-extraction prompts over the first trials of `df` and
# collect one result row per extracted criterion.
#
# The rows are gathered in a plain list and `sample_df` is rebuilt from scratch
# each time this cell runs, so a re-run can never leave stale rows from an
# earlier, longer run behind (which writing by position into an existing frame did).
N_TRIALS = 5  # number of trials taken from the head of df

records = []
try:
    trials = df.head(N_TRIALS)[['nctId', 'eligibilityCriteria']]
    for nct_id, ec in trials.itertuples(index=False, name=None):
        labeled_criteria = extract_criteria_with_labels(ec)
        for i, criterion in enumerate(labeled_criteria, 1):
            print(i)  # progress: position of the criterion within the current trial

            # Step 1: entity extraction on the single labelled criterion.
            prompt = prompt_ner + ": " + criterion
            ner_response = get_completion(prompt, model="gpt-4o")

            # Step 2: relation extraction on the NER output.
            prompt2 = prompt_re + ": " + ner_response
            re_response = get_completion(prompt2, model="gpt-4o")

            # NOTE(review): the 'ec_single' column receives the NER response, not
            # the raw criterion text -- confirm this is the intended content.
            records.append((nct_id, ner_response, re_response))
finally:
    # Build the frame even if a model call fails or the cell is interrupted, so
    # the responses collected so far are not lost.
    sample_df = pd.DataFrame(records, columns=['nctId', 'ec_single', 'relation'])

1
2
3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
1
2
3
1
2
3
4
5
6
7
8
9
10
11
12
13
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [None]:
# Preview the first rows of the collected results.
sample_df.head()

In [None]:
# Write the collected rows to CSV without the DataFrame index column.
# NOTE(review): the file name says "5trails" -- probably meant "5trials"; left
# unchanged in case other code reads this path.
sample_df.to_csv("sample_df_5trails.csv", index=False)