## Graph Information Extraction from clinical notes


In [1]:
# Verify environment after fixing
import numpy as np
import transformers
import datasets
import os
import traceback

print(f"NumPy version: {np.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")

# Also check if packages can be imported without errors
try:
    # Try to import some commonly used modules
    import pandas as pd
    from gliner import GLiNER
    # from relik import Relik

    print("\nAll required packages imported successfully!")
    print("The NumPy compatibility issue appears to be resolved.")

    # Check if the data directory exists
    data_dir = "src/data"
    if os.path.exists(data_dir):
        print(f"\nData directory '{data_dir}' exists.")
        # List any files in the directory
        files = os.listdir(data_dir)
        if files:
            print(f"Files in directory: {', '.join(files)}")
        else:
            print("No files in directory yet.")
    else:
        print(f"\nData directory '{data_dir}' does not exist yet. It will be created when running the extraction.")
    
except Exception as e:
    print(f"\nError importing packages: {e}")
    print("You may need to reinstall additional packages or restart the kernel.")

NumPy version: 2.2.5
Transformers version: 4.51.3
Datasets version: 3.6.0

All required packages imported successfully!
The NumPy compatibility issue appears to be resolved.

Data directory 'src/data' does not exist yet. It will be created when running the extraction.


In [2]:
from datasets import load_dataset
# from relik.inference.data.objects import RelikOutput
# from relik.retriever.indexers.document import Document


dataset = load_dataset(
    'AGBonnet/augmented-clinical-notes',
    split='train',
)

# sample = dataset.__iter__().__next__()

def nested_print(key, element, level=0):
    if isinstance(element, dict):
        print(f'{"│ "*(level)}├─{key}:')
        for k, v in element.items():
            nested_print(k, v, level+1)
    else:
        print(f'{"│ "*(level)}├─{key}: {element}')

# nested_print('sample', sample)

# Print the first 5 elements of the dataset using nested_print
for i, sample in enumerate(dataset):
    if i < 5:
        print(f"\nSample {i+1}:")
        nested_print('sample', sample)
    else:
        break


Sample 1:
├─sample:
│ ├─idx: 155216
│ ├─note: A a sixteen year-old girl, presented to our Outpatient department with the complaints of discomfort in the neck and lower back as well as restriction of body movements. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustained contraction of the neck muscles. There was a sideways bending of the back in the lumbar region. To counter the abnormal positioning of the back and neck, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.
She had been experiencing these difficulties for the past four months since when she was introdu

In [3]:
# extract idx and full_note
def extract_idx_full_note(sample):
    idx = sample['idx']
    full_note = sample['full_note']
    return {
        'idx': idx,
        'full_note': full_note
    }

# extract idx and full_note from the sample
dataset = dataset.map(
    extract_idx_full_note,
    remove_columns=dataset.column_names,
    batch_size=1000)

# Print the first 5 elements of the dataset using nested_print
for i, sample in enumerate(dataset):
    if i < 5:
        print(f"\nSample {i+1}:")
        nested_print('sample', sample)
    else:
        break


Sample 1:
├─sample:
│ ├─idx: 155216
│ ├─full_note: A a sixteen year-old girl, presented to our Outpatient department with the complaints of discomfort in the neck and lower back as well as restriction of body movements. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustained contraction of the neck muscles. There was a sideways bending of the back in the lumbar region. To counter the abnormal positioning of the back and neck, she would keep her limbs in a specific position to allow her body weight to be supported. Due to the restrictions with the body movements at the neck and in the lumbar region, she would require assistance in standing and walking. She would require her parents to help her with daily chores, including all activities of self-care.\nShe had been experiencing these difficulties for the past four months since when she was i

In [4]:
# Model
gliner_model = GLiNER.from_pretrained("Ihor/gliner-biomed-bi-large-v1.0")

# Labels for the GLiNER model
labels = ["Disease or Condition", "Medication", "Medication Dosage and Frequency", "Procedure", "Lab Test", "Lab Test Result", "Body Site", "Medical Device", "Demographic Information"]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/8.39k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/6.06k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

trainer_state.json:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [5]:
# Load the Relik Relation Extraction model
# relik_relation_extract = Relik.from_pretrained("sapienzanlp/relik-relation-extraction-nyt-large")

In [6]:
# from synthIE.models import GenIEFlanT5PL

# model_path = os.path.join(os.getcwd(), '..', 'models', 'synthie_large_fe.ckpt')
# print(f"Model path: {model_path}")
# if os.path.exists(model_path):
#     model = GenIEFlanT5PL.load_from_checkpoint(checkpoint_path=model_path)

In [7]:
from mlx_lm import load, generate

model_name = "google/medgemma-4b-it"
medgemma_model, tokenizer = load(model_name)

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf.co/repos/3d/df/3ddf3ad8b796b6d5ada0850f03115c053532413d0d668c795d595ebcf54cd23e/1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27tokenizer.model%3B+filename%3D%22tokenizer.model%22%3B&Expires=1748266260&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0ODI2NjI2MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzNkL2RmLzNkZGYzYWQ4Yjc5NmI2ZDVhZGEwODUwZjAzMTE1YzA1MzUzMjQxM2QwZDY2OGM3OTVkNTk1ZWJjZjU0Y2QyM2UvMTI5OWMxMWQ3Y2Y2MzJlZjNiNGUxMTkzNzUwMTM1OGFkYTAyMWJiZGY3YzQ3NjM4ZDEzYzBlZTk4MmYyZTc5Yz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=hLoKXwZ2TCSj0FXAAplZLBkxS0IfvZ93hqEBNr7F%7E4KNAKKWrXet-5G0HL%7E37oNl5FX3NRKURHfKsp2OXoOFkylZAmo2OlSjLDMh16vGoLRcADWLivn36wDMel9ksHe-5oSaxhs6GXHiENqRlPlbuFkA23TC7flTTOATKoGcQgeLpA8JhI6NP8G9amyTibnWKd6XV%7EWXW4Xk477KJo8u5sCViiNPdLWPpqK9ilhh%7EFnlTfl75FwT6o9E3XrI22Nt

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf.co/repos/3d/df/3ddf3ad8b796b6d5ada0850f03115c053532413d0d668c795d595ebcf54cd23e/359b85680f8fe50feedb13cb3d7044f6cd690a967b4c07cb0c24201b0ae7fd87?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00001-of-00002.safetensors%3B+filename%3D%22model-00001-of-00002.safetensors%22%3B&Expires=1748266260&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0ODI2NjI2MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzNkL2RmLzNkZGYzYWQ4Yjc5NmI2ZDVhZGEwODUwZjAzMTE1YzA1MzUzMjQxM2QwZDY2OGM3OTVkNTk1ZWJjZjU0Y2QyM2UvMzU5Yjg1NjgwZjhmZTUwZmVlZGIxM2NiM2Q3MDQ0ZjZjZDY5MGE5NjdiNGMwN2NiMGMyNDIwMWIwYWU3ZmQ4Nz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=kZlaBN5Xr9RH9BQADscFw3v6kyuIHGDuTbjyKckqwxXWKI3p-HXph0t-LyffQfEHBACy9tdHZGNXP6jMmMSg7mMx-eaJlo%7EIIO3vsTl2LP19C682OsrwGjPl1lUEPiQPC1E6E37u4Q8mmvYTq2ITG9MFxWargCJjg5A0ONF7psFiKGMkwE39BtiDSkztOVjW6PcaYUxnX26yMiFzRCrslbgpLqMa4cghQwOR

model-00001-of-00002.safetensors:   1%|1         | 52.4M/5.01G [00:00<?, ?B/s]

In [8]:
# --- NER Function ---
def perform_ner(text_note, model, label_list):
  """Performs NER on a text using, gliner model, and label list.
      https://huggingface.co/Ihor/gliner-biomed-bi-large-v1.0
  """

  entities = model.predict_entities(
      text_note,
      labels=label_list,
      threshold=0.5,
  )

  return entities

In [9]:
import wikipedia

def fetch_entity_descriptions(entity):
    """Fetch description of entity['text'] from Wikipedia knowledge base
    
    Args:
        entity (dict): Entity dictionary containing 'text' and other fields

    Returns:
        dict or None: Returns the entity with added description if found, None otherwise
    """
    try:
        
        # Get entity text
        query = entity['text']
        
        # Try to get a summary from Wikipedia
        try:
            # Search for the page
            search_results = wikipedia.search(query, results=1)
            if search_results:
                # Get the page summary
                page = wikipedia.page(search_results[0], auto_suggest=False)
                description = page.summary[0:200] + "..." if len(page.summary) > 200 else page.summary
                
                # Add description to entity
                entity['description'] = description
                entity['wiki_page'] = page.url
                return entity
        except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
            # Entity not found or ambiguous
            pass

        return None  # Will be filtered out
    except Exception as e:
        print(f"Error fetching description for {entity['text']}: {str(e)}")
        return None



In [10]:
file_path = "data/ner_output.txt"

# Clear previous contents if needed
with open(file_path, "w") as f:
  f.write("Starting extraction process\n")
  f.write("="*50 + "\n\n")

# Process a limited number of samples for testing
for i, sample_item in enumerate(dataset):
  try:
    # Limit to first 5 samples for testing
    if i >= 3:
      break
      
    note_id = sample_item['idx']
    full_note_content = sample_item['full_note']

    print(f"Processing note {note_id} ({i+1}/5)...")

    if not full_note_content or not full_note_content.strip():
      print(f"Note {note_id}: Skipped (empty).")
      continue
      
    # Perform NER
    # TODO: Run NER in parallel for all notes like Entity Linking??
    extracted_entities = perform_ner(full_note_content, gliner_model, labels)

    # Remove duplicate entities
    extracted_entities = list({entity['text']: entity for entity in extracted_entities}.values())
    print(f"Extracted {len(extracted_entities)} entities from note {note_id}.")
    if not extracted_entities:
      print(f"Note {note_id}: No entities extracted.")
      continue
    # Entity Linking
    extracted_entities = list(filter(fetch_entity_descriptions, extracted_entities))

    
    # Convert entities to documents
    # extracted_entities = list(map(convert_entities_to_documents, extracted_entities))

    # Entity linking
    # relik_relation_extracted_output: RelikOutput = relik_relation_extract(
    #   text=full_note_content,
    #   candidates=extracted_entities,
    # )


    prompt = f""" I would like you to perform a Closed Information Extraction task on the following clinical note {full_note_content}
    You are provided with a list of entities extracted from the note in the form of JSON objects. {extracted_entities}
    Your task is to reason about the entities provided and the full clinical note content, and generate triplets of the form (entity1, relation, entity2)
    that represent the relationships between the entities in the note.
    Please return the triplets in the following format:
    [
        (entity1, relation, entity2),
        (entity1, relation, entity2),
        ...
    ]
    """
    messages = [
            {
                "role": "system",
                "content": "You are a helpful medical assistant."
            },
            {
                "role": "user",
                "content": prompt
            }]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True
    )

    # Generate text with MLX model
    generation = generate(
        medgemma_model,
        tokenizer,
        prompt=inputs,
        verbose=True,
    )

    # decoded = tokenizer.decode(generation)
    # print(generation)

    # Write to file
    with open(file_path, "a") as f:
      f.write(f"Note ID: {note_id}\n")
      f.write(f"Full Note Content (excerpt): {full_note_content[:500]}...\n")

      # Print some entity examples if available
      if extracted_entities:
        f.write("Entity Examples:\n")
        f.write(f"Number of Extracted Entities: {len(extracted_entities)}\n")
        f.write(f"  - {extracted_entities[:5]}\n")

      if generation:
        f.write("Generated Triplets:\n")
        f.write(f" Number of Triplets: {len(generation)}\n")
        f.write(f"  - {generation}\n")

      f.write("\n" + "="*50 + "\n")

    print(f"Note {note_id}: Processed successfully with {len(extracted_entities)} entities.")

  except Exception as e:
    print(f"Error processing note: {str(e)}")
    traceback.print_exc()

    # Log the error
    with open(file_path, "a") as f:
      f.write(f"ERROR processing note {note_id if 'note_id' in locals() else 'unknown'}: {str(e)}\n")
      f.write(traceback.format_exc())
      f.write("\n" + "="*50 + "\n")

print("Processing complete. Check src/data/ner_output.txt for results.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing note 155216 (1/5)...
Extracted 26 entities from note 155216.
Here are some triplets extracted from the clinical note:

[
    ('neck', 'is part of', 'body'),
    ('lower back', 'is part of', 'body'),
    ('lumbar region', 'is part of', 'body'),
    ('olanzapine tablets', 'treats', 'bipolar affective disorder'),
    ('olanzapine tablets', 'treats', 'schizophrenia'),
    ('olanzapine tablets', 'treats', 'aggressive and self-injurious behaviors'),
    ('olanzapine tablets', 'causes', 'neck pain'),
    ('olanzapine tablets', 'causes', 'abnormal neck muscle contraction'),
    ('olanzapine tablets', 'causes', 'head turning'),
    ('olanzapine tablets', 'causes', 'back bending'),
    ('olanzapine tablets', 'causes', 'limbs in specific position'),
    ('olanzapine tablets', 'causes', 'assistance in standing and walking'),
    ('olanzapine tablets', 'causes', 'parents to help with daily chores'),
    ('olanzapine tablets', 'causes', 'amenorrhea'),
    ('olanzapine tablets', 'causes', 



  lis = BeautifulSoup(html).find_all('li')


Here are some triplets extracted from the clinical note:

[
    ('left hip joint', 'affected by', 'idiopathic osteonecrosis of the femoral head'),
    ('left hip', 'underwent', 'THA'),
    ('left hip', 'underwent', 'second THA'),
    ('contralateral side', 'affected by', 'idiopathic osteonecrosis of the femoral head'),
    ('left hip', 'underwent', 'Repeat MRI'),
    ('left hip', 'underwent', 'second surgery'),
    ('contralateral hip joint', 'affected by', 'idiopathic osteonecrosis of the femoral head'),
    ('contralateral hip joint', 'underwent', 'second surgery'),
    ('second surgery', 'performed on', 'contralateral hip joint'),
    ('second surgery', 'performed on', 'contralateral hip joint'),
    ('second surgery', 'performed on', 'contralateral hip joint'),
    ('second surgery', 'performed on', 'contralateral hip joint'),
    ('second surgery', 'performed on', 'contralateral hip joint'),
    ('second surgery', 'performed on', 'contralateral hip joint'),
Prompt: 4498 tokens, 18