In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/KLTN/NER-medical-text/")

Mounted at /content/drive


In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

In [None]:
from transformers import pipeline

In [None]:
from config import entity_to_acronyms, acronyms_to_entities

## Model Definition

In [None]:
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)

ner_model = TFAutoModelForTokenClassification.from_pretrained(model_name, from_pt=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/266M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForTokenClassification.

All the weights of TFDistilBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForTokenClassification for predictions without further training.


In [None]:
pipe = pipeline("ner", model = ner_model, tokenizer = tokenizer, aggregation_strategy = "simple", device = 0 )

In [None]:
result = pipe("""CASE: A 28-year-old previously healthy man presented with a 6-week history of palpitations.
      The symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time
      and were associated with dyspnea. Except for a grade 2/6 holosystolic tricuspid regurgitation
      murmur (best heard at the left sternal border with inspiratory accentuation), physical
      examination yielded unremarkable findings.""")

In [None]:
result

## Define training parameters

In [None]:
BATCH_SIZE = 16
NUM_EPOCHS = 20
LEARNING_RATE = 1e-5


## Prepare the dataset to fine tune the Pretrained DistilBERT base uncased

In [None]:
MAX_LENGTH = 100

In [None]:
bio_files_dir = 'data/bio_data_files'

In [None]:
import os
import numpy as np

def read_file(file_path):
    """Helper function to read data from a single file."""
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        sentences = []
        labels = []
        sentence = []
        label = []
        for line in lines:
            if line == '\n':
                if sentence:
                    sentences.append(" ".join(sentence))
                    labels.append(" ".join(label))
                    sentence = []
                    label = []
            else:
                word, tag = line.strip().split("\t")
                sentence.append(word)
                if tag != 'O':
                    tag = tag[:2] + acronyms_to_entities[tag[2:]]
                label.append(tag)
        if sentence:
            sentences.append(" ".join(sentence))
            labels.append(" ".join(label))
        return sentences, labels

def prepare_data(directory_path):
    """Read data from all files in the given directory and prepare for fine-tuning."""
    train_sentences = []
    train_labels = []
    val_sentences = []
    val_labels = []
    test_sentences = []
    test_labels = []
    for i, filename in enumerate(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        sentences, labels = read_file(file_path)
        if i % 5 == 0:  # 20% of data for validation
            val_sentences.extend(sentences)
            val_labels.extend(labels)
        elif i % 5 == 1:  # 20% of data for testing
            test_sentences.extend(sentences)
            test_labels.extend(labels)
        else:  # 60% of data for training
            train_sentences.extend(sentences)
            train_labels.extend(labels)

    return train_sentences, train_labels, val_sentences, val_labels, test_sentences, test_labels

In [None]:
train_sentences, train_labels, val_sentences, val_labels, test_sentences, test_labels =  prepare_data(bio_files_dir)

In [None]:
len(test_sentences) + len(train_sentences) + len(val_sentences)

4541

In [None]:
train_data = {"input_ids": tokenizer(train_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["input_ids"],
                  "attention_mask": tokenizer(train_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["attention_mask"],
                  "labels": np.array([[ner_model.config.label2id[token] for token in label.split()] for label in train_labels], dtype='object')}

val_data = {"input_ids": tokenizer(val_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["input_ids"],
                "attention_mask": tokenizer(val_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["attention_mask"],
                "labels": np.array([[ner_model.config.label2id[token] for token in label.split()] for label in val_labels], dtype='object')}

test_data = {"input_ids": tokenizer(test_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["input_ids"],
                 "attention_mask": tokenizer(test_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["attention_mask"],
                 "labels": np.array([[ner_model.config.label2id[token] for token in label.split()] for label in test_labels], dtype='object')}

In [None]:
padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    train_data['labels'],
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
    value=0
  )

train_data['labels'] = tf.convert_to_tensor(padded_labels)

padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    val_data['labels'],
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
    value=0
)

val_data['labels'] = tf.convert_to_tensor(padded_labels)

padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    test_data['labels'],
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
    value=0
)

test_data['labels'] = tf.convert_to_tensor(padded_labels)

In [None]:

print("TRAINING DATA")
print(f"The shape of input ids tensor of train data is {train_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of train data is {train_data['attention_mask'].shape}")
print(f"The shape of labels tensor of train data is {train_data['labels'].shape}")

print("\nVALIDATION DATA")
print(f"The shape of input ids tensor of validation data is {val_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of validation data is {val_data['attention_mask'].shape}")
print(f"The shape of labels tensor of validation data is {val_data['labels'].shape}")

print("\nTEST DATA")
print(f"The shape of input ids tensor of test data is {test_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of test data is {test_data['attention_mask'].shape}")
print(f"The shape of labels tensor of test data is {test_data['labels'].shape}")

TRAINING DATA
The shape of input ids tensor of train data is (2750, 100)
The shape of attention masks tensor of train data is (2750, 100)
The shape of labels tensor of train data is (2750, 100)

VALIDATION DATA
The shape of input ids tensor of validation data is (954, 100)
The shape of attention masks tensor of validation data is (954, 100)
The shape of labels tensor of validation data is (954, 100)

TEST DATA
The shape of input ids tensor of test data is (837, 100)
The shape of attention masks tensor of test data is (837, 100)
The shape of labels tensor of test data is (837, 100)


In [None]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_data["input_ids"], train_data["attention_mask"], train_data["labels"])).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices((val_data["input_ids"], val_data["attention_mask"], val_data["labels"])).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_data["input_ids"], test_data["attention_mask"], test_data["labels"])).batch(BATCH_SIZE)

In [None]:
print("TRAINING DATASET")
print(f"Number of batches in train dataset: {len(train_dataset)}")
print(f"Shape of the batches: {train_dataset.element_spec}", )

print("\nVALIDATION DATASET")
print(f"Number of batches in validation dataset: {len(val_dataset)}")
print(f"Shape of the batches: {val_dataset.element_spec}", )

print("\nTEST DATASET")
print(f"Number of batches in test dataset: {len(test_dataset)}")
print(f"Shape of the batches: {test_dataset.element_spec}", )

TRAINING DATASET
Number of batches in train dataset: 172
Shape of the batches: (TensorSpec(shape=(None, 100), dtype=tf.int32, name=None), TensorSpec(shape=(None, 100), dtype=tf.int32, name=None), TensorSpec(shape=(None, 100), dtype=tf.int32, name=None))

VALIDATION DATASET
Number of batches in validation dataset: 60
Shape of the batches: (TensorSpec(shape=(None, 100), dtype=tf.int32, name=None), TensorSpec(shape=(None, 100), dtype=tf.int32, name=None), TensorSpec(shape=(None, 100), dtype=tf.int32, name=None))

TEST DATASET
Number of batches in test dataset: 53
Shape of the batches: (TensorSpec(shape=(None, 100), dtype=tf.int32, name=None), TensorSpec(shape=(None, 100), dtype=tf.int32, name=None), TensorSpec(shape=(None, 100), dtype=tf.int32, name=None))


## Train the model

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
ner_model.compile(optimizer=optimizer, loss=loss, metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")])

In [None]:
history = ner_model.fit(
    x=[train_data['input_ids'], train_data['attention_mask']],
    y=train_data['labels'],
    validation_data=([val_data['input_ids'], val_data['attention_mask']], val_data['labels']),
    epochs=NUM_EPOCHS
)

Epoch 1/20


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
from sklearn.metrics import classification_report

test_labels_tensor = test_data['labels']

outputs = ner_model.predict(x = [test_data['input_ids'], test_data['attention_mask']])['logits']

predictions = tf.argmax(outputs, axis=-1)

print(classification_report(test_labels_tensor.numpy().flatten(), predictions.numpy().flatten(), zero_division=0))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     75533
           1       0.44      0.40      0.42        10
           2       0.38      0.10      0.16        29
           3       1.00      0.93      0.96        41
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00         1
           6       0.46      0.21      0.29       482
           7       0.66      0.40      0.50       109
           8       0.57      0.33      0.42        12
           9       0.55      0.17      0.26        64
          10       0.66      0.34      0.45       143
          11       0.41      0.20      0.26       480
          12       0.64      0.41      0.50       619
          14       0.48      0.17      0.26       241
          15       0.38      0.18      0.24        57
          16       0.50      0.30      0.37        60
          17       0.33      0.11      0.17        18
          18       0.17    

In [None]:
# Evaluate the model on the train data
ner_model.evaluate(
    x = [train_data['input_ids'], train_data['attention_mask']],
    y = train_data['labels']
)



[0.18936125934123993, 0.9366727471351624]

In [None]:
# Evaluate the model on the test data
ner_model.evaluate(
    x = [test_data['input_ids'], test_data['attention_mask']],
    y = test_data['labels']
)



[0.26659727096557617, 0.9199283123016357]

In [None]:

## Save model
ner_model.save_pretrained("ner_model")

In [None]:

## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

## Prediction

In [None]:
model_fine_tuned = TFAutoModelForTokenClassification.from_pretrained("ner_model")

Some layers from the model checkpoint at ner_model were not used when initializing TFDistilBertForTokenClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at ner_model and are newly initialized: ['dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import classification_report

test_labels_tensor = test_data['labels']

outputs = model_fine_tuned.predict(x = [test_data['input_ids'], test_data['attention_mask']])['logits']

predictions = tf.argmax(outputs, axis=-1)

print(classification_report(test_labels_tensor.numpy().flatten(), predictions.numpy().flatten(), zero_division=0))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     75533
           1       0.44      0.40      0.42        10
           2       0.38      0.10      0.16        29
           3       1.00      0.93      0.96        41
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00         1
           6       0.46      0.21      0.29       482
           7       0.66      0.40      0.50       109
           8       0.57      0.33      0.42        12
           9       0.55      0.17      0.26        64
          10       0.66      0.34      0.45       143
          11       0.41      0.20      0.26       480
          12       0.64      0.41      0.50       619
          14       0.48      0.17      0.26       241
          15       0.38      0.18      0.24        57
          16       0.50      0.30      0.37        60
          17       0.33      0.11      0.17        18
          18       0.17    

In [None]:
model_fine_tuned.summary()

Model: "tf_distil_bert_for_token_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 dropout_79 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  64596     
                                                                 
Total params: 66427476 (253.40 MB)
Trainable params: 66427476 (253.40 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
import spacy
from spacy import displacy

def display_pred(text, entities):
    nlp = spacy.load("en_core_web_sm", disable=['ner'])
    # Generate the entities in Spacy format
    doc = nlp(text)
    # Add the predicted named entities to the Doc object
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            doc.ents += tuple([span])

    colors = {"Activity": "#f9d5e5",
              "Administration": "#f7a399",
              "Age": "#f6c3d0",
              "Area": "#fde2e4",
              "Biological_attribute": "#d5f5e3",
              "Biological_structure": "#9ddfd3",
              "Clinical_event": "#77c5d5",
              "Color": "#a0ced9",
              "Coreference": "#e3b5a4",
              "Date": "#f1f0d2",
              "Detailed_description": "#ffb347",
              "Diagnostic_procedure": "#c5b4e3",
              "Disease_disorder": "#c4b7ea",
              "Distance": "#bde0fe",
              "Dosage": "#b9e8d8",
              "Duration": "#ffdfba",
              "Family_history": "#e6ccb2",
              "Frequency": "#e9d8a6",
              "Height": "#f2eecb",
              "History": "#e2f0cb",
              "Lab_value": "#f4b3c2",
              "Mass": "#f4c4c3",
              "Medication": "#f9d5e5",
              "Nonbiological_location": "#f7a399",
              "Occupation": "#f6c3d0",
              "Other_entity": "#d5f5e3",
              "Other_event": "#9ddfd3",
              "Outcome": "#77c5d5",
              "Personal_background": "#a0ced9",
              "Qualitative_concept": "#e3b5a4",
              "Quantitative_concept": "#f1f0d2",
              "Severity": "#ffb347",
              "Sex": "#c5b4e3",
              "Shape": "#c4b7ea",
              "Sign_symptom": "#bde0fe",
              "Subject": "#b9e8d8",
              "Texture": "#ffdfba",
              "Therapeutic_procedure": "#e6ccb2",
              "Time": "#e9d8a6",
              "Volume": "#f2eecb",
              "Weight": "#e2f0cb"}
    options = {"compact": True, "bg": "#F8F8F8",
               "ents": list(colors.keys()),
               "colors": colors}

    # Generate the HTML visualization
    html = displacy.render(doc, style="ent", options=options)

In [None]:
text = "A 28-year-old previously healthy man presented with a 6-week history of palpitations. The symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea.Except for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory accentuation), physical examination yielded unremarkable findings. An electrocardiogram (ECG) revealed normal sinus rhythm and a Wolff– Parkinson– White pre-excitation pattern (Fig.1: Top), produced by a right-sided accessory pathway. Transthoracic echocardiography demonstrated the presence of Ebstein's anomaly of the tricuspid valve, with apical displacement of the valve and formation of an “atrialized” right ventricle (a functional unit between the right atrium and the inlet [inflow] portion of the right ventricle) (Fig.2). The anterior tricuspid valve leaflet was elongated (Fig.2C, arrow), whereas the septal leaflet was rudimentary (Fig.2C, arrowhead). Contrast echocardiography using saline revealed a patent foramen ovale with right-to-left shunting and bubbles in the left atrium (Fig.2D). The patient underwent an electrophysiologic study with mapping of the accessory pathway, followed by radiofrequency ablation (interruption of the pathway using the heat generated by electromagnetic waves at the tip of an ablation catheter). His post-ablation ECG showed a prolonged PR interval and an odd “second” QRS complex in leads III, aVF and V2–V4 (Fig.1Bottom), a consequence of abnormal impulse conduction in the “atrialized” right ventricle. The patient reported no recurrence of palpitations at follow-up 6 months after the ablation."
# Tokenize the input sentence
encoded = tokenizer.encode_plus(text, return_tensors="tf", return_offsets_mapping=True)
ner_model = TFAutoModelForTokenClassification.from_pretrained(model_name, from_pt=True)
input_ids = encoded['input_ids']
attention_mask = encoded['attention_mask']

inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask
}

offsets = encoded['offset_mapping'][0].numpy()


# Get the model predictions
outputs = ner_model(input_ids=input_ids, attention_mask=attention_mask)['logits']
predictions = tf.argmax(outputs, axis=-1)

# # Convert the predicted label ids to label names

predicted_labels = [ner_model.config.id2label[prediction] for prediction in predictions[0].numpy()]

entities = []
prev_tag = None
prev_end = -1

for start_end, label in zip(offsets, predicted_labels):
    start = start_end[0]
    end = start_end[1]
    if label != 'O':
        tag = label[2:]
        if len(entities) > 0:
                prev_end = entities[-1][1]
                prev_start = entities[-1][0]
                prev_tag = entities[-1][2]
        if prev_tag == tag and (prev_end == start or  prev_end+1 == start):
            entities[-1] = (prev_start, end, tag)
        else:
            entities.append((start, end, tag))

All PyTorch model weights were used when initializing TFDistilBertForTokenClassification.

All the weights of TFDistilBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForTokenClassification for predictions without further training.


In [None]:
entities

[(2, 13, 'Age'),
 (14, 32, 'History'),
 (33, 36, 'Sex'),
 (37, 46, 'Clinical_event'),
 (54, 60, 'Duration'),
 (72, 75, 'Sign_symptom'),
 (90, 98, 'Sign_symptom'),
 (121, 130, 'Detailed_description'),
 (148, 174, 'Detailed_description'),
 (200, 201, 'Sign_symptom'),
 (221, 230, 'Lab_value'),
 (231, 236, 'Detailed_description'),
 (244, 250, 'Biological_structure'),
 (254, 274, 'Sign_symptom'),
 (294, 313, 'Biological_structure'),
 (346, 366, 'Diagnostic_procedure'),
 (375, 377, 'Lab_value'),
 (401, 418, 'Diagnostic_procedure'),
 (420, 422, 'Diagnostic_procedure'),
 (434, 440, 'Lab_value'),
 (441, 453, 'Diagnostic_procedure'),
 (460, 483, 'Sign_symptom'),
 (487, 498, 'Sign_symptom'),
 (566, 571, 'Biological_structure'),
 (580, 596, 'Diagnostic_procedure'),
 (626, 643, 'Disease_disorder'),
 (651, 657, 'Biological_structure'),
 (673, 692, 'Sign_symptom'),
 (700, 705, 'Coreference'),
 (739, 754, 'Biological_structure'),
 (786, 798, 'Biological_structure'),
 (837, 847, 'Biological_structure')

In [None]:
display_pred(text, entities)

