## Explore boundary detection

In [1]:
from collections import defaultdict
from datasets import load_dataset, Sequence, ClassLabel
from enum import Enum
from huggingface_hub import Repository
from huggingface_hub import get_full_repo_name, notebook_login
from datetime import datetime, timedelta
from pytz import timezone
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
from transformers import get_scheduler

import evaluate
import numpy as np
import os
import torch

In [2]:
PICO_NER_LABELS = [
    0, # O
    1, # Outcomes
    2, # Interventions
    3, # Interventions + Outcomes
    4, # Participants
    5, # Participants + Outcomes
    6, # Pariticpants + Interventions
    7, # Pariticpants + Interventions + Outcomes
]

BOUNDARY_LABELS = [
    0, # 'OUT',
    1, # 'START',
    2, # 'END',
    3, # 'BOTH',
    4, # 'IN',
]

class PicoType(Enum):
    PARTICIPANTS = 4
    INTERVENTIONS = 2
    OUTCOMES = 1

In [3]:
input_folder = 'data/bioc/json'

In [4]:
ebm_nlp = load_dataset(
    'json',
    data_files = {
        'train': os.path.join(input_folder, 'train.json'),
        'validation': os.path.join(input_folder, 'validation.json'),
        'test': os.path.join(input_folder, 'test.json'),
    }
)

remove_features = [f for f in ebm_nlp['train'].features if f not in['pmid', 'tokens', 'labels']]
ebm_nlp['train'] = ebm_nlp['train'].remove_columns(remove_features)
ebm_nlp['validation'] = ebm_nlp['validation'].remove_columns(remove_features)
ebm_nlp['test'] = ebm_nlp['test'].remove_columns(remove_features)

Found cached dataset json (/home/gzhang/.cache/huggingface/datasets/json/default-50d59fa552f11522/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
print(ebm_nlp)

DatasetDict({
    train: Dataset({
        features: ['pmid', 'tokens', 'labels'],
        num_rows: 49031
    })
    validation: Dataset({
        features: ['pmid', 'tokens', 'labels'],
        num_rows: 2471
    })
    test: Dataset({
        features: ['pmid', 'tokens', 'labels'],
        num_rows: 2042
    })
})


### Align Labels with Word Piece Tokens

In [6]:
model_checkpoint = 'microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

id2label = {i: label for i, label in enumerate(BOUNDARY_LABELS)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassifi

In [7]:
'''
Check the pretrained tokenizer's output.

Example sentence from PMID 6420374:

"Similarly, post-operative increments in urinary excretion of ammonia, "
"creatinine and 3-methylhistidine were not altered by addition of insulin."
'''

input_words = [
    'Similarly', ',', 'post-operative', 'increments', 'in', 'urinary', 'excretion', 'of', 'ammonia', ',',
    'creatinine', 'and', '3-methylhistidine', 'were', 'not', 'altered', 'by', 'addition', 'of', 'insulin', '.',
]
outcome_labels = [
    0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
]
print('Original input has {} words.'.format(len(input_words)))

tokenized_input = tokenizer(input_words, is_split_into_words=True)
print('Tokenized input has {} tokens.\n'.format(len(tokenized_input['input_ids'])))
print(tokenized_input.tokens())
print(tokenized_input.word_ids())

print('\nTerm "{}" and "{}" are split into word pieces.'.format(input_words[2], input_words[12]))

Original input has 21 words.
Tokenized input has 29 tokens.

['[CLS]', 'similarly', ',', 'post', '-', 'operative', 'increments', 'in', 'urinary', 'excretion', 'of', 'ammonia', ',', 'creatinine', 'and', '3', '-', 'methyl', '##his', '##tidine', 'were', 'not', 'altered', 'by', 'addition', 'of', 'insulin', '.', '[SEP]']
[None, 0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, None]

Term "post-operative" and "3-methylhistidine" are split into word pieces.


In [8]:
def start_appears(a, b):
    for p in list(PicoType):
        if (a & p.value) < (b & p.value):
            return True
    return False

def end_appears(a, b):
    for p in list(PicoType):
        if (a & p.value) > (b & p.value):
            return True
    return False

def extract_boundary_labels(labels):
    boundary_labels = []
    for i, l in enumerate(labels):
        b = 0
        if l > 0:
            start = (i == 0) or start_appears(labels[i-1], l)
            end = (i == len(labels) -1) or end_appears(l, labels[i+1])
            if start:
                b = b | 1
            if end:
                b = b | 2
            if not start and not end:
                b = 4
            boundary_labels.append(b)
        else:
            boundary_labels.append(l)
    return boundary_labels

In [9]:
'''
Aligns labels with tokenized inputs.

Pre-trained tokenizers in transformers may break a single word into word pieces.
The labels should be aligned with the tokenized outputs. Inserted special tokens
are given -100.

Code modified from https://huggingface.co/course/chapter7/2.

Parameters:
    labels (List(int)): labels for each words before transformer tokenization.
    word_ids (List(int)): The id of the word where the word piece comes from.

Returns:
    The list of labels after alignment.
'''

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

In [10]:
# Test label alignment on the example.
print(outcome_labels)

boundary_labels = extract_boundary_labels(outcome_labels)
print(boundary_labels)

aligned_labels = align_labels_with_tokens(boundary_labels, tokenized_input.word_ids())
print(aligned_labels)

print(tokenized_input.word_ids())

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 1, 4, 4, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[None, 0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, None]


In [11]:
'''Tokenizes input and align tokens with labels in a batch.'''
def tokenize_and_align_labels(dataset):
    tokenized_inputs = tokenizer(
        dataset['tokens'],
        truncation=True,
        is_split_into_words=True,
    )
    all_labels = dataset['labels']
    new_labels = []
    word_ids_list = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(
            align_labels_with_tokens(
                extract_boundary_labels(labels), 
                word_ids
            ))
        word_ids_list.append(word_ids)

    tokenized_inputs['labels'] = new_labels
    tokenized_inputs['word_ids'] = word_ids_list
    return tokenized_inputs

tokenized_dataset = ebm_nlp.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=['pmid', 'tokens'],
)

tokenized_dataset = tokenized_dataset.cast_column(
    'labels',
    Sequence(ClassLabel(names = BOUNDARY_LABELS))
)

Loading cached processed dataset at /home/gzhang/.cache/huggingface/datasets/json/default-50d59fa552f11522/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-9acfed8fad3625a9.arrow


Map:   0%|          | 0/2471 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Loading cached processed dataset at /home/gzhang/.cache/huggingface/datasets/json/default-50d59fa552f11522/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-750b4ab9381db7e0.arrow
Loading cached processed dataset at /home/gzhang/.cache/huggingface/datasets/json/default-50d59fa552f11522/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-5f33e300656b3fa1.arrow


Casting the dataset:   0%|          | 0/2471 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/gzhang/.cache/huggingface/datasets/json/default-50d59fa552f11522/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-5633caf09f03cc31.arrow


In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 49031
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 2471
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 2042
    })
})

### Fine tune models.

In [13]:
# notebook_login()

In [14]:
training_start = datetime.now(tz = timezone('US/Eastern'))
task = 'PICO_NER'
dataset_name = 'ebm_nlp_bioc'
model_name = 'boundaries-{}-{}-{}'.format(
    task,
    dataset_name,
    datetime.now(timezone('US/Eastern')).strftime('%Y_%m_%d_%H_%M_%S_%Z')
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
output_dir = os.path.join('pico_span/boundary_models', model_name)

In [15]:
args = TrainingArguments(
    output_dir,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    do_predict=True,
)


token_precision_metric = evaluate.load('precision')
token_recall_metric = evaluate.load('recall')
token_f1_metric = evaluate.load('f1')


'''Calculates precision, recall and F1 scores.'''
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    decoded_labels = [
        [l for l in label if l != -100] 
        for label in labels
    ]
    
    decoded_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
#     print(decoded_labels, decoded_predictions)
    
    # Token level
    flat_labels = [l for dl in decoded_labels for l in dl]
    flat_predictions = [p for dp in decoded_predictions for p in dp]
    
    token_precision = token_precision_metric.compute(
        predictions=flat_predictions,
        references=flat_labels,
        average='macro',
    )
    token_recall = token_recall_metric.compute(
        predictions=flat_predictions,
        references=flat_labels,
        average='macro',
    )
    token_f1 = token_f1_metric.compute(
        predictions=flat_predictions,
        references=flat_labels,
        average='macro',
    )
    
    start_tp, start_fp, start_fn = 0, 0, 0
    end_tp, end_fp, end_fn = 0, 0, 0
    for label, pred in zip(flat_labels, flat_predictions):
        if label == 0 or label == 4:
            if pred == 1:
                start_fp += 1
            elif pred == 2:
                end_fp += 1
            elif pred == 3:
                start_fp += 1
                end_fp += 1
                
        elif label == 1:
            if pred == 0 or pred == 4:
                start_fn += 1
            elif pred == 1:
                start_tp += 1
            elif pred == 2:
                start_fn += 1
                end_fp += 1
            elif pred == 3:
                start_tp += 1
                end_fp += 1
    
        elif label == 2:
            if pred == 0 or pred == 4:
                end_fn += 1
            elif pred == 1:
                start_fp += 1
                end_fn += 1
            elif pred == 2:
                end_tp += 1
            elif pred == 3:
                start_fp += 1
                end_tp += 1
                
        elif label == 3:
            if pred == 0 or pred == 4:
                start_fn += 1
                end_fn += 1
            elif pred == 1:
                start_tp += 1
                end_fp += 1
            elif pred == 2:
                start_fp += 1
                end_tp += 1
            elif pred == 3:
                start_tp += 1
                end_tp += 1
            
    start_precision = start_tp / (start_tp + start_fp)
    start_recall = start_tp / (start_tp + start_fn)
    start_f1 = 2 * start_precision * start_recall / (start_precision + start_recall) if start_tp else 0
    
    end_precision = end_tp / (end_tp + end_fp)
    end_recall = end_tp / (end_tp + end_fn)
    end_f1 = 2 * end_precision * end_recall / (end_precision + end_recall) if end_tp else 0
    
    return {
        'overall_precision': token_precision['precision'],
        'overall_recall': token_recall['recall'],
        'overall_f1': token_f1['f1'],

        'start_precision': start_precision,
        'start_recall': start_recall,
        'start_f1': start_f1,
        
        'end_precision': end_precision,
        'end_recall': end_recall,
        'end_f1': end_f1,
    }

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset['train'],
#     eval_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [16]:
trainer.train()

save_path = 'pico_span/boundary_models'
model.save_pretrained(os.path.join(save_path, model_name))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Start Precision,Start Recall,Start F1,End Precision,End Recall,End F1
1,0.4824,0.489319,0.664416,0.648209,0.655783,0.610699,0.596283,0.603404,0.616008,0.62842,0.622152
2,0.4015,0.501913,0.676164,0.644614,0.658917,0.620579,0.575932,0.597423,0.626598,0.619078,0.622815
3,0.3065,0.566453,0.668451,0.658631,0.663079,0.609404,0.604198,0.60679,0.616939,0.638572,0.627569




In [None]:
# trainer