In [1]:
from collections import defaultdict
from datasets import load_dataset, Sequence, ClassLabel
from enum import Enum
from huggingface_hub import Repository
from huggingface_hub import get_full_repo_name, notebook_login
from datetime import datetime, timedelta
from pytz import timezone
from sklearn.metrics import f1_score, precision_score, recall_score
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import get_scheduler

import evaluate
import numpy as np
import os
import torch

In [2]:
class PicoType(Enum):
    PARTICIPANTS = 4
    INTERVENTIONS = 2
    OUTCOMES = 1

In [3]:
input_folder = 'data/bioc/json'

In [4]:
span_clf = load_dataset(
    'json',
    data_files = {
        'train': os.path.join(input_folder, 'train_span_clf_entity_only.json'),
        'validation': os.path.join(input_folder, 'validation_span_clf_entity_only.json'),
        'test': os.path.join(input_folder, 'test_span_clf_entity_only.json')
    }
)

Downloading and preparing dataset json/default to /home/gzhang/.cache/huggingface/datasets/json/default-257f6648a24f4803/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/gzhang/.cache/huggingface/datasets/json/default-257f6648a24f4803/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
span_clf

DatasetDict({
    train: Dataset({
        features: ['pmid', 'tokens', 'PARTICIPANTS', 'INTERVENTIONS', 'OUTCOMES'],
        num_rows: 80032
    })
    validation: Dataset({
        features: ['pmid', 'tokens', 'PARTICIPANTS', 'INTERVENTIONS', 'OUTCOMES'],
        num_rows: 4333
    })
    test: Dataset({
        features: ['pmid', 'tokens', 'PARTICIPANTS', 'INTERVENTIONS', 'OUTCOMES'],
        num_rows: 4202
    })
})

In [6]:
PICO_CLASSES = [label for label in span_clf['train'].features.keys() if label not in ['pmid', 'tokens']]

id2label = {i: label for i, label in enumerate(PICO_CLASSES)}
label2id = {v: k for k, v in id2label.items()}

In [7]:
model_checkpoint = 'microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    problem_type='multi_label_classification',
    num_labels = 3,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequ

In [8]:
def preprocess_function(dataset):
    text = dataset['tokens']
    encoding = tokenizer(
        text,
        truncation=True,
        is_split_into_words=True,
    )
    labels_batch = {k: dataset[k] for k in dataset.keys() if k in PICO_CLASSES}
    labels_matrix = np.zeros((len(text), len(PICO_CLASSES)))
    for idx, label in enumerate(PICO_CLASSES):
        labels_matrix[:, idx] = labels_batch[label]
    encoding['labels'] = labels_matrix.tolist()
    return encoding

In [9]:
tokenized_dataset = span_clf.map(
    preprocess_function,
    batched=True,
    remove_columns=span_clf['train'].column_names
)

Map:   0%|          | 0/80032 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/4333 [00:00<?, ? examples/s]

Map:   0%|          | 0/4202 [00:00<?, ? examples/s]

In [10]:
training_start = datetime.now(tz = timezone('US/Eastern'))
task = 'PICO_NER'
dataset_name = 'ebm_nlp_bioc'
model_name = 'span-clf-{}-{}-{}'.format(
    task,
    dataset_name,
    'entity_only'
#     datetime.now(timezone('US/Eastern')).strftime('%Y_%m_%d_%H_%M_%S_%Z')
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
output_dir = os.path.join('pico_span/span_clf', model_name)

In [11]:
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    push_to_hub=False,
)

# precision_metric = evaluate.load('precision')
# recall_metric = evaluate.load('recall')
# f1_metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = torch.nn.functional.sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= 0.5)] = 1
    
    f1 = f1_score(y_true=labels, y_pred=y_pred, average='macro')
    precision = precision_score(y_true=labels, y_pred=y_pred, average='macro')
    recall = recall_score(y_true=labels, y_pred=y_pred, average='macro')
    
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
    
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [12]:
# 06-19 no sampled spans spans
trainer.train()

save_path = 'pico_span/span_clf'
model.save_pretrained(os.path.join(save_path, model_name))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1504,0.154869,0.915072,0.905181,0.910037
2,0.1104,0.168776,0.91264,0.908616,0.910597
3,0.0862,0.175934,0.913421,0.90973,0.911532




In [12]:
# 06-02 include synthesized spans
trainer.train()

save_path = 'pico_span/span_clf'
model.save_pretrained(os.path.join(save_path, model_name))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1538,0.152133,0.887957,0.901808,0.894827
2,0.1199,0.157034,0.896483,0.895555,0.896003
3,0.0941,0.166317,0.897554,0.892407,0.894959




In [12]:
# 06-01 randomly sampled spans
trainer.train()

save_path = 'pico_span/span_clf'
model.save_pretrained(os.path.join(save_path, model_name))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1413,0.151439,0.886044,0.854485,0.869544
2,0.1161,0.152993,0.87715,0.868581,0.872249
3,0.0932,0.160661,0.867719,0.870886,0.868863




In [12]:
# trainer.train()

# save_path = 'pico_span/span_clf'
# model.save_pretrained(os.path.join(save_path, model_name))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1499,0.125803,0.877059,0.895973,0.886392
2,0.1209,0.126744,0.883057,0.901144,0.891864
3,0.098,0.139777,0.868923,0.907802,0.887713




In [13]:
trainer.state.best_model_checkpoint

'pico_span/span_clf/span-clf-PICO_NER-ebm_nlp_bioc-2023_06_02_05_59_04_EDT/checkpoint-1510'