In [1]:
from collections import defaultdict
from datasets import load_dataset, Sequence, ClassLabel
from enum import Enum
from datetime import datetime, timedelta
from pytz import timezone
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

import evaluate
import json
import numpy as np
import os
import torch

In [2]:
# Hugging Face model id; in this notebook it is only used to load the tokenizer.
BASE_MODEL = 'microsoft/BiomedNLP-PubMedBERT-large-uncased-abstract'
# NOTE(review): the two names below do not match the checkpoint path that is
# actually loaded -- presumably an earlier run kept for reference; confirm or remove.
# boundaries-PICO_NER-ebm_nlp_bioc-2023_05_25_06_51_58_EDT
# checkpoint-1508
# Local checkpoint directory loaded as the token-classification (boundary) model.
BOUNDARY_MODEL_PATH = 'pico_span/boundary_models/boundaries-PICO_NER-ebm_nlp_bioc-2023_05_31_20_05_09_EDT/checkpoint-4599'

# Folder containing the input JSON file(s); `test.json` is read from here.
input_folder = 'data/bioc/json'
# Destination folder for the per-threshold boundary prediction files.
OUTPUT_PATH = 'data/bioc/json/step_1_boundary_pred/threshold'
# NOTE(review): the three paths below are not referenced by any visible cell.
OUTPUT_PATH_AD = 'data/bioc/json/step_1_boundary_pred/brat/AD'
OUTPUT_PATH_COVID = 'data/bioc/json/step_1_boundary_pred/brat/COVID'
OUTPUT_PATH_EBM_NLP = 'data/bioc/json/step_1_boundary_pred/brat/EBM-NLP'

class DatasetSplit(Enum):
    """Selector for a dataset split.

    The member *name* (not the value) is what matters to callers:
    ``generate_boundary_labels`` looks the split up with
    ``dataset_dict[dataset_split.name]``.
    """
    train = 0
    validation = 1
    test = 2
    
class PicoType(Enum):
    """Enumerates the three categories named by its members.

    NOTE(review): the values are distinct powers of two (4, 2, 1), presumably
    so that categories can be combined as bit flags. The enum is not used in
    the visible cells -- confirm against its consumers.
    """
    PARTICIPANTS = 4
    INTERVENTIONS = 2
    OUTCOMES = 1
    
class SpanBoundary(Enum):
    """Boundary classes assigned to a token.

    The values play two roles in this notebook:

    * ``start``, ``end`` and ``both`` are used as indices into a per-token
      probability vector (``prob[SpanBoundary.start.value]`` etc.).
    * ``start`` and ``end`` are OR-ed together as bit flags by
      ``BoundaryLabel``, which is why ``both == start | end == 3``.

    NOTE(review): ``outside`` and ``inside`` are never indexed in the visible
    cells; presumably they are the remaining classes of the model's output --
    confirm against the training label map.
    """
    outside = 0
    start = 1
    end = 2
    both = 3 # both start and end boundary (start | end)
    inside = 4

In [3]:
# The tokenizer is loaded from the base model id while the weights come from the
# local boundary-model checkpoint.
# NOTE(review): assumes the checkpoint uses BASE_MODEL's vocabulary -- confirm.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForTokenClassification.from_pretrained(BOUNDARY_MODEL_PATH)

In [4]:
# NOTE(review): all three splits are read from the same `test.json`; only the
# `test` split is consumed by the cells below. Confirm this is intentional.
ebm_nlp = load_dataset(
    'json',
    data_files={
        split_name: os.path.join(input_folder, 'test.json')
        for split_name in ('train', 'validation', 'test')
    },
)

# Keep only the columns needed for boundary prediction; everything else is dropped.
remove_features = [
    column
    for column in ebm_nlp['train'].features
    if column not in ('pmid', 'tokens', 'labels')
]
for split_name in ('train', 'validation', 'test'):
    ebm_nlp[split_name] = ebm_nlp[split_name].remove_columns(remove_features)

Found cached dataset json (/home/gzhang/.cache/huggingface/datasets/json/default-c27b087f41e9392f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Display the splits with their remaining columns and row counts.
ebm_nlp

DatasetDict({
    train: Dataset({
        features: ['pmid', 'tokens', 'labels'],
        num_rows: 2042
    })
    validation: Dataset({
        features: ['pmid', 'tokens', 'labels'],
        num_rows: 2042
    })
    test: Dataset({
        features: ['pmid', 'tokens', 'labels'],
        num_rows: 2042
    })
})

In [6]:
def merge_boundary_labels_by_word_id(boundary_pred, word_ids, num_words):
    """Collapse sub-token boundary labels into one label per word.

    Args:
        boundary_pred: one integer-like label per sub-token.
        word_ids: for each sub-token, the index of the word it belongs to, or
            ``None`` for positions that belong to no word.
        num_words: number of words; the length of the returned list.

    Returns:
        A list of ``num_words`` ints. Each entry is the bitwise OR of the labels
        of all sub-tokens mapped to that word (0 if none were mapped).
    """
    merged = [0] * num_words
    for sub_token_label, owner in zip(boundary_pred, word_ids):
        # Positions without a word id carry no word-level information.
        if owner is not None:
            merged[owner] |= int(sub_token_label)
    return merged

In [7]:
def merge_boundary_probability_by_word_id(y_prob, word_ids, num_words):
    """Merge sub-token class probabilities into per-word start/end confidences.

    For every word the start confidence is the maximum, over its sub-tokens, of
    the ``start`` and ``both`` probabilities; the end confidence is the maximum
    of the ``end`` and ``both`` probabilities.

    Args:
        y_prob: one probability vector per sub-token, indexable by the
            ``SpanBoundary`` member values.
        word_ids: for each sub-token, the index of the word it belongs to, or
            ``None`` for positions that belong to no word.
        num_words: number of words; the length of both returned lists.

    Returns:
        ``(start_prob, end_prob)``: two lists of length ``num_words``. Words
        that received no sub-token keep the initial value ``0``.
    """
    start_idx = SpanBoundary.start.value
    end_idx = SpanBoundary.end.value
    both_idx = SpanBoundary.both.value

    start_prob = [0 for _ in range(num_words)]
    end_prob = [0 for _ in range(num_words)]
    for prob, word_id in zip(y_prob, word_ids):
        # Compare against None explicitly: a falsiness test would also skip
        # word id 0 and leave the first word of every example without scores.
        if word_id is None:
            continue
        both = float(prob[both_idx])
        start_prob[word_id] = max(start_prob[word_id], float(prob[start_idx]), both)
        end_prob[word_id] = max(end_prob[word_id], float(prob[end_idx]), both)
    return start_prob, end_prob
    

In [9]:
class BoundaryLabel:
    """Mutable accumulator for boundary bit flags.

    ``value`` starts at 0 and has the ``SpanBoundary.start`` / ``SpanBoundary.end``
    values OR-ed into it; setting the same flag twice has no further effect.
    """

    def __init__(self):
        self.value = 0

    def _raise_flag(self, boundary):
        # OR the enum member's integer value into the accumulated label.
        self.value |= boundary.value

    def set_start(self):
        """Mark the label as a span start."""
        self._raise_flag(SpanBoundary.start)

    def set_end(self):
        """Mark the label as a span end."""
        self._raise_flag(SpanBoundary.end)

def extract_boundary_from_prob_dist(prob_dist, threshold):
    """Turn one token's class probabilities into a start/end bit-flag label.

    Args:
        prob_dist: probability vector indexable by the ``SpanBoundary`` values.
        threshold: a class counts only if its probability is strictly greater
            than this value.

    Returns:
        An int: 0 (neither), ``start`` (1), ``end`` (2) or ``start | end`` (3).
        The ``both`` class alone is enough to set both flags.
    """
    is_start = (
        prob_dist[SpanBoundary.start.value] > threshold
        or prob_dist[SpanBoundary.both.value] > threshold
    )
    is_end = (
        prob_dist[SpanBoundary.end.value] > threshold
        or prob_dist[SpanBoundary.both.value] > threshold
    )

    flags = 0
    if is_start:
        flags |= SpanBoundary.start.value
    if is_end:
        flags |= SpanBoundary.end.value
    return flags
        

def generate_boundary_labels(dataset_dict, dataset_split, output_path, model, tokenizer, boundary_threshold=0.5):
    """Run the boundary model over one split and write word-level predictions.

    Each example's ``tokens`` are tokenized as pre-split words, the model's
    logits are soft-maxed, and the sub-token results are merged back to word
    level. One JSON object per example is written (JSON-lines layout) with the
    keys ``pmid``, ``tokens``, ``original_labels``, ``boundary_pred``,
    ``start_confidence`` and ``end_confidence``.

    The output file is ``<output_path>/<boundary_threshold:.2f>_boundary_pred.json``
    and is overwritten if it already exists.

    Args:
        dataset_dict: mapping from split name to a dataset that supports
            ``len()`` and integer indexing returning a row with the keys
            ``pmid``, ``tokens`` and ``labels``.
        dataset_split: enum member whose ``.name`` selects the split.
        output_path: directory for the output file; created if missing.
        model: torch module called as ``model(**encoding)`` whose result
            exposes ``.logits``.
        tokenizer: tokenizer whose encoding provides ``word_ids()``.
        boundary_threshold: probability a boundary class must exceed to be
            assigned (see ``extract_boundary_from_prob_dist``).
    """
    # '{}_boundary_pred_high_precision.json'
    output_file = os.path.join(output_path, f'{boundary_threshold:.2f}_boundary_pred.json')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)

    dataset = dataset_dict[dataset_split.name]

    # Force inference mode for the duration of the run and restore the caller's
    # mode afterwards, so dropout cannot make the predictions non-deterministic.
    was_training = model.training
    model.eval()
    try:
        with open(output_file, 'w') as fout:
            # no_grad: no autograd graph is needed for inference.
            with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
                for i in range(len(dataset)):
                    # Fetch the row once; indexing a column first would
                    # materialise the whole column on every iteration.
                    example = dataset[i]
                    row = {}
                    row['pmid'] = example['pmid']
                    row['tokens'] = example['tokens']
                    row['original_labels'] = example['labels']
                    x = tokenizer(row['tokens'], padding=True, return_tensors='pt', is_split_into_words=True)
                    y = model(**x)
                    # Batch size is 1: take the first (only) item explicitly
                    # instead of squeezing every singleton axis.
                    y_prob = torch.nn.functional.softmax(y.logits, dim=-1)[0].detach().numpy()
                    y_pred = [extract_boundary_from_prob_dist(p, boundary_threshold) for p in y_prob]
                    word_ids = x.word_ids()
                    num_words = len(row['tokens'])
                    row['boundary_pred'] = merge_boundary_labels_by_word_id(
                        y_pred,
                        word_ids,
                        num_words,
                    )
                    start_prob, end_prob = merge_boundary_probability_by_word_id(
                        y_prob,
                        word_ids,
                        num_words,
                    )
                    row['start_confidence'] = start_prob
                    row['end_confidence'] = end_prob
                    fout.write('{}\n'.format(json.dumps(row)))
                    progress_bar.update(1)
    finally:
        model.train(was_training)
    

In [9]:
# generate_boundary_labels(ebm_nlp, DatasetSplit.test, OUTPUT_PATH, model, tokenizer, boundary_threshold=0.25)

  0%|          | 0/1895 [00:00<?, ?it/s]

In [10]:
# Threshold sweep: write one prediction file per boundary threshold for the
# test split. The values are listed explicitly (rather than generated with
# arange) so each threshold is the exact float literal used in the comparison
# and in the ':.2f' file name.
BOUNDARY_THRESHOLDS = (0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50)

for boundary_threshold in BOUNDARY_THRESHOLDS:
    generate_boundary_labels(
        ebm_nlp,
        DatasetSplit.test,
        OUTPUT_PATH,
        model,
        tokenizer,
        boundary_threshold=boundary_threshold,
    )

  0%|          | 0/2042 [00:00<?, ?it/s]