In [None]:
!pip install transformers
!pip install seqeval
!pip install sentencepiece

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m790.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=18f53611c7c6a6e219e3d91fa6462518441111d98d42f17be2608766d2c86878
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sen

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, RobertaConfig

import logging
import os
import json
import numpy as np
from tqdm.auto import tqdm, trange

from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

# Preprocess data

In [None]:
!git clone https://github.com/VinAIResearch/PhoNER_COVID19.git
!gdown 1FrtKtXXA61aZFreMW_MxFEGbcYdfeqM3

Cloning into 'PhoNER_COVID19'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 58 (delta 23), reused 41 (delta 18), pack-reused 0[K
Receiving objects: 100% (58/58), 3.61 MiB | 13.35 MiB/s, done.
Resolving deltas: 100% (23/23), done.
Downloading...
From: https://drive.google.com/uc?id=1FrtKtXXA61aZFreMW_MxFEGbcYdfeqM3
To: /content/slot_labels.txt
100% 227/227 [00:00<00:00, 1.24MB/s]


In [None]:
def get_slot_labels(slot_label_path):
    '''
    Read the slot-label vocabulary from a text file.

    Args:
        slot_label_path: path to a UTF-8 text file containing one slot label per line.

    Returns:
        list[str]: the labels in file order, each stripped of surrounding whitespace.
    '''
    # A context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector.
    with open(slot_label_path, "r", encoding="utf-8") as label_file:
        return [label.strip() for label in label_file]

def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    pad_token_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    '''
    Turn word-level examples into fixed-length subword-level features.

    Input:
        - examples: list of samples; each sample is a dict with two fields:
                    'words' (list[str]) and 'slot_labels' (list[int])
        - max_seq_len: length every output sequence is truncated / padded to
                       (including the two special tokens)
        - tokenizer: object exposing cls_token, sep_token, unk_token,
                     pad_token_id, tokenize() and convert_tokens_to_ids()
        - pad_token_label_id: label id assigned to every position that must not
                              carry a real label (non-first subwords, special
                              tokens, padding)
        - cls_token_segment_id: accepted but not used by this function
        - pad_token_segment_id: accepted but not used by this function
        - mask_padding_with_zero: if True, real tokens get attention mask 1 and
                                  padding gets 0; if False the two values are swapped
    Output: list of dicts, one per sample, each with four fields:
        - 'input_ids'
        - 'attention_mask'
        - 'val_pos_list' (True only at the first subword of each word)
        - 'slot_labels_ids'

    Also prints the longest subword sequence seen before truncation.
    '''

    # Special tokens come from the tokenizer itself
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # Attention-mask values for real positions and for padding positions
    real_flag, pad_flag = (1, 0) if mask_padding_with_zero else (0, 1)

    # Room left for real subwords once the two special tokens are accounted for
    special_tokens_count = 2
    body_budget = max_seq_len - special_tokens_count

    sw_max_len = 0  # longest subword sequence observed (before truncation)
    features = []

    for example in examples:
        subwords = []
        label_ids = []
        first_piece_flags = []  # marks the positions that carry a real label

        for word, slot_label in zip(example['words'], example['slot_labels']):
            # Fall back to the unknown token for words the tokenizer cannot encode
            pieces = tokenizer.tokenize(word) or [unk_token]
            trailing = len(pieces) - 1

            subwords += pieces
            first_piece_flags += [True] + [False] * trailing
            # Real label on the first subword only; the rest get the pad label id
            label_ids += [int(slot_label)] + [pad_token_label_id] * trailing

        sw_max_len = max(sw_max_len, len(subwords))

        # Truncate to the budget, then wrap with the start / end special tokens.
        # The special tokens never carry a real label.
        subwords = [cls_token] + subwords[:body_budget] + [sep_token]
        label_ids = [pad_token_label_id] + label_ids[:body_budget] + [pad_token_label_id]
        first_piece_flags = [False] + first_piece_flags[:body_budget] + [False]

        input_ids = tokenizer.convert_tokens_to_ids(subwords)
        attention_mask = [real_flag] * len(input_ids)

        # Right-pad every field up to max_seq_len
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + [pad_token_id] * padding_length
        attention_mask = attention_mask + [pad_flag] * padding_length
        slot_labels_ids = label_ids + [pad_token_label_id] * padding_length
        val_pos_list = first_piece_flags + [False] * padding_length

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )
        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(
            len(slot_labels_ids), max_seq_len
        )
        assert len(val_pos_list) == max_seq_len, "Error with valid position list length {} vs {}".format(
            len(val_pos_list), max_seq_len
        )

        features.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'val_pos_list': val_pos_list,
            'slot_labels_ids': slot_labels_ids
        })

    print(f'Max length after splitting into subwords: {sw_max_len}')

    return features

def load_dataset(data_path, tokenizer, ignore_index, max_seq_len):
    '''
    Read a JSON-lines NER file and build a TensorDataset of padded features.

    Input:
        - data_path: path to the data file; each non-blank line is a JSON object
                     with a 'words' list and a 'tags' list
        - tokenizer: PhoBERT tokenizer (forwarded to convert_examples_to_features)
        - ignore_index: label id that the loss function skips; used as the label
                        of every non-first subword, special token and padding position
        - max_seq_len: maximum input length, counted in subwords

    Output: TensorDataset with four tensors, in this order:
        - all_input_ids: subword ids of the input sequences (long)
        - all_attention_mask: attention masks (long)
        - all_slot_labels_ids: label ids (long)
        - all_val_pos_list: True at the first subword of every word (bool)

    Note:
        Tag strings are mapped to ids through the module-level `slot_label_lst`,
        which therefore has to be defined before this function is called. A tag
        that is not in that list raises ValueError.
    '''

    # Read the examples from the JSON-lines file
    examples = []
    with open(data_path, 'r', encoding='utf-8') as rf:
        for i, line in enumerate(rf):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            obj = json.loads(line)
            # Input text, already split into words
            words = obj['words']

            # Convert slot labels to slot label ids
            slot_labels = [slot_label_lst.index(s) for s in obj['tags']]

            # Report (but keep) samples whose words and tags are misaligned.
            # An explicit check is used instead of assert + bare except, which
            # would vanish under `python -O` and swallow unrelated exceptions.
            if len(words) != len(slot_labels):
                print(i)
                print(words)
                print(slot_labels)
                print(len(words))
                print(len(slot_labels))

            examples.append({'words': words, 'slot_labels': slot_labels})

    # Cross entropy ignores the label id given to padding and to non-first
    # subwords, so only real label ids contribute to the loss.
    pad_token_label_id = ignore_index
    features = convert_examples_to_features(
        examples, max_seq_len, tokenizer, pad_token_label_id=pad_token_label_id
    )

    # Convert to tensors and build the dataset
    all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f['attention_mask'] for f in features], dtype=torch.long)
    all_slot_labels_ids = torch.tensor([f['slot_labels_ids'] for f in features], dtype=torch.long)
    all_val_pos_list = torch.tensor([f['val_pos_list'] for f in features], dtype=torch.bool)
    dataset = TensorDataset(
        all_input_ids, all_attention_mask, all_slot_labels_ids, all_val_pos_list
    )
    return dataset

Tiếng Việt là một ngôn ngữ đa âm tiết nên một từ có thể chứa nhiều tiếng. PhoBERT được pretrained trên mức độ từ, nên chúng tôi sẽ sử dụng bộ ngữ liệu mức độ từ để finetune.

In [None]:
# Word-level splits of the PhoNER_COVID19 repository cloned above
TRAIN_PATH = '/content/PhoNER_COVID19/data/word/train_word.json'
DEV_PATH = '/content/PhoNER_COVID19/data/word/dev_word.json'
TEST_PATH = '/content/PhoNER_COVID19/data/word/test_word.json'
model_name = 'vinai/phobert-base'

# Load the slot label list
slot_label_lst = get_slot_labels('/content/slot_labels.txt')

# Load the PhoBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

ignore_index = -100 # The loss function skips label ids equal to ignore_index
pad_token_label_id = ignore_index
max_seq_len = 256

# Load the data
train_dataset = load_dataset(TRAIN_PATH, tokenizer, ignore_index, max_seq_len)
dev_dataset = load_dataset(DEV_PATH, tokenizer, ignore_index, max_seq_len)
test_dataset = load_dataset(TEST_PATH, tokenizer, ignore_index, max_seq_len)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

Max length after splitting into subwords: 175
Max length after splitting into subwords: 182
Max length after splitting into subwords: 175


In [None]:
# Reloads the same 'vinai/phobert-base' tokenizer that the data-loading cell already created
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [None]:
# Demo: encode a sample sentence to see the ids and masks the tokenizer produces
tokenizer("Bệnh_nhân N.V.A bị viêm khớp")

{'input_ids': [0, 6207, 22290, 768, 45, 1743, 2819, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Demo: show how the same sample sentence is split into subword tokens
tokenizer.tokenize("Bệnh_nhân N.V.A bị viêm khớp")

['Bệnh_nhân', 'N.V.@@', 'A', 'bị', 'viêm', 'khớp']

# Build model

In [None]:
class SlotClassifier(nn.Module):
    """Classification head: dropout followed by a single linear projection.

    Args:
        input_dim: size of the last dimension of the incoming features.
        num_slot_labels: number of output classes (size of the logits' last dimension).
        dropout_rate: dropout probability applied before the projection.
    """

    def __init__(
        self,
        input_dim,
        num_slot_labels,
        dropout_rate=0.0,
    ):
        super().__init__()
        self.num_slot_labels = num_slot_labels
        self.dropout = nn.Dropout(p=dropout_rate)
        self.linear = nn.Linear(in_features=input_dim, out_features=num_slot_labels)

    def forward(self, x):
        """Return the slot logits for `x` (dropout is applied first)."""
        return self.linear(self.dropout(x))

In [None]:
class NERecognizer(RobertaPreTrainedModel):
    """RoBERTa encoder with a per-token slot (NER tag) classification head.

    Args:
        config: RoBERTa configuration; `config.hidden_size` sets the head's input size.
        slot_label_lst: list of slot labels; its length is the number of classes.
        ignore_index: label id skipped by the cross-entropy loss.
    """

    def __init__(self, config, slot_label_lst, ignore_index):
        super(NERecognizer, self).__init__(config)
        self.num_slot_labels = len(slot_label_lst)
        # Builds the encoder from the config only; pretrained weights are filled
        # in when the model is created through `from_pretrained`.
        self.roberta = RobertaModel(config)

        self.ignore_index = ignore_index

        self.slot_classifier = SlotClassifier(
            config.hidden_size,
            self.num_slot_labels,
            dropout_rate=0.1,
        )

    def forward(self, input_ids, attention_mask, slot_labels_ids=None):
        """Run the encoder and the slot head, optionally computing the loss.

        Args:
            input_ids: token ids fed to the encoder.
            attention_mask: mask fed to the encoder; positions equal to 1 are
                the ones kept for the loss. May be None.
            slot_labels_ids: optional gold label ids, same layout as `input_ids`.

        Returns:
            tuple: `(total_loss, slot_logits, *extra)` where `total_loss` is the
            cross-entropy loss (the plain integer 0 when `slot_labels_ids` is
            None) and `extra` is whatever the encoder returned after its first
            two outputs.
        """
        outputs = self.roberta(
            input_ids, attention_mask=attention_mask
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]

        slot_logits = self.slot_classifier(sequence_output)

        total_loss = 0

        # Token-level cross entropy over the slot labels
        if slot_labels_ids is not None:
            slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.ignore_index)
            # Keep only the active (attended) positions for the loss.
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
                active_labels = slot_labels_ids.view(-1)[active_loss]
                slot_loss = slot_loss_fct(active_logits, active_labels)
            else:
                slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels), slot_labels_ids.view(-1))
            total_loss += slot_loss

        outputs = (slot_logits,) + outputs[2:]  # add hidden states and attention if they are here

        outputs = (total_loss,) + outputs

        return outputs  # (loss), slot_logits, (hidden_states), (attentions)

In [None]:
# Label stored in the config as the fine-tuning task name
token_level = 'word-level'

# Load the PhoBERT configuration, then build NERecognizer from the checkpoint.
# slot_label_lst and ignore_index are passed as keyword arguments matching
# NERecognizer.__init__.
config = RobertaConfig.from_pretrained(model_name, finetuning_task=token_level)
model = NERecognizer.from_pretrained(
    model_name,
    config=config,
    slot_label_lst=slot_label_lst,
    ignore_index=ignore_index
)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of NERecognizer were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['slot_classifier.linear.weight', 'slot_classifier.linear.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train

In [None]:
class EarlyStopping:
    """Stop training early when the monitored score stops improving.

    The score is treated as higher-is-better: a call whose score is strictly
    lower than the best one seen so far counts as "no improvement". After
    `patience` consecutive such calls, `early_stop` becomes True. Every
    improving (or equal) score saves a checkpoint.
    """

    def __init__(self, patience=7, verbose=False):
        """
        Args:
            patience (int): number of consecutive non-improving calls tolerated
                before `early_stop` is set.
            verbose (bool): if True, print a message every time a checkpoint is saved.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` rather than `np.Inf`: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model, model_dir):
        """Record a new score and save `model` to `model_dir` if it did not get worse.

        Args:
            val_loss: the monitored score (despite the name, higher is better).
            model: object exposing `save_pretrained(directory)`.
            model_dir: directory the checkpoint is written to.
        """
        score = val_loss
        if self.best_score is None:
            # First call: anything is the best so far
            self.best_score = score
            self.save_checkpoint(val_loss, model, model_dir)
        elif score < self.best_score:
            # Worse than the best score: one more strike
            self.counter += 1
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improved (or tied): checkpoint and reset the strike counter
            self.best_score = score
            self.save_checkpoint(val_loss, model, model_dir)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, model_dir):
        """Save the model and remember the score it was saved at (in `val_loss_min`)."""
        if self.verbose:
            print(f"slot_f1 increased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...")
        model.save_pretrained(model_dir)
        self.val_loss_min = val_loss

def compute_metrics(slot_preds, slot_labels):
    """Return a fresh dict with the slot metrics for the given predictions.

    Args:
        slot_preds: predicted label sequences, one per sample.
        slot_labels: gold label sequences, one per sample (same length as `slot_preds`).
    """
    assert len(slot_preds) == len(slot_labels)
    return dict(get_slot_metrics(slot_preds, slot_labels))

def get_slot_metrics(preds, labels):
    """Print the seqeval classification report and return precision, recall and F1.

    Args:
        preds: predicted label sequences, one per sample.
        labels: gold label sequences, one per sample (same length as `preds`).

    Returns:
        dict with keys "slot_precision", "slot_recall" and "slot_f1".
    """
    assert len(preds) == len(labels)
    report = classification_report(labels, preds, digits=4)
    print(report)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        "slot_precision": precision,
        "slot_recall": recall,
        "slot_f1": f1,
    }

In [None]:
def train(
    model,
    device,
    train_dataset,
    dev_dataset,
    train_batch_size=32,
    dev_batch_size=32,
    num_train_epochs=20,
    learning_rate=5e-5,
    tuning_metric='slot_f1',
    gradient_accumulation_steps=1,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    logging_steps=200,
    save_steps=200,
    early_stopping=5,
    max_grad_norm=1.0,
    model_dir='checkpoints'
):
    """Fine-tune `model` on `train_dataset`, early-stopping on a dev-set metric.

    Args:
        model: model whose forward accepts `input_ids`, `attention_mask` and
            `slot_labels_ids` and returns the loss as its first output.
        device: device the model and batches are moved to.
        train_dataset: dataset whose items are
            (input_ids, attention_mask, slot_labels_ids, val_pos_list).
        dev_dataset: dataset passed to `evaluate` for validation.
        train_batch_size: batch size of the (shuffled) training loader.
        dev_batch_size: batch size used for validation.
        num_train_epochs: maximum number of epochs.
        learning_rate: AdamW learning rate.
        tuning_metric: key of the evaluation results handed to EarlyStopping,
            which treats higher values as better.
        gradient_accumulation_steps: number of batches accumulated per optimizer step.
        weight_decay: weight decay applied to all parameters except biases
            and LayerNorm weights.
        adam_epsilon: AdamW epsilon.
        warmup_steps: number of linear warm-up steps of the scheduler.
        logging_steps: run validation (and the early-stopping check) every this
            many optimizer steps; values <= 0 disable it.
        save_steps: only printed; checkpoints are written by EarlyStopping.
        early_stopping: patience given to EarlyStopping.
        max_grad_norm: gradient-norm clipping threshold.
        model_dir: checkpoint directory (created, including parents, if missing).

    Returns:
        tuple: (number of optimizer steps taken, accumulated training loss
        divided by that number; 0.0 if no step was taken).
    """

    def _metrics_only(eval_output):
        # `evaluate` may return either the metrics dict alone or a tuple whose
        # first element is that dict; only the dict is needed here.
        return eval_output[0] if isinstance(eval_output, tuple) else eval_output

    # Load model to device
    model.to(device)

    # Dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Baseline evaluation on dev before any training
    results = _metrics_only(evaluate(model, device, dev_dataset, dev_batch_size))
    print(results)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    # Create checkpoint folder; makedirs also creates missing parent directories
    os.makedirs(model_dir, exist_ok=True)

    # Train!
    print("***** Running training *****")
    print("  Num examples = ", len(train_dataset))
    print("  Num Epochs = ", num_train_epochs)
    print("  Total train batch size = ", train_batch_size)
    print("  Gradient Accumulation steps = ", gradient_accumulation_steps)
    print("  Total optimization steps = ", t_total)
    print("  Logging steps = ", logging_steps)
    print("  Save steps = ", save_steps)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()

    train_iterator = trange(int(num_train_epochs), desc="Epoch")
    # Separate name so the `early_stopping` patience argument is not shadowed
    stopper = EarlyStopping(patience=early_stopping, verbose=True)

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True)
        print("\nEpoch", epoch)

        for step, batch in enumerate(epoch_iterator):
            # evaluate() switches the model to eval mode, so re-enable training each step
            model.train()
            batch = tuple(t.to(device) for t in batch)  # GPU or CPU

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "slot_labels_ids": batch[2]
            }

            outputs = model(**inputs)
            loss = outputs[0]

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Avoid gradient exploding

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if logging_steps > 0 and global_step % logging_steps == 0:
                    print("\nTuning metrics:", tuning_metric)
                    results = _metrics_only(evaluate(model, device, dev_dataset, dev_batch_size))
                    stopper(results[tuning_metric], model, model_dir)
                    if stopper.early_stop:
                        print("Early stopping")
                        break

        if stopper.early_stop:
            train_iterator.close()
            break

    # Guard against division by zero when no optimizer step was performed
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss

In [None]:
def evaluate(
    model,
    device,
    eval_dataset,
    eval_batch_size=32
):
    """Evaluate `model` on `eval_dataset` and report word-level slot metrics.

    Args:
        model: model whose forward accepts `input_ids`, `attention_mask` and
            `slot_labels_ids` and returns `(loss, slot_logits, ...)`.
        device: device the batches are moved to.
        eval_dataset: dataset whose items are
            (input_ids, attention_mask, slot_labels_ids, val_pos_list).
        eval_batch_size: batch size of the evaluation loader.

    Returns:
        tuple: `(results, slot_preds_list, out_slot_label_list)` where `results`
        is a dict holding "loss" plus the entries returned by `compute_metrics`,
        and the two lists hold, per sample, the predicted and the gold label
        strings at the positions flagged in `val_pos_list`.

    Raises:
        ValueError: if `eval_dataset` yields no batch.

    Note:
        Label ids are mapped to strings through the module-level `slot_label_lst`.
    """
    eval_dataloader = DataLoader(eval_dataset, batch_size=eval_batch_size)
    print("***** Running evaluation *****")
    print("  Num examples = ", len(eval_dataset))
    print("  Batch size = ", eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0

    # Per-batch arrays, concatenated once after the loop. Predictions are
    # reduced to class ids batch by batch so the full logits are never kept.
    pred_chunks = []
    label_chunks = []
    val_pos_chunks = []
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        val_pos_list = batch[3]  # only needed on the host side
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "slot_labels_ids": batch[2],
            }
            outputs = model(**inputs)
            tmp_eval_loss, slot_logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        # Slot prediction: most likely class at every position
        pred_chunks.append(np.argmax(slot_logits.detach().cpu().numpy(), axis=2))
        label_chunks.append(inputs["slot_labels_ids"].detach().cpu().numpy())
        val_pos_chunks.append(val_pos_list.detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("evaluate() received an empty dataset")

    eval_loss = eval_loss / nb_eval_steps
    results = {"loss": eval_loss}

    # Slot result
    slot_preds = np.concatenate(pred_chunks, axis=0)
    out_slot_labels_ids = np.concatenate(label_chunks, axis=0)
    all_val_pos_list = np.concatenate(val_pos_chunks, axis=0)

    slot_label_map = {i: label for i, label in enumerate(slot_label_lst)}
    out_slot_label_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
    slot_preds_list = [[] for _ in range(out_slot_labels_ids.shape[0])]

    # Keep only the flagged positions (first subword of each word)
    for i in range(all_val_pos_list.shape[0]):
        for j in range(all_val_pos_list.shape[1]):
            if all_val_pos_list[i, j]:
                out_slot_label_list[i].append(slot_label_map[out_slot_labels_ids[i][j]])
                slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])
    print(len(slot_preds_list))
    print(slot_preds_list[0])
    print(out_slot_label_list[0])

    total_result = compute_metrics(slot_preds_list, out_slot_label_list)
    results.update(total_result)

    print("***** Eval results *****")
    for key in sorted(results.keys()):
        print(f"  {key} = {str(results[key])}")

    return results, slot_preds_list, out_slot_label_list

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Count only the parameters that will be updated during training
model_parameters =  sum(p.numel() for p in model.parameters() if p.requires_grad)
# NOTE(review): this path assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this part of the notebook — confirm.
model_dir = '/content/drive/MyDrive/Colab Notebooks/phoner/checkpoints_v2'
print('#params:',model_parameters)

#params: 135014421


In [None]:
# Fine-tune for at most 30 epochs; the early_stopping value is the patience
# passed to EarlyStopping, and checkpoints are written to model_dir.
train(
    model=model,
    device=device,
    train_dataset=train_dataset,
    dev_dataset=dev_dataset,
    train_batch_size=32,
    dev_batch_size=128,
    num_train_epochs=30,
    early_stopping=5,
    model_dir=model_dir
  )

***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['I-JOB', 'B-SYMPTOM_AND_DISEASE', 'I-TRANSPORTATION', 'O', 'I-TRANSPORTATION', 'B-SYMPTOM_AND_DISEASE', 'B-GENDER', 'B-SYMPTOM_AND_DISEASE', 'B-ORGANIZATION', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'I-SYMPTOM_AND_DISEASE', 'B-PATIENT_ID', 'B-GENDER', 'I-PATIENT_ID', 'I-TRANSPORTATION', 'I-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'B-DATE', 'B-DATE', 'B-PATIENT_ID', 'I-TRANSPORTATION', 'I-GENDER', 'B-SYMPTOM_AND_DISEASE', 'I-GENDER', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.0127    0.0305    0.0179       361
               DATE     0.0039    0.0127    0.0059      1103
             GENDER     0.0020    0.0289    0.0038       2



***** Running training *****
  Num examples =  5027
  Num Epochs =  30
  Total train batch size =  32
  Gradient Accumulation steps =  1
  Total optimization steps =  4740
  Logging steps =  200
  Save steps =  200


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 0


Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 1

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9291    0.9806    0.9542       361
               DATE     0.9803    0.9918    0.9860      1103
             GENDER     0.9460    0.9495    0.9477       277
                JOB     0.5833    0.5303    0.5556       132
           LOCATION     0.9307    0.9565    0.9434      2737
               NAME     0.8835    0.9681    0.9239       188
       ORGANIZATION     0.8632    0.8475    0.8553       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 2

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9727    0.9861    0.9794       361
               DATE     0.9785    0.9918    0.9851      1103
             GENDER     0.9477    0.9819    0.9645       277
                JOB     0.7500    0.7500    0.7500       132
           LOCATION     0.9487    0.9671    0.9578      2737
               NAME     0.9040    0.9521    0.9275       188
       ORGANIZATION     0.9018    0.9328    0.9170       55

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 3

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9727    0.9861    0.9794       361
               DATE     0.9812    0.9927    0.9869      1103
             GENDER     0.9644    0.9783    0.9713       277
                JOB     0.8435    0.7348    0.7854       132
           LOCATION     0.9518    0.9591    0.9554      2737
               NAME     0.9101    0.9149    0.9125       188
       ORGANIZATION     0.8801    0.9328    0.9057       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 4


Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 5

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9861    0.9806    0.9833       361
               DATE     0.9891    0.9882    0.9887      1103
             GENDER     0.9743    0.9567    0.9654       277
                JOB     0.9107    0.7727    0.8361       132
           LOCATION     0.9433    0.9602    0.9517      2737
               NAME     0.9389    0.8989    0.9185       188
       ORGANIZATION     0.8968    0.9147    0.9057       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 6

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9701    0.9889    0.9794       361
               DATE     0.9820    0.9900    0.9860      1103
             GENDER     0.9680    0.9819    0.9749       277
                JOB     0.7836    0.7955    0.7895       132
           LOCATION     0.9649    0.9543    0.9596      2737
               NAME     0.9171    0.9415    0.9291       188
       ORGANIZATION     0.8719    0.9510    0.9097       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 7

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9753    0.9861    0.9807       361
               DATE     0.9776    0.9900    0.9838      1103
             GENDER     0.9747    0.9747    0.9747       277
                JOB     0.8320    0.7879    0.8093       132
           LOCATION     0.9571    0.9532    0.9552      2737
               NAME     0.9206    0.9255    0.9231       188
       ORGANIZATION     0.8786    0.9456    0.9108       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 8

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9675    0.9889    0.9781       361
               DATE     0.9794    0.9900    0.9847      1103
             GENDER     0.9288    0.9892    0.9580       277
                JOB     0.8455    0.7879    0.8157       132
           LOCATION     0.9513    0.9635    0.9573      2737
               NAME     0.8900    0.9468    0.9175       188
       ORGANIZATION     0.8975    0.9220    0.9096       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 9


Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 10

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9726    0.9834    0.9780       361
               DATE     0.9811    0.9909    0.9860      1103
             GENDER     0.9611    0.9819    0.9714       277
                JOB     0.8320    0.7879    0.8093       132
           LOCATION     0.9552    0.9591    0.9572      2737
               NAME     0.9158    0.9255    0.9206       188
       ORGANIZATION     0.8904    0.9437    0.9163       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 11

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9675    0.9889    0.9781       361
               DATE     0.9785    0.9900    0.9842      1103
             GENDER     0.9448    0.9892    0.9665       277
                JOB     0.8110    0.7803    0.7954       132
           LOCATION     0.9518    0.9591    0.9554      2737
               NAME     0.9077    0.9415    0.9243       188
       ORGANIZATION     0.9088    0.9401    0.9242       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 12

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9673    0.9834    0.9753       361
               DATE     0.9803    0.9900    0.9851      1103
             GENDER     0.9817    0.9675    0.9745       277
                JOB     0.8390    0.7500    0.7920       132
           LOCATION     0.9656    0.9638    0.9647      2737
               NAME     0.9128    0.9468    0.9295       188
       ORGANIZATION     0.8916    0.9256    0.9083       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 13

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9726    0.9834    0.9780       361
               DATE     0.9785    0.9891    0.9838      1103
             GENDER     0.9713    0.9783    0.9748       277
                JOB     0.8475    0.7576    0.8000       132
           LOCATION     0.9613    0.9624    0.9618      2737
               NAME     0.9110    0.9255    0.9182       188
       ORGANIZATION     0.9037    0.9365    0.9198       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 14


Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 15

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9729    0.9945    0.9836       361
               DATE     0.9794    0.9891    0.9842      1103
             GENDER     0.9613    0.9856    0.9733       277
                JOB     0.8140    0.7955    0.8046       132
           LOCATION     0.9643    0.9583    0.9613      2737
               NAME     0.9082    0.9468    0.9271       188
       ORGANIZATION     0.8969    0.9474    0.9214       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 16

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9726    0.9834    0.9780       361
               DATE     0.9794    0.9891    0.9842      1103
             GENDER     0.9647    0.9856    0.9750       277
                JOB     0.8374    0.7803    0.8078       132
           LOCATION     0.9585    0.9631    0.9608      2737
               NAME     0.9110    0.9255    0.9182       188
       ORGANIZATION     0.9084    0.9183    0.9134       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 17

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9780    0.9834    0.9807       361
               DATE     0.9802    0.9891    0.9847      1103
             GENDER     0.9818    0.9747    0.9783       277
                JOB     0.8673    0.7424    0.8000       132
           LOCATION     0.9650    0.9664    0.9657      2737
               NAME     0.9105    0.9202    0.9153       188
       ORGANIZATION     0.9113    0.9328    0.9220       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 18

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9780    0.9834    0.9807       361
               DATE     0.9803    0.9900    0.9851      1103
             GENDER     0.9749    0.9819    0.9784       277
                JOB     0.8425    0.8106    0.8263       132
           LOCATION     0.9617    0.9646    0.9632      2737
               NAME     0.9067    0.9309    0.9186       188
       ORGANIZATION     0.9069    0.9365    0.9214       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 19


Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 20

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9754    0.9889    0.9821       361
               DATE     0.9802    0.9891    0.9847      1103
             GENDER     0.9681    0.9856    0.9767       277
                JOB     0.8306    0.7803    0.8047       132
           LOCATION     0.9625    0.9664    0.9644      2737
               NAME     0.8969    0.9255    0.9110       188
       ORGANIZATION     0.9043    0.9256    0.9148       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 21

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9677    0.9945    0.9809       361
               DATE     0.9811    0.9900    0.9856      1103
             GENDER     0.9784    0.9819    0.9802       277
                JOB     0.8860    0.7652    0.8211       132
           LOCATION     0.9556    0.9664    0.9609      2737
               NAME     0.9101    0.9149    0.9125       188
       ORGANIZATION     0.8995    0.9093    0.9043       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 22

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9780    0.9834    0.9807       361
               DATE     0.9802    0.9891    0.9847      1103
             GENDER     0.9713    0.9783    0.9748       277
                JOB     0.8718    0.7727    0.8193       132
           LOCATION     0.9634    0.9627    0.9631      2737
               NAME     0.9162    0.9309    0.9235       188
       ORGANIZATION     0.8958    0.9365    0.9157       551
         PATIENT_I

Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 23


Iteration:   0%|          | 0/158 [00:00<?, ?it/s]


Epoch 24

Tuning metrics: slot_f1
***** Running evaluation *****
  Num examples =  2000
  Batch size =  128


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

2000
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE', 'O', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9753    0.9834    0.9793       361
               DATE     0.9802    0.9891    0.9847      1103
             GENDER     0.9749    0.9819    0.9784       277
                JOB     0.8718    0.7727    0.8193       132
           LOCATION     0.9626    0.9678    0.9652      2737
               NAME     0.9124    0.9415    0.9267       188
       ORGANIZATION     0.8996    0.9274    0.9133       551
         PATIENT_I

(3800, 0.02526304316320181)

# Evaluate on test

## Load model

In [None]:
# Download the fine-tuned checkpoint files from Google Drive into ./checkpoints.
# Per the recorded output, the first ID is saved as pytorch_model.bin and the
# second as config.json.
!mkdir checkpoints
# NOTE: %cd changes the kernel's working directory for every later cell, so
# relative paths used afterwards resolve inside the checkpoints directory.
%cd checkpoints
!gdown 1-0WPxvR_akePwKkdLJWmgfSawUwhoKQ3
!gdown 1--JNfXziCgiIzFmjwg_FLXHQBWi_HBIe

/content/checkpoints
Downloading...
From: https://drive.google.com/uc?id=1-0WPxvR_akePwKkdLJWmgfSawUwhoKQ3
To: /content/checkpoints/pytorch_model.bin
100% 540M/540M [00:17<00:00, 31.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1--JNfXziCgiIzFmjwg_FLXHQBWi_HBIe
To: /content/checkpoints/config.json
100% 783/783 [00:00<00:00, 4.38MB/s]


In [None]:
# Load the fine-tuned NER checkpoint that was downloaded into `model_dir`.
model_dir = '/content/checkpoints'
token_level = 'word-level'
# Fall back to CPU so this cell still runs on a runtime without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): the config is built from `model_name` (set in an earlier cell), not from
# the config.json stored next to the checkpoint in `model_dir` - confirm this is intended.
config = RobertaConfig.from_pretrained(model_name, finetuning_task=token_level)
model = NERecognizer.from_pretrained(
    model_dir,
    config=config,
    slot_label_lst=slot_label_lst,
    ignore_index=ignore_index,
)

# Last expression of the cell: moves the model to `device` and displays its architecture.
model.to(device)

NERecognizer(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

## Evaluate

In [None]:
# Evaluate the loaded checkpoint on the test split (last argument 128 is the batch size
# reported in the log). `pred` and `true` are reused by the data-analysis cells below.
score, pred, true = evaluate(model, device, test_dataset, 128)

***** Running evaluation *****
  Num examples =  3000
  Batch size =  128


Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

3000
['O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'O', 'B-JOB', 'I-JOB', 'O', 'O', 'O']
                     precision    recall  f1-score   support

                AGE     0.9911    0.9605    0.9756       582
               DATE     0.9826    0.9909    0.9868      1654
             GENDER     0.9846    0.9697    0.9771       462
                JOB   

# Data analysis

In [None]:
# Read train, dev, test
import json

def load_dataset(fpath):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of ``fpath`` is decoded with ``json.loads``. Blank
    lines (for example an empty line at the end of the file) are skipped
    instead of raising ``json.JSONDecodeError``.

    NOTE: a later cell of this notebook defines another ``load_dataset`` with a
    different signature; once that cell has run it replaces this function.

    :param fpath: path to a UTF-8 encoded file holding one JSON document per line
    :return: list of decoded JSON objects, in file order
    """
    with open(fpath, 'r', encoding='utf-8') as rf:
        return [json.loads(line) for line in rf if line.strip()]

# Load the word-level test split from the PhoNER_COVID19 repository cloned earlier.
test_data = load_dataset('/content/PhoNER_COVID19/data/word/test_word.json')

def read_samples(dataset, output_file, label):
    """Write ``dataset`` as ``word label`` lines, one blank line after each sample.

    :param dataset: iterable of samples, each a mapping with 'words' and 'tags' lists
    :param output_file: path of the UTF-8 text file to create (overwritten if present)
    :param label: per-sample label sequences; ``label[k][j]`` is written next to
        word ``j`` of sample ``k``
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for sample_idx, sample in enumerate(dataset):
            # The sample's own tags are not written; pairing with them only caps the
            # number of emitted tokens at the shorter of the two lists.
            paired = zip(sample['words'], sample['tags'])
            for position, (token, _gold) in enumerate(paired):
                out.write('{} {}\n'.format(token, label[sample_idx][position]))
            # Blank line marks the end of the sample.
            out.write('\n')

# Call the function once per output file: gold labels first, then predictions.
# Both paths are relative, so the files land in the current working directory.
output_file_name = 'test_true.txt'
read_samples(test_data, output_file_name, true)

output_file_name = 'test_pred.txt'
read_samples(test_data, output_file_name, pred)

In [None]:
def load_dataset(filepath, concatenate_words=False):
    """
    Load the dataset from a text file holding one ``word label`` pair per line.

    A line with fewer than two whitespace-separated fields (normally a blank
    line) closes the current sentence. A sentence that is still open when the
    file ends is returned as well, so a file without a trailing blank line no
    longer loses its last sentence. On lines with more than two fields only the
    first two are used.

    NOTE: this definition replaces the JSON Lines ``load_dataset`` defined in an
    earlier cell of the notebook.

    :param filepath: path to the .txt file
    :param concatenate_words: whether to keep the sample as a single string instance or a list of words
    :return: sentences, labels
    """
    def pack(items):
        # Either one space-joined string per sentence or the plain list.
        return ' '.join(items) if concatenate_words else items

    sentences = []
    labels = []

    words = []
    label = []
    with open(filepath, encoding='utf8') as f:
        for line in f:
            fields = line.split()
            if len(fields) >= 2:
                words.append(fields[0])
                label.append(fields[1])
                continue
            # Sentence boundary: store what was collected (possibly empty) and start over.
            sentences.append(pack(words))
            labels.append(pack(label))
            words = []
            label = []

    # Flush a final sentence that was not terminated by a blank line.
    if words:
        sentences.append(pack(words))
        labels.append(pack(label))

    return sentences, labels

def get_unique_tags(filepath):
    """
    Get all unique tags that are present in one .txt file

    Only lines with exactly two whitespace-separated fields are considered; the
    second field is the tag. A leading ``B-`` or ``I-`` prefix is removed (only
    at the start of the tag, so tag names that contain those characters further
    in are left intact); ``O`` is kept as is. The result is sorted so repeated
    runs return the tags in the same order.

    :param filepath: path to the .txt file
    :return: sorted list of tags
    """
    all_tags = set()
    with open(filepath, encoding='utf8') as f:
        for line in f:
            fields = line.split()
            if len(fields) != 2:
                continue
            tag = fields[1]
            if tag.startswith(('B-', 'I-')):
                tag = tag[2:]
            all_tags.add(tag)

    return sorted(all_tags)

def convert_cell_to_tag(cell):
    """
    Convert each cell of the dataframe to list of tags

    Only the first element of every tuple (a list of BIO tags) is read. The
    entry is represented by the type of its first tag with the leading
    ``B-``/``I-`` prefix removed; ``O`` stays ``O``. Taking the first tag keeps
    the result deterministic even if a span mixes several types.

    :param cell: List of tuples; the first element of each tuple is a non-empty list of BIO tags
    :return: List with one tag per tuple of the input cell
    """
    tags = []
    for my_tuple in cell:
        first_tag = my_tuple[0][0]
        if first_tag.startswith(('B-', 'I-')):
            first_tag = first_tag[2:]
        tags.append(first_tag)
    return tags

In [None]:
# Read back the gold and predicted label files as per-sentence word and tag lists,
# and collect the set of tag types that occur in the gold file.
all_words_true, all_tags_true = load_dataset(r'/content/checkpoints/test_true.txt')
unique_tags = get_unique_tags(r'/content/checkpoints/test_true.txt')
all_words_pred, all_tags_pred = load_dataset(r'/content/checkpoints/test_pred.txt')

In [None]:
import pandas as pd

class ErrorTypesGold:
    """Classify entity-level disagreements between gold and predicted BIO tags of one sentence.

    After ``check()`` has run, ``result`` maps every category to a list of
    ``(gold_tags, predicted_tags, words)`` tuples, all sliced to the same token range:

    * ``Num correct tags``    - the gold span is predicted with the same range and type
    * ``Wrong Tag``           - same range, different type
    * ``No Extraction``       - every predicted tag inside the gold span is ``O``
    * ``Wrong Range``         - range differs, but the gold type occurs in the prediction
    * ``Wrong Range and tag`` - range differs and the gold type is not predicted
    * ``No Annotation``       - a predicted span lies completely inside a gold ``O`` run

    Every gold span ends up in exactly one of the first five categories.
    """

    _ENTITY_PREFIXES = ('B-', 'I-')

    def __init__(self, tags_true, tags_pred, words_true, words_pred):
        """Store the sentence and pre-compute entity and ``O`` spans for both tag sequences.

        :param tags_true: gold BIO tags of the sentence
        :param tags_pred: predicted BIO tags of the sentence
        :param words_true: words aligned with ``tags_true``
        :param words_pred: words aligned with ``tags_pred`` (stored, not used by ``check``)
        """
        self.result = {'No Extraction': [], 'No Annotation': [], 'Wrong Tag': [], 'Wrong Range': [], 'Wrong Range and tag': [], 'Num correct tags': []}
        self.tags_true = tags_true
        self.tags_pred = tags_pred
        self.words_true = words_true
        self.words_pred = words_pred
        self.spans_tags_true, self.spans_O_tags_true = self._get_span_of(tags_true)
        self.spans_tags_pred, self.spans_O_tags_pred = self._get_span_of(tags_pred)

    @classmethod
    def _strip_prefix(cls, tag):
        """Return ``tag`` without a leading ``B-``/``I-`` prefix."""
        return tag[2:] if tag.startswith(cls._ENTITY_PREFIXES) else tag

    def _get_span_of(self, tags):
        """Split ``tags`` into entity spans and non-entity spans.

        :param tags: sequence of BIO tags
        :return: ``(entities, empty_spans)``; both are lists of index lists. An entity
            starts at a ``B-`` tag (or at an ``I-`` tag when no entity is open) and is
            closed by ``O`` or by the next ``B-``. ``empty_spans`` are the maximal runs
            of tags that carry no ``B-``/``I-`` prefix.
        """
        entities = []
        current = []
        for i, tag in enumerate(tags):
            if tag.startswith('I-'):
                current.append(i)
            elif tag.startswith('B-'):
                # A new entity begins; close the one that is still open, if any.
                if current:
                    entities.append(current)
                current = [i]
            elif tag == 'O' and current:
                entities.append(current)
                current = []
        if current:
            entities.append(current)

        empty_spans = []
        run = []
        for i, tag in enumerate(tags):
            if not tag.startswith(self._ENTITY_PREFIXES):
                run.append(i)
            elif run:
                empty_spans.append(run)
                run = []
        if run:
            empty_spans.append(run)

        return entities, empty_spans

    def _tag_types_in(self, tags, span, exclude_o_tag):
        """Return the sorted, de-duplicated tag types of ``tags`` inside ``span``."""
        start, end = span[0], span[-1] + 1
        window = tags[start: end]
        if exclude_o_tag:
            window = [tag for tag in window if tag != 'O']
        # Sorting makes the list order (and list equality) independent of set ordering.
        return sorted({self._strip_prefix(tag) for tag in window})

    def _get_tags_true_in(self, span, exclude_o_tag=False):
        """Tag types of the gold sequence inside ``span`` (first to last index, inclusive).

        :param span: non-empty list of token indices
        :param exclude_o_tag: drop ``O`` from the result when True
        :return: sorted list of distinct tag types without ``B-``/``I-`` prefix
        """
        return self._tag_types_in(self.tags_true, span, exclude_o_tag)

    def _get_tags_pred_in(self, span, exclude_o_tag=False):
        """Tag types of the predicted sequence inside ``span`` (first to last index, inclusive).

        :param span: non-empty list of token indices
        :param exclude_o_tag: drop ``O`` from the result when True
        :return: sorted list of distinct tag types without ``B-``/``I-`` prefix
        """
        return self._tag_types_in(self.tags_pred, span, exclude_o_tag)

    def _is_correct_range(self, span):
        """Return True when ``span`` is exactly one of the predicted entity spans."""
        return span in self.spans_tags_pred

    def _is_overlap(self, span, with_O_spans_tags_true=False):
        """Return False when ``span`` lies completely inside one reference span, True otherwise.

        :param span: list of token indices
        :param with_O_spans_tags_true: compare against the gold ``O`` runs instead of the
            predicted entity spans
        """
        reference = self.spans_O_tags_true if with_O_spans_tags_true else self.spans_tags_pred
        return not any(all(bound in that_span for bound in span) for that_span in reference)

    def check(self):
        """Fill ``self.result`` for this sentence and return ``self``.

        Each call appends to ``result``, so call it once per instance.
        """
        for span in self.spans_tags_true:
            start, end = span[0], span[-1] + 1
            raw_tag_true = self.tags_true[start: end]
            raw_tag_pred = self.tags_pred[start: end]
            raw_words = self.words_true[start: end]

            if self._is_correct_range(span):
                same_type = self._get_tags_pred_in(span) == self._get_tags_true_in(span)
                category = 'Num correct tags' if same_type else 'Wrong Tag'
            elif all(tag == 'O' for tag in raw_tag_pred):
                category = 'No Extraction'
            # From here on the prediction has entity tags in the gold span but its
            # boundaries differ (shorter, longer, shifted or split), so it is always a
            # range error; only the type decides which one.
            elif self._strip_prefix(raw_tag_true[0]) in self._get_tags_pred_in(span):
                category = 'Wrong Range'
            else:
                category = 'Wrong Range and tag'
            self.result[category].append((raw_tag_true, raw_tag_pred, raw_words))

        for span in self.spans_tags_pred:
            # A predicted entity that sits entirely inside a gold O run has no gold counterpart.
            if not self._is_overlap(span, with_O_spans_tags_true=True):
                start, end = span[0], span[-1] + 1
                self.result['No Annotation'].append(
                    (self.tags_true[start: end], self.tags_pred[start: end], self.words_true[start: end])
                )

        return self


##

# Re-read the gold and predicted label files so this part can be re-run on its own.
all_words_true, all_tags_true = load_dataset(r'/content/checkpoints/test_true.txt')
all_words_pred, all_tags_pred = load_dataset(r'/content/checkpoints/test_pred.txt')

errors = ['No Extraction', 'No Annotation', 'Wrong Range', 'Wrong Tag', 'Wrong Range and tag', 'Num correct tags']
# 'Num correct tags' is bookkeeping, not an error category.
error_columns = [error for error in errors if error != 'Num correct tags']

# One result dict per sentence; the frame is built once instead of growing it with
# pd.concat inside the loop.
sentence_results = []
for tags_true, tags_pred, words_true, words_pred in zip(all_tags_true, all_tags_pred, all_words_true, all_words_pred):
    error_types = ErrorTypesGold(tags_true, tags_pred, words_true, words_pred).check()
    sentence_results.append(error_types.result)

df = pd.DataFrame(sentence_results, columns=errors)
df.insert(0, 'Row', range(len(df)))
df = pd.concat([df, pd.DataFrame(data={'Sentence': all_words_true})], axis=1)
df.to_csv('df_error_types_phobert.csv', index=False)

# Print summary: number of recorded cases per error category. Only the error columns
# are visited, so the 'Sentence' column is no longer reported as if it were an error type.
for column in error_columns:
    print(column)
    print(df[column].map(len).sum())

##
# Same frame without the trailing 'Sentence' column; every cell becomes a list of tag types.
df_new = df.drop(columns='Sentence')

for error in errors:
    df_new[error] = df_new[error].map(convert_cell_to_tag)

##
# Create the per-tag frame. The tag list is fixed by hand and includes 'O', which is the
# type that convert_cell_to_tag yields for spans whose gold tags are all 'O'.
unique_tags = ['PATIENT_ID', 'NAME', 'AGE', 'GENDER', 'JOB', 'LOCATION', 'ORGANIZATION', 'SYMPTOM_AND_DISEASE', 'TRANSPORTATION', 'DATE', 'O']

# Count every case per tag and category, then assign the counts to the new frame.
tag_counts = []
for tag in unique_tags:
    tag_counts.append({error: df_new[error].map(lambda cell: cell.count(tag)).sum() for error in errors})

df2 = pd.DataFrame(tag_counts, columns=errors)
df2.insert(0, 'Tag', unique_tags)

# Create the Errors column: all categories except 'Num correct tags'.
num_errors = df2[error_columns].sum(axis=1).tolist()
df2['Errors'] = num_errors

# Create the Total column: errors plus correctly tagged spans.
totals = df2[errors].sum(axis=1).tolist()
df2['Total'] = totals

# Reorder the columns.
df2 = df2[['Tag', 'Total', 'Errors', 'No Extraction', 'No Annotation', 'Wrong Range', 'Wrong Tag', 'Wrong Range and tag']]

# Create the Total row (sum of the numeric columns only).
total_row = df2.drop(columns='Tag').sum(axis=0).to_dict()
total_row['Tag'] = 'Total'
df2 = pd.concat([df2, pd.DataFrame(total_row, index=[0])])

df2.to_csv('df_error_types.csv', index=False)

No Extraction
188
No Annotation
153
Wrong Range
224
Wrong Tag
104
Wrong Range and tag
13
Sentence
85678


# Demo

In [None]:
!pip install py_vncorenlp

Collecting py_vncorenlp
  Downloading py_vncorenlp-0.1.4.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius (from py_vncorenlp)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: py_vncorenlp
  Building wheel for py_vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for py_vncorenlp: filename=py_vncorenlp-0.1.4-py3-none-any.whl size=4307 sha256=8aa727eedc1e21b7a29839d5f2e96b5c09a1dcaa6c867be9f640925b913b0976
  Stored in directory: /root/.cache/pip/wheels/d5/d9/bf/62632cdb007c702a0664091e92a0bb1f18a2fcecbe962d9827
Successfully built py_vncorenlp
Installing collected packages: pyjnius, py_vncorenlp
Successfully installed py_vncorenlp-0.1.4 pyjnius-1.6.1


In [None]:
from seqeval.metrics.sequence_labeling import get_entities
import py_vncorenlp

ModuleNotFoundError: ignored

In [None]:
def convert_examples_to_features_for_demo(
    texts,
    max_seq_len,
    tokenizer,
    pad_token_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """Turn pre-split, unlabeled sentences into fixed-length model inputs.

    Each word is tokenized on its own so that the first sub-word of every word
    can be flagged; those flags are what later map predictions back to words.

    Args:
        texts: iterable of sentences, each an iterable of word strings.
        max_seq_len: length every output sequence is padded/truncated to,
            including the leading CLS and trailing SEP tokens.
        tokenizer: object exposing ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize(word)`` and
            ``convert_tokens_to_ids(tokens)``.
        pad_token_label_id: accepted for signature compatibility; not used.
        cls_token_segment_id: accepted for signature compatibility; not used.
        pad_token_segment_id: accepted for signature compatibility; not used.
        mask_padding_with_zero: when True the attention mask is 1 on real
            tokens and 0 on padding; when False the two values are swapped.

    Returns:
        A list with one dict per sentence holding ``'input_ids'``,
        ``'attention_mask'`` and ``'val_pos_list'`` (True only at the first
        sub-word of each word), each a list of length ``max_seq_len``.

    Side effects:
        Prints the longest sub-word count seen before truncation.
    """
    # Special tokens / ids are read from the tokenizer once, up front.
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    real_flag, pad_flag = (1, 0) if mask_padding_with_zero else (0, 1)
    # Room left for sub-words once CLS and SEP have been reserved.
    room = max_seq_len - 2

    longest = 0
    features = []

    for words in texts:
        pieces = []
        starts_word = []
        for word in words:
            # A word the tokenizer cannot encode is replaced by the unknown token.
            sub_tokens = tokenizer.tokenize(word) or [unk_token]
            pieces += sub_tokens
            starts_word += [True] + [False] * (len(sub_tokens) - 1)

        # Track the untruncated length, without special tokens.
        longest = max(longest, len(pieces))

        # Truncate to the available room, then wrap in CLS ... SEP; the special
        # tokens never count as valid word positions.
        pieces = [cls_token] + pieces[:room] + [sep_token]
        starts_word = [False] + starts_word[:room] + [False]

        ids = tokenizer.convert_tokens_to_ids(pieces)
        n_pad = max_seq_len - len(ids)

        # Pad all three sequences on the right up to max_seq_len.
        input_ids = ids + [pad_token_id] * n_pad
        attention_mask = [real_flag] * len(ids) + [pad_flag] * n_pad
        val_pos_list = starts_word + [False] * n_pad

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )
        assert len(val_pos_list) == max_seq_len, "Error with valid position list length {} vs {}".format(
            len(val_pos_list), max_seq_len
        )

        features.append(
            {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'val_pos_list': val_pos_list,
            }
        )

    print(f'Max length after splitting into subwords: {longest}')

    return features

def preprocess_texts(texts, tokenizer, ignore_index, max_seq_len):
    """Build a TensorDataset of model inputs from pre-split demo sentences.

    Args:
        texts: iterable of sentences, each an iterable of word strings.
        tokenizer: tokenizer handed through to
            ``convert_examples_to_features_for_demo``.
        ignore_index: forwarded as ``pad_token_label_id`` (the loss
            ignore-index used as the padding label id).
        max_seq_len: fixed length of every encoded sequence.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, val_pos_list)``; the first two are
        ``torch.long`` tensors and the last is ``torch.bool``.
    """
    encoded = convert_examples_to_features_for_demo(
        texts, max_seq_len, tokenizer, pad_token_label_id=ignore_index
    )

    def _stack(key, dtype):
        # Gather one field from every feature dict into a single tensor.
        return torch.tensor([feature[key] for feature in encoded], dtype=dtype)

    return TensorDataset(
        _stack('input_ids', torch.long),
        _stack('attention_mask', torch.long),
        _stack('val_pos_list', torch.bool),
    )

def infer(
    model,
    device,
    dataset,
    batch_size=32,
):
    """Predict one slot label per word for every example in ``dataset``.

    Args:
        model: callable invoked as ``model(input_ids=..., attention_mask=...)``;
            the element at index 1 of its output is used as the slot logits
            (the element at index 0 is ignored). ``model.eval()`` is called
            before inference.
        device: torch device the input tensors are moved to.
        dataset: dataset yielding ``(input_ids, attention_mask, val_pos_list)``
            items, as built by ``preprocess_texts``.
        batch_size: number of examples per forward pass.

    Returns:
        A list with one entry per example; each entry is the list of predicted
        label strings at the positions flagged in ``val_pos_list``. Labels are
        looked up in the notebook-level ``slot_label_lst``. An empty dataset
        yields an empty list.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size)
    print("***** Running inference *****")
    print("  Num examples = ", len(dataset))
    print("  Batch size = ", batch_size)

    pred_chunks = []
    val_pos_chunks = []
    model.eval()

    for batch in tqdm(dataloader, desc="Inference"):
        # The valid-position flags are only needed on the CPU afterwards, so
        # they are not moved to the device.
        val_pos_list = batch[2]
        input_ids, attention_mask = (t.to(device) for t in batch[:2])
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        slot_logits = outputs[1]

        # Reduce to label ids per batch (argmax over the last of three axes) so
        # the full logits are never accumulated, and collect the pieces in
        # lists instead of re-copying a growing array with np.append.
        pred_chunks.append(slot_logits.detach().cpu().numpy().argmax(axis=2))
        val_pos_chunks.append(val_pos_list.detach().cpu().numpy())

    # No batches (empty dataset): nothing to decode.
    if not pred_chunks:
        return []

    slot_preds = np.concatenate(pred_chunks, axis=0)
    all_val_pos_list = np.concatenate(val_pos_chunks, axis=0)
    slot_label_map = {i: label for i, label in enumerate(slot_label_lst)}

    # Keep only the predictions at flagged (truthy) positions of each row.
    return [
        [slot_label_map[label_id] for label_id in row_preds[np.flatnonzero(row_flags)]]
        for row_preds, row_flags in zip(slot_preds, all_val_pos_list)
    ]

In [None]:
# Automatically download VnCoreNLP components from the original repository
# and save them in some local machine folder
VNCORENLP_DIR = '/content/vncorenlp'

# exist_ok=True keeps this cell re-runnable: creating the folder no longer
# complains when it is already there from a previous run.
os.makedirs(VNCORENLP_DIR, exist_ok=True)

try:
    py_vncorenlp.download_model(save_dir=VNCORENLP_DIR)
except FileExistsError:
    # NOTE(review): the recorded run of this cell ended in FileExistsError,
    # presumably raised by the download step when files from an earlier run are
    # already on disk — confirm against py_vncorenlp. Reuse the existing files;
    # if they are incomplete, loading the annotator below will still fail loudly.
    print(f'VnCoreNLP files already present in {VNCORENLP_DIR}; skipping download.')

# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=VNCORENLP_DIR)

mkdir: cannot create directory ‘/content/vncorenlp’: File exists


FileExistsError: ignored

In [None]:
# Two raw Vietnamese sentences used as input for the demo; word segmentation
# is applied to them in the next cell.
texts = ['Bệnh nhân Trần Thanh Linh đi đến bến xe miền Tây',
         'Bác sĩ Trần Thanh Linh , từ Bệnh viện Chợ Rẫy chi viện phụ trách đơn nguyên hồi sức tích cực, cho biết "bệnh nhân 416" vẫn đang duy trì ECMO, thở máy, hiện xơ phổi rất nhiều.']
# Display the raw inputs.
texts

['Bệnh nhân Trần Thanh Linh đi đến bến xe miền Tây',
 'Bác sĩ Trần Thanh Linh , từ Bệnh viện Chợ Rẫy chi viện phụ trách đơn nguyên hồi sức tích cực, cho biết "bệnh nhân 416" vẫn đang duy trì ECMO, thở máy, hiện xơ phổi rất nhiều.']

In [None]:
# Word-segment each raw text with VnCoreNLP and join the returned pieces with
# single spaces (in the recorded output, the syllables of a multi-syllable word
# come back linked by "_").
# NOTE: this rebinds `texts`, so re-running the cell feeds the already
# segmented strings back into the segmenter.
texts = [' '.join(rdrsegmenter.word_segment(text)) for text in texts]
# Display the segmented sentences.
texts

['Bệnh_nhân Trần_Thanh_Linh đi đến bến_xe miền Tây',
 'Bác_sĩ Trần_Thanh_Linh , từ Bệnh_viện Chợ_Rẫy chi_viện phụ_trách đơn_nguyên hồi_sức tích_cực , cho biết " bệnh_nhân 416 " vẫn đang duy_trì ECMO , thở máy , hiện xơ phổi rất nhiều .']

In [None]:
# Split every segmented sentence on whitespace into its list of words.
# NOTE: `texts` is rebound from strings to lists here, so this cell cannot be
# re-run on its own (a list has no .split()); re-run from the cell that defines
# `texts` instead.
texts = [text.split() for text in texts]
# Encode the word lists into a dataset for the model. `tokenizer`, `ignore_index`
# and `max_seq_len` are not defined in this cell; they must already exist in the kernel.
inputs = preprocess_texts(texts, tokenizer=tokenizer, ignore_index=ignore_index, max_seq_len=max_seq_len)

Max length after splitting into subwords: 35


In [None]:
# Run the model over the demo dataset with a batch size of 2; `infer` returns
# one list of predicted labels per input text.
all_preds = infer(model, device, inputs, 2)
# Group each label sequence into entities with seqeval's get_entities; in the
# recorded output each entity is a (type, start index, end index) tuple.
all_entities = [get_entities(pred) for pred in all_preds]
# Display the entities found in the second text.
all_entities[1]

***** Running inference *****
  Num examples =  2
  Batch size =  2


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

[('LOCATION', 4, 5), ('PATIENT_ID', 16, 16), ('SYMPTOM_AND_DISEASE', 27, 30)]

In [None]:
# Print every detected entity next to the words it covers, one section per text.
for idx, (entities, text) in enumerate(zip(all_entities, texts)):
    print(f'===== Text {idx} ====')
    for entity in entities:
        # Each entity is (label, first word index, last word index); the last
        # index is inclusive, hence the + 1 when slicing.
        label, first, last = entity
        span = ' '.join(text[first: last + 1])
        print(f"{span} ==> {label}")

===== Text 0 ====
Trần_Thanh_Linh ==> NAME
bến_xe miền Tây ==> LOCATION
===== Text 1 ====
Bệnh_viện Chợ_Rẫy ==> LOCATION
416 ==> PATIENT_ID
xơ phổi rất nhiều ==> SYMPTOM_AND_DISEASE


 'Bác_sĩ Trần_Thanh_Linh , từ Bệnh_viện Chợ_Rẫy chi_viện phụ_trách đơn_nguyên hồi_sức tích_cực , cho biết " bệnh_nhân 416 " vẫn đang duy_trì ECMO , thở máy , hiện xơ phổi rất nhiều .']