In [36]:
#tải các thư viện cần thiết
!pip install transformer
!pip install seqeval

[31mERROR: Could not find a version that satisfies the requirement transformer (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for transformer[0m[31m


In [37]:
#import các thư viện cần thiết

import numpy as np
import pandas as pd
import torch

#Embedding
from transformers import BertTokenizerFast, BertForTokenClassification, BertModel

#Modeling
from torch.utils.data import DataLoader
from torch.optim import SGD, Adam
from seqeval.metrics import classification_report
from tqdm import tqdm

In [38]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

In [39]:
#hàm đọc dữ liệu
def read_dataset(file_path):
    tokens=[]
    ner_tags=[]
    ids=[]
    count=1
    with open(file_path) as f:
        lines=f.readlines()
        ts=[]
        nts=[]
        for line in lines:
            line = line.split()
            if len(line)==0:
                ids.append(count)
                tokens.append(ts)
                ner_tags.append(nts)
                ts=[]
                nts=[]
                count+=1
            else:
                ts.append(line[0])
                nts.append(line[-1])
    data = pd.DataFrame({'Id':ids, 'NER_tags':ner_tags, 'Tokens':tokens})
    return data

In [40]:
def get_NER_labels(data):
    NERs = list(data['NER_tags'].values)
    labels_list = []
    for value in NERs:
        labels_list = labels_list + value
    types = list(set(labels_list))
    return types

In [41]:
train_df = read_dataset('/kaggle/input/covid19vi/syllable/train_syllable.conll')
val_df = read_dataset('/kaggle/input/covid19vi/syllable/dev_syllable.conll')

In [42]:
train_df.head()

Unnamed: 0,Id,NER_tags,Tokens
0,1,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Đồng, thời, ,, bệnh, viện, tiếp, tục, thực, h..."
1,2,"[O, O, O, O, O, O, O, O, O, O, O, B-SYMPTOM_AN...","["", Số, bệnh, viện, có, thể, tiếp, nhận, bệnh,..."
2,3,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Ngoài, ra, ,, những, người, tiếp, xúc, gián, ..."
3,4,"[O, O, O, O, O, O, O, B-LOCATION, O, B-LOCATIO...","[Bà, này, khi, trở, về, quá, cảnh, Doha, (, Qa..."
4,5,"[O, O, O, B-PATIENT_ID, O, O, O, O, O, O, O, B...","["", Bệnh, nhân, 523, "", và, chồng, là, "", bệnh..."


In [43]:
unique_labels = get_NER_labels(train_df)

In [44]:
unique_labels

['I-PATIENT_ID',
 'B-AGE',
 'I-SYMPTOM_AND_DISEASE',
 'B-LOCATION',
 'O',
 'I-NAME',
 'B-PATIENT_ID',
 'I-LOCATION',
 'I-TRANSPORTATION',
 'I-JOB',
 'B-DATE',
 'I-GENDER',
 'B-JOB',
 'B-GENDER',
 'B-TRANSPORTATION',
 'I-AGE',
 'B-ORGANIZATION',
 'I-DATE',
 'I-ORGANIZATION',
 'B-SYMPTOM_AND_DISEASE',
 'B-NAME']

# **TẠO LỚP DATASET BẰNG PYTORCH DATASET**

In [45]:
def align_label(text, labels, flag=False):
    label_all_tokens = flag #flag xác định cách thực hiện align_label
    
    tokenized_input = tokenizer(text, padding='max_length', max_length=512, truncation=True, is_split_into_words=True)

    word_ids = tokenized_input.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSet(torch.utils.data.Dataset):

    def __init__(self, df, flag_align_label=False):

        lb = df['NER_tags'].values.tolist()
        txt = df['Tokens'].values.tolist()
        self.texts = [tokenizer(i, padding='max_length', max_length = 512,
                                truncation=True, return_tensors="pt", is_split_into_words=True) for i in txt]
        self.labels = [align_label(i,j,flag_align_label) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_data(self, idx):
        return self.texts[idx]

    def get_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        data = self.get_data(idx)
        labels = self.get_labels(idx)

        return data, labels

**Tạo 2 dictionary để xác định id nào sẽ là label nào và ngược lại.**

In [46]:
labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

In [47]:
labels_to_ids

{'I-PATIENT_ID': 0,
 'B-AGE': 1,
 'I-SYMPTOM_AND_DISEASE': 2,
 'B-LOCATION': 3,
 'O': 4,
 'I-NAME': 5,
 'B-PATIENT_ID': 6,
 'I-LOCATION': 7,
 'I-TRANSPORTATION': 8,
 'I-JOB': 9,
 'B-DATE': 10,
 'I-GENDER': 11,
 'B-JOB': 12,
 'B-GENDER': 13,
 'B-TRANSPORTATION': 14,
 'I-AGE': 15,
 'B-ORGANIZATION': 16,
 'I-DATE': 17,
 'I-ORGANIZATION': 18,
 'B-SYMPTOM_AND_DISEASE': 19,
 'B-NAME': 20}

# **BUILD MODEL BERT FOR TOKEN CLASSIFICATION**

In [48]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

# **TRAIN MODEL**

In [49]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [50]:
def train_loop(model, train_df, val_df, flag_align_label):

    train_dataset = DataSet(train_df, flag_align_label)
    val_dataset = DataSet(val_df, flag_align_label)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()
        
    min_val_loss = 1000
    count = 0
    
    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][train_label[i] != -100]
                label_clean = train_label[i][train_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_train += acc
                total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][val_label[i] != -100]
                label_clean = val_label[i][val_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_val += acc
                total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(val_df)
        val_loss = total_loss_val / len(val_df)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(train_df): .3f} | Accuracy: {total_acc_train / len(train_df): .3f} | Val_Loss: {total_loss_val / len(val_df): .3f} | Accuracy: {total_acc_val / len(val_df): .3f}')
        if val_loss < min_val_loss:
            min_val_loss = val_loss
            torch.save(model.state_dict(), 'mbert_ner')
            count = epoch_num
        if epoch_num - count >= 5:
            return
        
LEARNING_RATE = 5e-5
EPOCHS = 30
BATCH_SIZE = 8

model = BertModel()
train_loop(model, train_df, val_df, False)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 1 | Loss:  0.153 | Accuracy:  0.961 | Val_Loss:  0.096 | Accuracy:  0.974


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 2 | Loss:  0.063 | Accuracy:  0.983 | Val_Loss:  0.104 | Accuracy:  0.976


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 3 | Loss:  0.043 | Accuracy:  0.989 | Val_Loss:  0.102 | Accuracy:  0.978


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 4 | Loss:  0.039 | Accuracy:  0.990 | Val_Loss:  0.109 | Accuracy:  0.976


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 5 | Loss:  0.030 | Accuracy:  0.993 | Val_Loss:  0.089 | Accuracy:  0.979


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 6 | Loss:  0.028 | Accuracy:  0.993 | Val_Loss:  0.099 | Accuracy:  0.980


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 7 | Loss:  0.028 | Accuracy:  0.993 | Val_Loss:  0.109 | Accuracy:  0.976


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 8 | Loss:  0.026 | Accuracy:  0.993 | Val_Loss:  0.102 | Accuracy:  0.979


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 9 | Loss:  0.022 | Accuracy:  0.994 | Val_Loss:  0.107 | Accuracy:  0.978


100%|██████████| 629/629 [04:36<00:00,  2.28it/s]


Epochs: 10 | Loss:  0.017 | Accuracy:  0.995 | Val_Loss:  0.108 | Accuracy:  0.978


# **EVALUATE MODEL**

In [51]:
test_df = read_dataset('/kaggle/input/covid19vi/syllable/test_syllable.conll')

In [52]:
def evaluate(model, test_df, flag_align_label, ids_to_labels):

    test_dataset = DataSet(test_df, flag_align_label)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    predictions = []
    labels = []
    
    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):
                cleaned_logits = logits[i][test_label[i] != -100].argmax(dim=1)
                predictions.append([ids_to_labels[val.item()] for val in cleaned_logits])
                cleaned_labels = test_label[i][test_label[i] != -100]
                labels.append([ids_to_labels[val.item()] for val in cleaned_labels])
    print(classification_report(y_pred=predictions, y_true=labels, digits=3))

evaluate(model, test_df, False, ids_to_labels)

                     precision    recall  f1-score   support

                AGE      0.964     0.957     0.960       582
               DATE      0.978     0.984     0.981      1654
             GENDER      0.965     0.957     0.961       462
                JOB      0.671     0.636     0.653       173
           LOCATION      0.913     0.929     0.921      4441
               NAME      0.920     0.934     0.927       318
       ORGANIZATION      0.790     0.844     0.816       771
         PATIENT_ID      0.968     0.984     0.976      2005
SYMPTOM_AND_DISEASE      0.813     0.787     0.800      1136
     TRANSPORTATION      0.934     0.959     0.946       193

          micro avg      0.915     0.926     0.920     11735
          macro avg      0.892     0.897     0.894     11735
       weighted avg      0.915     0.926     0.920     11735



# **INFERENCE**

In [53]:
def align_word_ids(text, flag):
    label_all_tokens = flag
    
    text = text.split()
  
    tokenized_inputs = tokenizer(text, padding='max_length', max_length=512, truncation=True, is_split_into_words=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In [54]:
def ner(model, sentence, flag_align_label):


    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence, flag_align_label)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)

In [55]:
ner(model,
    'Bệnh nhân nhập viện tối qua ở Bệnh Viện 115 là bệnh nhân thứ 82 , di chuyển qua nhiều thành phố bằng xe biển hiệu E-402',
    flag_align_label=False)

Bệnh nhân nhập viện tối qua ở Bệnh Viện 115 là bệnh nhân thứ 82 , di chuyển qua nhiều thành phố bằng xe biển hiệu E-402
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'B-PATIENT_ID', 'O', 'O', 'O', 'O', 'B-PATIENT_ID', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TRANSPORTATION']
