## 1. Import lib and Load Dataset

In [2]:
# install library
# pip install evaluate

# import library
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
import nltk
nltk.download('treebank')


# load tree bank dataset
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(" Number of samples :", len(tagged_sentences))

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


 Number of samples : 3914


In [3]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

sentences[0]

array(['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join',
       'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.',
       '29', '.'], dtype='<U12')

### Create label mapping

In [4]:
def get_label_mapping(sentence_tags: List[List[str]]):
    tags = set()
    for sen_tags in sentence_tags:
        for tag in sen_tags:
            tags.add(tag)

    label2id = {tag: i for i, tag in enumerate(tags)}
    label2id['<PAD>'] = len(label2id)
    id2label = {i: tag for tag, i in label2id.items()}
    return label2id, id2label

In [5]:
label2id, id2label = get_label_mapping(sentence_tags)

## Setup DataLoader

In [6]:
# 0.7 - 0.15 - 0.15
train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, sentence_tags, test_size=0.3)
test_sentences, val_sentences, test_tags, val_tags = train_test_split(
    test_sentences, test_tags, test_size=0.5)

In [7]:
# tokenization
from transformers import AutoTokenizer
from torch.utils.data import Dataset

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # use_fast để sử dụng tokenize nhanh
    use_fast=True
)

MAX_LENGTH = 256

class postagging_dataset(Dataset):
    def __init__(self, sentences: List[List[str]], tags: List[List[str]], tokenizer, label2id, max_length = MAX_LENGTH):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tag = self.tags[idx]

        encode_sen = self.tokenizer.convert_tokens_to_ids(sentence)
        encode_tag = [self.label2id[t] for t in tag]
        attention_mask = [1] * len(encode_sen)

        return {
            "input_ids": self.pad_and_truncate(encode_sen, pad_id=self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(encode_tag, pad_id=self.label2id["<PAD>"]),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0)
        }
    
    def pad_and_truncate(self, encoded, pad_id):
        if len(encoded) < self.max_length:
            padding = [pad_id] * (self.max_length - len(encoded))
            encoded = encoded + padding
        else:
            encoded = encoded[:self.max_length]
        return encoded  

In [8]:
train_data = postagging_dataset(train_sentences, train_tags, tokenizer, label2id)
val_data = postagging_dataset(
    val_sentences, val_tags, tokenizer, label2id)
test_data = postagging_dataset(
    train_sentences, train_tags, tokenizer, label2id)

## Modeling

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([46, 768]) in the checkpoint and torch.Size([47, 768]) in the model i

## Metrics

#### ví dụ về ignore 

In [39]:
import numpy as np

ignore_label = 5
labels = np.array([
    [0, 1, 2, 5, 5],
    [3, 4, 0, 1, 5],
    [2, 3, 4, 0, 1]
])

predictions_logits = np.array([  # Original logits (before argmax) - let's rename for clarity
    [
        [0.1, 0.8, 0.2, 0.05, 0.05],
        [0.6, 0.1, 0.1, 0.1, 0.1],
        [0.0, 0.0, 0.7, 0.2, 0.1],
        # Logits for padding token (doesn't really matter)
        [0.2, 0.2, 0.2, 0.2, 0.2],
        [0.2, 0.2, 0.2, 0.2, 0.2]  # Logits for padding token
    ],
    [
        [0.2, 0.3, 0.1, 0.4, 0.0],
        [0.1, 0.0, 0.8, 0.05, 0.05],
        [0.5, 0.2, 0.1, 0.1, 0.1],
        [0.2, 0.2, 0.2, 0.2, 0.2],  # Logits for padding token
        [0.2, 0.2, 0.2, 0.2, 0.2]  # Logits for padding token
    ],
    [
        [0.2, 0.3, 0.1, 0.4, 0.0],
        [0.1, 0.0, 0.8, 0.05, 0.05],
        [0.5, 0.2, 0.1, 0.1, 0.1],
        [0.2, 0.2, 0.2, 0.2, 0.2],
        [0.2, 0.2, 0.2, 0.2, 0.2]
    ]
])


mask = labels != ignore_label
predicted_classes = np.argmax(predictions_logits, axis=-1)  # Apply argmax

print("Mask:\n", mask)
print("\nPredicted Classes (after argmax):\n", predicted_classes)
print("\nLabels:\n", labels)

masked_predictions = predicted_classes[mask]
masked_labels = labels[mask]

print("\nMasked Predictions:\n", masked_predictions)
print("\nMasked Labels:\n", masked_labels)

Mask:
 [[ True  True  True False False]
 [ True  True  True  True False]
 [ True  True  True  True  True]]

Predicted Classes (after argmax):
 [[1 0 2 0 0]
 [3 2 0 0 0]
 [3 2 0 0 0]]

Labels:
 [[0 1 2 5 5]
 [3 4 0 1 5]
 [2 3 4 0 1]]

Masked Predictions:
 [1 0 2 3 2 0 0 3 2 0 0 0]

Masked Labels:
 [0 1 2 3 4 0 1 2 3 4 0 1]


### Metrics

In [11]:
accuracy = evaluate.load("accuracy")

# vì mình đạt pad = len(label2id) nên mình sẽ bỏ qua label này
ignore_label = label2id["<PAD>"]

def compute_metrics(eval_pred):
    print(eval_pred)
    predictions, labels = eval_pred
    # bỏ qua padding
    mask = labels != ignore_label
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

## Trainer

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmoonlig73[0m ([33mminhdeptrai[0m). Use [1m`wandb login --relogin`[0m to force relogin


  4%|▍         | 75/1720 [10:58<4:16:18,  9.35s/it]

KeyboardInterrupt: 

## Inferences

In [None]:
# tokenization
test_sentence = "We are exploring the topic of deep learning "
input = torch.as_tensor(
    [tokenizer.convert_tokens_to_ids(test_sentence.split())])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input = input.to(device)

# prediction
outputs = model(input)
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()

# decode
pred_tags = ""
for pred in preds:
    pred_tags += id2label[pred] + " "
pred_tags  # = > PRP VBP RB DT NN IN JJ NN

TokenClassifierOutput(loss=None, logits=tensor([[[-2.2665e-01, -1.1905e+00,  6.3013e-01, -3.3882e-01, -4.6584e-01,
           3.5006e-01,  3.1260e-01, -2.9735e-05,  1.1614e-01, -3.4921e-01,
          -8.9358e-02, -2.4233e-01, -6.4159e-01,  6.0555e-01, -8.0269e-02,
          -1.8956e-01, -6.1509e-01, -3.6126e-01,  2.7667e-01, -6.5976e-02,
          -2.8197e-02, -2.8431e-01, -9.6452e-02, -1.3094e-02, -4.0846e-01,
          -3.1601e-01, -1.7764e-01, -3.1447e-01,  1.1169e+00, -3.6802e-01,
          -3.9152e-01,  5.3389e-02, -1.3965e-01, -4.8342e-01, -4.3083e-01,
          -2.9877e-01,  1.1248e-01, -1.0130e-01, -3.6021e-01, -7.0039e-01,
          -9.2520e-02,  3.9546e-02,  3.0646e-01,  9.2590e-02, -5.4446e-01,
          -5.4229e-01,  5.4034e+00],
         [-7.3233e-01, -1.0640e+00,  3.1379e-01, -2.4655e-01, -4.2472e-01,
           1.7328e+00,  6.7520e-01,  1.7756e-01,  1.2829e+00, -6.3129e-01,
          -6.6374e-02, -5.1448e-01, -1.3419e-01,  8.6598e-01,  9.4508e-01,
          -3.3203e-01, 

'<PAD> NNS -NONE- DT NN IN JJ NN '