## 1. Import lib and Load Dataset

In [1]:
# install library
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
# import library
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
import nltk
nltk.download('treebank')

import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_api_key")
wandb.login(key=secret_value_0)

# load tree bank dataset
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(" Number of samples :", len(tagged_sentences))

[nltk_data] Downloading package treebank to /usr/share/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmoonlig73[0m ([33mminhdeptrai[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


 Number of samples : 3914


In [3]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

sentences[0]

array(['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join',
       'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.',
       '29', '.'], dtype='<U12')

### Create label mapping

In [4]:
def get_label_mapping(sentence_tags: List[List[str]]):
    tags = set()
    for sen_tags in sentence_tags:
        for tag in sen_tags:
            tags.add(tag)

    label2id = {tag: i for i, tag in enumerate(tags)}
    label2id['<PAD>'] = len(label2id)
    id2label = {i: tag for tag, i in label2id.items()}
    return label2id, id2label

In [5]:
label2id, id2label = get_label_mapping(sentence_tags)

## Setup DataLoader

In [6]:
# 0.7 - 0.15 - 0.15
train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, sentence_tags, test_size=0.3)
test_sentences, val_sentences, test_tags, val_tags = train_test_split(
    test_sentences, test_tags, test_size=0.5)

In [7]:
# tokenization
from transformers import AutoTokenizer
from torch.utils.data import Dataset

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # use_fast để sử dụng tokenize nhanh
    use_fast=True
)

MAX_LENGTH = 256

class postagging_dataset(Dataset):
    def __init__(self, sentences: List[List[str]], tags: List[List[str]], tokenizer, label2id, max_length = MAX_LENGTH):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tag = self.tags[idx]

        encode_sen = self.tokenizer.convert_tokens_to_ids(sentence)
        encode_tag = [self.label2id[t] for t in tag]
        attention_mask = [1] * len(encode_sen)

        return {
            "input_ids": self.pad_and_truncate(encode_sen, pad_id=self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(encode_tag, pad_id=self.label2id["<PAD>"]),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0)
        }
    
    def pad_and_truncate(self, encoded, pad_id):
        if len(encoded) < self.max_length:
            padding = [pad_id] * (self.max_length - len(encoded)) # ví pad_id là số , cần đặt ngoặc vuông ở ngoài khi nhân thì thành list , lúc đó mới  '+' được
            encoded = encoded + padding
        else:
            encoded = encoded[:self.max_length]
        return encoded  

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
train_data = postagging_dataset(train_sentences, train_tags, tokenizer, label2id)
val_data = postagging_dataset(
    val_sentences, val_tags, tokenizer, label2id)
test_data = postagging_dataset(
    train_sentences, train_tags, tokenizer, label2id)

## Modeling

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "QCRI/bert-base-multilingual-cased-pos-english"

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), ignore_mismatched_sizes=True
)

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([46, 768]) in the checkpoint and torch.Size([47, 768]) in the model i

## Metrics

#### ví dụ về ignore 

In [10]:
import numpy as np

ignore_label = 5
labels = np.array([
    [0, 1, 2, 5, 5],
    [3, 4, 0, 1, 5],
    [2, 3, 4, 0, 1]
])

predictions_logits = np.array([  # Original logits (before argmax) - let's rename for clarity
    [
        [0.1, 0.8, 0.2, 0.05, 0.05],
        [0.6, 0.1, 0.1, 0.1, 0.1],
        [0.0, 0.0, 0.7, 0.2, 0.1],
        # Logits for padding token (doesn't really matter)
        [0.2, 0.2, 0.2, 0.2, 0.2],
        [0.2, 0.2, 0.2, 0.2, 0.2]  # Logits for padding token
    ],
    [
        [0.2, 0.3, 0.1, 0.4, 0.0],
        [0.1, 0.0, 0.8, 0.05, 0.05],
        [0.5, 0.2, 0.1, 0.1, 0.1],
        [0.2, 0.2, 0.2, 0.2, 0.2],  # Logits for padding token
        [0.2, 0.2, 0.2, 0.2, 0.2]  # Logits for padding token
    ],
    [
        [0.2, 0.3, 0.1, 0.4, 0.0],
        [0.1, 0.0, 0.8, 0.05, 0.05],
        [0.5, 0.2, 0.1, 0.1, 0.1],
        [0.2, 0.2, 0.2, 0.2, 0.2],
        [0.2, 0.2, 0.2, 0.2, 0.2]
    ]
])


mask = labels != ignore_label
predicted_classes = np.argmax(predictions_logits, axis=-1)  # Apply argmax

print("Mask:\n", mask)
print("\nPredicted Classes (after argmax):\n", predicted_classes)
print("\nLabels:\n", labels)

masked_predictions = predicted_classes[mask]
masked_labels = labels[mask]

print("\nMasked Predictions:\n", masked_predictions)
print("\nMasked Labels:\n", masked_labels)

Mask:
 [[ True  True  True False False]
 [ True  True  True  True False]
 [ True  True  True  True  True]]

Predicted Classes (after argmax):
 [[1 0 2 0 0]
 [3 2 0 0 0]
 [3 2 0 0 0]]

Labels:
 [[0 1 2 5 5]
 [3 4 0 1 5]
 [2 3 4 0 1]]

Masked Predictions:
 [1 0 2 3 2 0 0 3 2 0 0 0]

Masked Labels:
 [0 1 2 3 4 0 1 2 3 4 0 1]


### Metrics

In [11]:
accuracy = evaluate.load("accuracy")

# vì mình đạt pad = len(label2id) nên mình sẽ bỏ qua label này
ignore_label = label2id["<PAD>"]

def compute_metrics(eval_pred):
    print(eval_pred)
    predictions, labels = eval_pred
    # bỏ qua padding
    mask = labels != ignore_label
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Trainer

In [12]:
from transformers import TrainingArguments, Trainer
import shutil
import os

output_dir = "/kaggle/working/out_dir"
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(output_dir, exist_ok=True)
print(f"Đã xóa và tạo lại thư mục đầu ra: {output_dir}")

training_args = TrainingArguments(
    output_dir="/kaggle/working/out_dir",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="no",
    save_strategy="no",
    load_best_model_at_end=True,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Đã xóa và tạo lại thư mục đầu ra: /kaggle/working/out_dir


  trainer = Trainer(
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250204_162754-bqspr4z5[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m/kaggle/working/out_dir[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/minhdeptrai/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/minhdeptrai/huggingface/runs/bqspr4z5[0m


Step,Training Loss
500,0.1681


TrainOutput(global_step=860, training_loss=0.11466086742489837, metrics={'train_runtime': 793.3096, 'train_samples_per_second': 34.526, 'train_steps_per_second': 1.084, 'total_flos': 3579914951838720.0, 'train_loss': 0.11466086742489837, 'epoch': 10.0})

## Inferences

In [13]:
# tokenization
test_sentence = "We are exploring the topic of deep learning "
input = torch.as_tensor(
    [tokenizer.convert_tokens_to_ids(test_sentence.split())])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input = input.to(device)

# prediction
outputs = model(input)
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()

# decode
pred_tags = ""
for pred in preds:
    pred_tags += id2label[pred] + " "
pred_tags  # = > PRP VBP RB DT NN IN JJ NN

'DT VBP RB DT NN IN RB VBG '