In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import f1_score

In [2]:
def calculate_f1(predictions, true_labels):
    return f1_score(true_labels, predictions, average='weighted')

In [3]:
def read_data(file_path):
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                parts = line.split('\t')
                if len(parts) >= 4:
                    word, _, _, label = parts[:4]
                    current_sentence.append(word)
                    current_labels.append(label)
            else:
                if current_sentence:
                    sentences.append(' '.join(current_sentence))
                    labels.append(current_labels)
                current_sentence = []
                current_labels = []

    if current_sentence:  # Thêm câu cuối cùng nếu có
        sentences.append(' '.join(current_sentence))
        labels.append(current_labels)

    return sentences, labels

In [4]:
class NERDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        sentence = self.sentences[item]
        label = self.labels[item]

        words = sentence.split()
        word_labels = label

        # Tokenize từng từ và tạo nhãn tương ứng
        tokenized_inputs = []
        labels = []
        for word, word_label in zip(words, word_labels):
            word_tokens = self.tokenizer.tokenize(word)
            n_subwords = len(word_tokens)
            tokenized_inputs.extend(word_tokens)
            labels.extend([label_map[word_label]] * n_subwords)

        # Cắt ngắn hoặc đệm nếu cần
        tokenized_inputs = tokenized_inputs[:self.max_len - 2]  # Để có chỗ cho [CLS] và [SEP]
        labels = labels[:self.max_len - 2]

        # Thêm tokens đặc biệt
        tokenized_inputs = ["[CLS]"] + tokenized_inputs + ["[SEP]"]
        labels = [-100] + labels + [-100]  # -100 là giá trị bỏ qua cho loss

        # Đệm nếu cần
        padding_length = self.max_len - len(tokenized_inputs)
        tokenized_inputs += ["[PAD]"] * padding_length
        labels += [-100] * padding_length

        # Chuyển đổi thành IDs
        input_ids = self.tokenizer.convert_tokens_to_ids(tokenized_inputs)
        attention_mask = [1] * len(input_ids)

        # Chuyển đổi thành tensors
        input_ids = torch.tensor(input_ids)
        attention_mask = torch.tensor(attention_mask)
        labels = torch.tensor(labels)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [5]:
def train(model, train_dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(train_dataloader)

In [6]:
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=2)
            predictions.extend(preds[labels != -100].cpu().numpy())
            true_labels.extend(labels[labels != -100].cpu().numpy())
    
    # Chuyển đổi số nguyên thành nhãn
    id_to_label = {v: k for k, v in label_map.items()}
    true_labels = [id_to_label[l] for l in true_labels]
    predictions = [id_to_label[p] for p in predictions]
    
    # Kiểm tra xem có nhãn nào không
    if len(set(true_labels + predictions)) == 0:
        print("Warning: No labels found in the evaluation set.")
        return 0.0, "No labels found in the evaluation set."
    
    f1 = calculate_f1(predictions, true_labels)
    
    # Tạo danh sách nhãn duy nhất từ cả true_labels và predictions
    unique_labels = sorted(set(true_labels + predictions))
    
    # Kiểm tra xem có nhãn nào không trước khi tạo classification report
    if len(unique_labels) > 0:
        report = classification_report(true_labels, predictions, labels=unique_labels, zero_division=0)
    else:
        report = "No labels found in the evaluation set."
    
    return f1, report

In [7]:
# Thiết lập các tham số
model_name = "vinai/phobert-base"
max_len = 128
batch_size = 16
epochs = 10
learning_rate = 2e-5

In [8]:
# Đọc dữ liệu
train_sentences, train_labels = read_data('/kaggle/input/dataset-ner-4/train.csv')
test_sentences, test_labels = read_data('/kaggle/input/dataset-ner-4/test.csv')
dev_sentences, dev_labels = read_data('/kaggle/input/dataset-ner-4/dev.csv')

In [9]:
# Xây dựng label_map từ dữ liệu
unique_labels = set()
for labels in train_labels + test_labels:
    unique_labels.update(labels)
label_map = {label: i for i, label in enumerate(sorted(unique_labels))}

num_labels = len(label_map)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:

# Chuẩn bị tokenizer và mô hình
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [11]:
# Chuẩn bị dataset và dataloader
train_dataset = NERDataset(train_sentences, train_labels, tokenizer, max_len)
test_dataset = NERDataset(test_sentences, test_labels, tokenizer, max_len)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
dev_dataset = NERDataset(dev_sentences, dev_labels, tokenizer, max_len)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

In [12]:
# Thiết lập device và optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)



In [13]:
best_f1 = 0
for epoch in range(epochs):
    train_loss = train(model, train_dataloader, optimizer, device)
    val_f1, val_report = evaluate(model, dev_dataloader, device)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation F1: {val_f1:.4f}")
    
    # Lưu mô hình tốt nhất dựa trên F1-score của validation
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"New best model saved with F1-score: {best_f1:.4f}")
    
    print("Validation Report:")
    print(val_report)

Epoch 1/10, Train Loss: 0.3512, Validation F1: 0.9578
New best model saved with F1-score: 0.9578
Validation Report:
              precision    recall  f1-score   support

       B-LOC       0.47      0.76      0.58       198
      B-MISC       0.00      0.00      0.00         1
       B-ORG       0.00      0.00      0.00        21
       B-PER       0.77      0.65      0.71       258
       I-LOC       0.71      0.05      0.09       109
      I-MISC       0.00      0.00      0.00         1
       I-ORG       0.00      0.00      0.00        24
       I-PER       0.65      0.77      0.71       128
           O       0.99      0.99      0.99      9078

    accuracy                           0.96      9818
   macro avg       0.40      0.36      0.34      9818
weighted avg       0.96      0.96      0.96      9818

Epoch 2/10, Train Loss: 0.1021, Validation F1: 0.9864
New best model saved with F1-score: 0.9864
Validation Report:
              precision    recall  f1-score   support

       B

In [14]:
# Đánh giá trên tập test với mô hình tốt nhất
model.load_state_dict(torch.load('best_model.pth'))
test_f1, test_report = evaluate(model, test_dataloader, device)
print(f"Test F1-score: {test_f1:.4f}")
print("Test Results:")
print(test_report)

Test F1-score: 0.9928
Test Results:
              precision    recall  f1-score   support

       B-LOC       0.94      0.91      0.92       179
      B-MISC       0.73      1.00      0.85        11
       B-ORG       0.61      0.92      0.73        12
       B-PER       0.97      0.97      0.97       188
       I-LOC       0.85      0.85      0.85        61
      I-MISC       0.55      1.00      0.71        11
       I-ORG       0.89      0.85      0.87        40
       I-PER       0.99      0.99      0.99       154
           O       1.00      1.00      1.00      7445

    accuracy                           0.99      8101
   macro avg       0.84      0.94      0.88      8101
weighted avg       0.99      0.99      0.99      8101

