In [1]:
!pip install -qq datasets

In [9]:
# !gdown 1DuwYRftQjQmQcAR4FVPH-HvGuxGi4ist
# !gdown 11xZZfla8CDH54-EeUUdnAAoT2ummuEJh
# !gdown 1wVyhQkhAzwod2at7Ir3tRHbdCMOszzwg

Downloading...
From: https://drive.google.com/uc?id=1DuwYRftQjQmQcAR4FVPH-HvGuxGi4ist
To: /content/train_word.conll
100% 1.42M/1.42M [00:00<00:00, 31.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=11xZZfla8CDH54-EeUUdnAAoT2ummuEJh
To: /content/test_word.conll
100% 958k/958k [00:00<00:00, 21.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1wVyhQkhAzwod2at7Ir3tRHbdCMOszzwg
To: /content/dev_word.conll
100% 628k/628k [00:00<00:00, 32.7MB/s]


In [123]:
def read_conll(file_path):
    """Read a whitespace-separated CoNLL file into parallel sentence/label strings.

    Every non-empty line is split on whitespace: the first field is the token
    and the second field, when present, is its label. A line that carries a
    token but no label is given the label 'O'. Blank lines end a sentence.

    Args:
        file_path: Path of the CoNLL file. It is decoded as UTF-8 (a leading
            byte-order mark, if any, is dropped) regardless of the platform's
            default encoding.

    Returns:
        A tuple ``(sentences, sentence_labels)``. Both are lists with one
        entry per sentence; each entry is a single string whose items are
        joined by one space, and entry ``i`` of both lists has the same
        number of items.

    Side effects:
        Prints the set of distinct labels that were seen.
    """
    sentences = []
    sentence_labels = []
    unique_labels = set()  # Every label assigned to at least one token

    current_sentence_tokens = []
    current_sentence_labels = []

    # 'utf-8-sig' reads plain UTF-8 too; it only differs by swallowing a BOM,
    # which would otherwise be glued to the first token of the file.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()  # Remove leading/trailing whitespace, including '\n'

            if not line:
                # Blank line: close the sentence collected so far (if any).
                if current_sentence_tokens:
                    sentences.append(' '.join(current_sentence_tokens))
                    sentence_labels.append(' '.join(current_sentence_labels))
                current_sentence_tokens = []
                current_sentence_labels = []
                continue

            line_parts = line.split()
            current_sentence_tokens.append(line_parts[0])

            # Tokens without a label column default to 'O'; the default is
            # recorded as well so the printed label set matches the output.
            label = line_parts[1] if len(line_parts) >= 2 else 'O'
            current_sentence_labels.append(label)
            unique_labels.add(label)

    # Keep the final sentence when the file does not end with a blank line.
    if current_sentence_tokens:
        sentences.append(' '.join(current_sentence_tokens))
        sentence_labels.append(' '.join(current_sentence_labels))

    print(f"Unique labels found: {unique_labels}")
    return sentences, sentence_labels

# Load the datasets
# test_sentences, test_labels = read_conll('./test_word.conll')
# dev_sentences, dev_labels = read_conll('./dev_word.conll')
# NOTE(review): no visible cell downloads or creates this file (the gdown
# commands above are commented out and their logged targets are
# train/test/dev_word.conll) - it presumably has to be uploaded to /content
# by hand before this cell runs; confirm.
train_sentences, train_labels = read_conll('/content/da_lat_ner_1000_records_precise.conll')

# train_sentences and train_labels are parallel lists holding one space-joined
# string per sentence. The dev/test splits are not loaded: their read_conll
# calls above are commented out.


Unique labels found: {'I-RENTALSERVICES', 'B-RESTAURANTS', 'I-HOTELS', 'O', 'B-HOTELS', 'B-DRINKPLACES', 'I-DRINKPLACES', 'B-RENTALSERVICES', 'B-STREETFOODRESTAURANT', 'I-RESTAURANTS', 'I-ATTRACTIONS', 'I-STREETFOODRESTAURANT', 'B-ATTRACTIONS', 'I-TOUR', 'B-TOUR'}


In [124]:
train_sentences[1]

'Hôm nay đi Tiệm Trà Cúc , cảnh rất đẹp và yên bình .'

In [125]:
train_labels[1]

'O O O B-DRINKPLACES I-DRINKPLACES I-DRINKPLACES O O O O O O O O'

In [126]:
from datasets import Dataset

# Step 1: Prepare the datasets from sentences and labels
def prepare_dataset(sentences, labels):
    """Bundle sentences and their labels into a two-column dict.

    The inputs are stored as-is (no copy) under the keys 'tokens' and
    'labels'.
    """
    columns = dict(tokens=sentences, labels=labels)
    return columns

# Wrap the training split as {'tokens': [...], 'labels': [...]}, still holding
# one space-joined string per sentence. Dev/test stay disabled because those
# splits are not loaded in this notebook.
train_dataset = prepare_dataset(train_sentences, train_labels)
# dev_dataset = prepare_dataset(dev_sentences, dev_labels)
# test_dataset = prepare_dataset(test_sentences, test_labels)

# Step 2: Convert strings of tokens and labels into arrays
def process_string_to_array(dataset):
    """Split every sentence string and label string on whitespace.

    Takes a dict with 'tokens' and 'labels' columns of strings and returns a
    new dict with the same keys whose entries are lists of strings.
    """
    token_rows = []
    for sentence in dataset['tokens']:
        token_rows.append(sentence.split())

    label_rows = []
    for label_seq in dataset['labels']:
        label_rows.append(label_seq.split())

    return {'tokens': token_rows, 'labels': label_rows}

# Step 3: Process the dataset for token and label lists
# `train_dataset` is rebound at every step of this cell: dict of strings ->
# dict of lists -> Hugging Face Dataset. The statements therefore only work in
# this order.
train_dataset = process_string_to_array(train_dataset)
# dev_dataset = process_string_to_array(dev_dataset)
# test_dataset = process_string_to_array(test_dataset)

# Step 4: Convert processed datasets into Hugging Face Dataset objects
# (`Dataset` is the class imported from `datasets` at the top of this cell.)
train_dataset = Dataset.from_dict(train_dataset)
# dev_dataset = Dataset.from_dict(dev_dataset)
# test_dataset = Dataset.from_dict(test_dataset)

# Print the size of each dataset and a sample for verification
print(f"Train dataset size: {len(train_dataset)}")
# print(f"Dev dataset size: {len(dev_dataset)}")
# print(f"Test dataset size: {len(test_dataset)}")
print("Train dataset sample:", train_dataset[0])
# print("Dev dataset sample:", dev_dataset[0])
# print("Test dataset sample:", test_dataset[0])

# Step 5: Define an Example class
class Example:
    """One sentence: its words, the slot label of each word, and an optional id."""

    def __init__(self, words, slot_labels, guid=None):
        # Attributes are stored exactly as passed in; nothing is copied.
        self.words, self.slot_labels, self.guid = words, slot_labels, guid

# Step 6: Convert the dataset to Example objects
def convert_to_examples(dataset):
    """Build one Example per row of ``dataset``.

    Rows are read pairwise from the 'tokens' and 'labels' columns; the row
    position (starting at 0) becomes the Example's guid.
    """
    examples = []
    rows = zip(dataset['tokens'], dataset['labels'])
    for guid, (words, slot_labels) in enumerate(rows):
        examples.append(Example(words=words, slot_labels=slot_labels, guid=guid))
    return examples

# Convert datasets into Example objects
# At this point `train_dataset` is the Hugging Face Dataset built above, so
# each Example receives a list of words and a list of label strings.
train_examples = convert_to_examples(train_dataset)
# dev_examples = convert_to_examples(dev_dataset)
# test_examples = convert_to_examples(test_dataset)


Train dataset size: 1000
Train dataset sample: {'tokens': ['Checkin', 'ở', 'Nhà', 'hàng', 'Song', 'May', ',', 'sống', 'ảo', 'cả', 'buổi', 'không', 'chán', '.'], 'labels': ['O', 'O', 'B-RESTAURANTS', 'I-RESTAURANTS', 'I-RESTAURANTS', 'I-RESTAURANTS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [127]:
import logging
logger = logging.getLogger(__name__)

import copy
import json
import logging
import os

In [128]:
class InputFeatures(object):
    """Model inputs for one encoded example.

    Holds the four sequences produced for a sentence: input ids, attention
    mask, token type ids and slot label ids.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, slot_labels_ids):
        (
            self.input_ids,
            self.attention_mask,
            self.token_type_ids,
            self.slot_labels_ids,
        ) = (input_ids, attention_mask, token_type_ids, slot_labels_ids)

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dict."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

    def __repr__(self):
        # The repr is simply the JSON form.
        return str(self.to_json_string())

In [129]:
def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    pad_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    sequence_segment_id=0,
    mask_padding_with_zero=True,
    label_to_id=None,
):
    """Encode word/label examples into fixed-length InputFeatures.

    Each word is tokenized on its own. The word's label id is attached to its
    first sub-token only; the remaining sub-tokens, the CLS/SEP tokens and all
    padding positions receive ``pad_label_id``. Sequences longer than
    ``max_seq_len - 2`` sub-tokens are truncated, and every output sequence is
    padded to exactly ``max_seq_len``.

    Args:
        examples: Iterable of objects exposing ``words`` and ``slot_labels``
            (parallel sequences of strings).
        max_seq_len: Length of every produced sequence, special tokens
            included. Must be at least 2.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            the ``cls_token``, ``sep_token``, ``unk_token`` and
            ``pad_token_id`` attributes.
        pad_label_id: Label id written at positions that carry no word label.
        cls_token_segment_id: Token type id of the CLS token.
        pad_token_segment_id: Token type id of padding positions.
        sequence_segment_id: Token type id of the sentence tokens and SEP.
        mask_padding_with_zero: If True, real tokens get attention mask 1 and
            padding gets 0; if False the two values are swapped.
        label_to_id: Optional mapping from label string to integer id. When
            omitted, the notebook-level ``label_map`` is used, which keeps
            existing calls working.

    Returns:
        A list with one InputFeatures per example.

    Raises:
        ValueError: If ``max_seq_len`` leaves no room for the special tokens.
        KeyError: If an example uses a label missing from the mapping.
    """
    special_tokens_count = 2  # CLS + SEP
    if max_seq_len < special_tokens_count:
        raise ValueError(
            f"max_seq_len must be at least {special_tokens_count}, got {max_seq_len}"
        )

    # Resolved at call time so the mapping can be defined in a later cell.
    if label_to_id is None:
        label_to_id = label_map

    # Get special tokens from the tokenizer
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    log_every = 400  # progress is logged once per this many examples
    max_content_len = max_seq_len - special_tokens_count

    features = []

    for example_index, example in enumerate(examples):
        if example_index % log_every == 0:
            logger.info(f"Processing example {example_index} of {len(examples)}")

        # Tokenize each word and align its label with the first sub-token.
        tokens = []
        label_ids = []

        for word, label in zip(example.words, example.slot_labels):
            word_tokens = tokenizer.tokenize(word)

            # A word that yields no sub-tokens is represented by the unknown token.
            if not word_tokens:
                word_tokens = [unk_token]

            tokens.extend(word_tokens)

            try:
                label_id = label_to_id[label]
            except KeyError:
                raise KeyError(
                    f"Label {label!r} (example {example_index}) is missing from the label mapping"
                ) from None
            label_ids.extend([label_id] + [pad_label_id] * (len(word_tokens) - 1))

        # Leave room for the CLS and SEP tokens.
        if len(tokens) > max_content_len:
            tokens = tokens[:max_content_len]
            label_ids = label_ids[:max_content_len]

        # SEP closes the sentence, CLS opens it; neither carries a label.
        tokens = [cls_token] + tokens + [sep_token]
        label_ids = [pad_label_id] + label_ids + [pad_label_id]
        token_type_ids = [cls_token_segment_id] + [sequence_segment_id] * (len(tokens) - 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Attention mask: one value for real tokens, the opposite for padding.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Pad every sequence up to max_seq_len.
        padding_length = max_seq_len - len(input_ids)
        input_ids += [pad_token_id] * padding_length
        attention_mask += [0 if mask_padding_with_zero else 1] * padding_length
        token_type_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_label_id] * padding_length

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                slot_labels_ids=label_ids,
            )
        )

    return features


In [130]:
# All BIO tags used by the dataset. The position of a tag in this list is its
# integer class id, so the order must not change once a model has been trained.
label_list = [
    'I-RENTALSERVICES',
    'I-STREETFOODRESTAURANT',
    'I-TOUR',
    'B-HOTELS',
    'I-RESTAURANTS',
    'B-STREETFOODRESTAURANT',
    'I-DRINKPLACES',
    'I-HOTELS',
    'B-RESTAURANTS',
    'B-DRINKPLACES',
    'I-ATTRACTIONS',
    'B-ATTRACTIONS',
    'O',
    'B-RENTALSERVICES',
    'B-TOUR',
]

# Label string -> integer id (the label's index in label_list).
label_map = dict(zip(label_list, range(len(label_list))))


In [131]:
import json

In [132]:
class InputFeatures(object):
    """Model inputs for one encoded example.

    Holds the four sequences produced for a sentence: input ids, attention
    mask, token type ids and slot label ids.

    NOTE(review): this cell re-declares a class that an earlier cell of the
    notebook already defines with the same body; one of the two is redundant.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, slot_labels_ids):
        (
            self.input_ids,
            self.attention_mask,
            self.token_type_ids,
            self.slot_labels_ids,
        ) = (input_ids, attention_mask, token_type_ids, slot_labels_ids)

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dict."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

    def __repr__(self):
        # The repr is simply the JSON form.
        return str(self.to_json_string())

In [133]:
from transformers import AutoTokenizer

# Initialize PhoBERT tokenizer
# The tokenizer comes from the same checkpoint that is fine-tuned further down.
# NOTE(review): PhoBERT-style models are usually fed word-segmented Vietnamese
# text, while the training sample shown above is split into syllables
# ('Nhà', 'hàng', ...) - confirm which form this checkpoint expects.
tokenizer = AutoTokenizer.from_pretrained("quocanh944/phoBERT-ner")

# Set the maximum sequence length
max_seq_len = 128  # Total length per example, CLS/SEP included; longer sentences are truncated

# Convert examples to features
# Relies on `label_map` from the label-list cell having been defined already.
train_features = convert_examples_to_features(train_examples, max_seq_len, tokenizer)
# dev_features = convert_examples_to_features(dev_examples, max_seq_len, tokenizer)
# test_features = convert_examples_to_features(test_examples, max_seq_len, tokenizer)

In [134]:
tokenizer.cls_token, tokenizer.sep_token, tokenizer.unk_token, tokenizer.pad_token_id

('<s>', '</s>', '<unk>', 1)

In [135]:
import torch
from torch.utils.data import Dataset

# Define a Dataset class to wrap the tokenized features for training
class NERDataset(Dataset):
    """Map-style dataset exposing pre-computed features as long tensors.

    NOTE: the base class is ``torch.utils.data.Dataset``; the import at the top
    of this cell shadows the ``datasets.Dataset`` imported earlier in the
    notebook.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        item = self.features[idx]
        # Output key -> raw sequence; the label ids are exposed as 'labels'.
        raw_fields = {
            'input_ids': item.input_ids,
            'attention_mask': item.attention_mask,
            'token_type_ids': item.token_type_ids,
            'labels': item.slot_labels_ids,
        }
        return {
            name: torch.tensor(values, dtype=torch.long)
            for name, values in raw_fields.items()
        }

# Convert tokenized features into PyTorch datasets
# NOTE: this rebinds `train_dataset`, which earlier in the notebook held the
# Hugging Face Dataset of raw tokens; after this cell that object is no longer
# reachable under this name.
train_dataset = NERDataset(train_features)
# dev_dataset = NERDataset(dev_features)
# test_dataset = NERDataset(test_features)


In [136]:
train_dataset[0]

{'input_ids': tensor([   0, 1735, 4675, 8821,   25, 1706,  119, 1842, 7646,    4,  235, 2156,
           94,  391,   17, 5015,    5,    2,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0

In [137]:
from transformers import AutoModelForTokenClassification

# Make sure label_list is defined before running this cell.
num_labels = len(label_list)  # one output class per BIO tag, e.g. O, B-HOTELS, I-HOTELS, ...

# Load PhoBERT for token classification
# model = AutoModelForTokenClassification.from_pretrained(
#     "vinai/phobert-base",
#     num_labels=num_labels
# )


# The label names are stored in the model config so that saved checkpoints and
# downstream pipelines report real tag names instead of generic LABEL_<n> ids.
# Both mappings follow the order of label_list, i.e. the ids used for training.
model = AutoModelForTokenClassification.from_pretrained(
    "quocanh944/phoBERT-ner",
    num_labels=num_labels,
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)},
    ignore_mismatched_sizes=True,  # the checkpoint's classifier head has a different size and is re-initialized
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at quocanh944/phoBERT-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([15]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([15, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [138]:
# Freeze the encoder so that only the classification head receives gradients.
# NOTE(review): the sample prediction at the end of this notebook labels every
# token 'O'. Training only the freshly initialized head for 2 epochs without an
# explicit learning rate may be too little - consider a higher learning rate,
# more epochs, or unfreezing the encoder; confirm on a dev set.
for encoder_param in model.base_model.parameters():
    encoder_param.requires_grad_(False)

In [139]:
import transformers
print(transformers.__version__)

4.52.4


In [140]:
from transformers import TrainingArguments, Trainer

In [153]:
# Training configuration. No evaluation settings and no learning_rate are
# given, so the library defaults apply for those.
training_args = TrainingArguments(
    output_dir='./results',                     # Where checkpoints are saved
    per_device_train_batch_size=16,             # Training batch size
    per_device_eval_batch_size=16,              # Evaluation batch size
    num_train_epochs=2,                         # Number of training epochs
    weight_decay=0.01,                          # Weight decay
    logging_dir='./logs',                       # Log directory
    logging_strategy="steps",                   # Logging strategy
    logging_steps=10,                           # Log every 10 steps
    save_strategy="steps",                      # Checkpoint saving strategy
    save_steps=500,                             # Save every 500 steps
    save_total_limit=2,                         # Keep at most 2 checkpoints
    report_to="none"                            # Do not send logs to wandb
)


In [None]:
# from transformers import TrainingArguments

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir='./results',           # output directory to save model checkpoints and results
#     evaluation_strategy="epoch",      # evaluation is done at the end of every epoch
#     per_device_train_batch_size=16,   # batch size per device during training
#     per_device_eval_batch_size=16,    # batch size for evaluation
#     num_train_epochs=3,               # number of epochs to train the model
#     weight_decay=0.01,                # strength of weight decay
#     logging_dir='./logs',             # directory for storing logs
#     logging_steps=10,                 # log every 10 steps
#     save_steps=500,                   # save model checkpoint every 500 steps
#     save_total_limit=2,               # limit the number of total checkpoints to save
# )


In [21]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=9fd50e7f435f918afd8916b64116215ccc83d436962ecbb646fe75ab91a256b0
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [71]:
label_list

['I-RENTALSERVICES',
 'I-STREETFOODRESTAURANT',
 'I-TOUR',
 'B-HOTELS',
 'I-RESTAURANTS',
 'B-STREETFOODRESTAURANT',
 'I-DRINKPLACES',
 'I-HOTELS',
 'B-RESTAURANTS',
 'B-DRINKPLACES',
 'I-ATTRACTIONS',
 'B-ATTRACTIONS',
 'O',
 'B-RENTALSERVICES',
 'B-TOUR']

In [142]:
# Integer class id -> label string (inverse of label_map).
id2label = dict(enumerate(label_list))

In [143]:
# !pip uninstall transformers
# !pip install transformers==4.53.0


In [144]:
from transformers import Trainer
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import EvalPrediction

In [145]:
from transformers import EvalPrediction
def compute_metrics(p: EvalPrediction):
    """Convert raw predictions to label strings and score them with seqeval.

    Positions whose gold id is -100 are skipped. A position whose predicted or
    gold id is not below ``len(label_list)`` is reported on stdout and skipped
    as well. Several debugging prints (shapes, first rows, the classification
    report) are emitted along the way.

    Args:
        p: EvalPrediction; ``p.predictions`` is arg-maxed over axis 2 and
            ``p.label_ids`` supplies the gold ids.

    Returns:
        Dict with the keys "precision", "recall" and "f1".

    NOTE(review): seqeval scores whole entities (spans) rather than individual
    tokens, although the original comment called these token-level - confirm.
    """
    predicted_ids = p.predictions.argmax(axis=2)  # most likely class per position
    gold_ids = p.label_ids

    # Debugging: shapes and the first two rows of each array
    print(f"Shape of predictions: {predicted_ids.shape}")
    print(f"Shape of labels: {gold_ids.shape}")
    print(f"First few predictions: {predicted_ids[:2]}")
    print(f"First few labels: {gold_ids[:2]}")

    pred_labels = []
    true_labels = []

    for seq_index, (pred_row, gold_row) in enumerate(zip(predicted_ids, gold_ids)):
        kept_pred = []
        kept_true = []

        for pred_idx, true_idx in zip(pred_row, gold_row):
            # -100 marks positions that carry no label (special tokens,
            # continuation sub-tokens, padding).
            if true_idx == -100:
                continue

            # Report and skip ids that do not index into label_list.
            if pred_idx >= len(label_list) or true_idx >= len(label_list):
                print(f"Index out of range: pred_idx={pred_idx}, true_idx={true_idx} at position {seq_index}")
                continue

            kept_pred.append(label_list[pred_idx])
            kept_true.append(label_list[true_idx])

        pred_labels.append(kept_pred)
        true_labels.append(kept_true)

    # Debugging: the first two converted sequences
    print(f"Processed pred_labels: {pred_labels[:2]}")
    print(f"Processed true_labels: {true_labels[:2]}")

    scores = {
        # Precision example: out of 10 predicted PER labels, 6 are right -> 6/10 = 60%
        "precision": precision_score(true_labels, pred_labels),
        # Recall example: out of 8 true PER labels, 6 are found -> 6/8 = 75%
        "recall": recall_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels),
    }

    # Debugging: full per-class report
    print("Classification Report:")
    print(classification_report(true_labels, pred_labels))

    return scores

In [146]:
from transformers import Trainer
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import EvalPrediction


In [154]:
# Build the Trainer. No eval_dataset is passed because the dev split is not
# loaded in this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=dev_dataset,
    # `tokenizer=` is deprecated in the installed transformers release; the
    # tokenizer is passed through `processing_class` instead.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


  trainer = Trainer(


Step,Training Loss
10,2.1645
20,2.0716
30,1.9941
40,1.9229
50,1.8502
60,1.8169
70,1.7625
80,1.7257
90,1.6979
100,1.681


TrainOutput(global_step=126, training_loss=1.8241927888658311, metrics={'train_runtime': 1022.5711, 'train_samples_per_second': 1.956, 'train_steps_per_second': 0.123, 'total_flos': 130663733760000.0, 'train_loss': 1.8241927888658311, 'epoch': 2.0})

In [155]:
model = trainer.model

In [156]:
import torch

def predict_ner(text, model, tokenizer, id2label):
    """Print the predicted label of every sub-token of ``text``.

    The raw text is tokenized in one go, the model is run in eval mode without
    gradients, and the arg-max class of each position is mapped through
    ``id2label``. The raw token list is printed first, followed by one
    ``token<TAB>label`` line per position (special tokens included).

    Args:
        text: Input sentence as a single string.
        model: Token-classification model; called with the tokenizer output
            and expected to return an object with a ``logits`` attribute.
        tokenizer: Tokenizer matching the model.
        id2label: Mapping from integer class id to label string.

    Returns:
        None. All results are written to stdout.

    NOTE(review): labels are shown for continuation sub-tokens and special
    tokens as well; if training supervised only the first sub-token of each
    word, those labels are not meaningful - confirm before relying on them.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        is_split_into_words=False
    )

    # Run on whatever device the model lives on; otherwise a model that was
    # moved to GPU during training fails on CPU inputs.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {
            name: value.to(first_param.device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)

    # Decode tokens and labels
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    print(tokens)
    labels = [id2label[label_id.item()] for label_id in predictions[0]]

    # Clean display: drop sub-word markers. This tokenizer marks a continued
    # word with a trailing "@@"; the SentencePiece-style "▁" prefix is still
    # removed for tokenizers that use it.
    for token, label in zip(tokens, labels):
        token_clean = token[:-2] if token.endswith("@@") else token
        token_clean = token_clean.replace("▁", "")
        print(f"{token_clean}\t{label}")

In [159]:
text = "tôi ăn ở Quán Bánh Căn, đi chơi ở Hồ Xuân Hương"
predict_ner(text, model, tokenizer, id2label)

['<s>', 'tôi', 'ăn', 'ở', 'Quán', 'Bánh', 'C@@', 'ă@@', 'n@@', ',', 'đi', 'chơi', 'ở', 'Hồ', 'Xuân', 'Hương', '</s>']
<s>	O
tôi	O
ăn	O
ở	O
Quán	O
Bánh	O
C@@	O
ă@@	O
n@@	O
,	O
đi	O
chơi	O
ở	O
Hồ	O
Xuân	O
Hương	O
</s>	O
