## Import lib and load datasets

In [1]:
! pip install -q datasets==3.2.0

In [2]:
from datasets import load_dataset
ds = load_dataset ("thainq107/abte-restaurants")

README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [3]:
train_dataset = ds['train']
test_dataset = ds['test']

print("Description features:", train_dataset.features)
print("Number of trainning samples:", train_dataset.num_rows)
print("First trainning sample:", train_dataset[0])

Description features: {'Tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Polarities': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
Number of trainning samples: 3602
First trainning sample: {'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'], 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'], 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}


## Tokenizer

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs_batch = []
    labels_batch = []
    for i, words in enumerate(examples['Tokens']):
        word_ids = tokenizer.convert_tokens_to_ids(words)
        tokenized_inputs_batch.append(word_ids)
        
    for i, sentence_tag in enumerate(examples['Tags']):
        tag_int = [int(tag) for tag in sentence_tag]
        labels_batch.append(tag_int)

    return {
            'input_ids': tokenized_inputs_batch,
            'labels': labels_batch
        }
 
preprocessing_ds = ds.map(tokenize_and_align_labels, batched=True)
preprocessing_ds['train'][5]

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

{'Tokens': ['Not',
  'only',
  'was',
  'the',
  'food',
  'outstanding',
  ',',
  'but',
  'the',
  'little',
  '`',
  'perks',
  '""',
  'were',
  'great',
  '.'],
 'Tags': ['0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0'],
 'Polarities': ['-1',
  '-1',
  '-1',
  '-1',
  '2',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1'],
 'input_ids': [100,
  2069,
  2001,
  1996,
  2833,
  5151,
  1010,
  2021,
  1996,
  2210,
  1036,
  100,
  100,
  2020,
  2307,
  1012],
 'labels': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]}

## Data Collator
- Tập hợp các ví dụ riêng lẻ thành batch.
- Thêm token đặc biệt như: [PAD], [UNK], [CLS], [SEP], [MASK].
- Chuyển về tensor
- (Tùy chọn) Tăng cường dữ liệu ngay trong quá trình tạo batch.

In [6]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

In [7]:
data_collator

DataCollatorForTokenClassification(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, l

## Evaluate

In [8]:
! pip install -q seqeval==1.2.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [9]:
# predictions: xác suất của từng class O, B-Term, I-Term
# labels : 0 hoặc 1
# Loop từng câu trong batch và xử lý
import numpy as np
from seqeval.metrics import accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [str(p) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [str(l) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = accuracy_score(true_predictions, true_labels)
    return {"accuracy": results}

## Model

In [10]:
# Đánh label giống bài toán NER
id2label = {
    0: "O",
    1: "B-Term",
    2: "I-Term"
}
label2id = {
    "O": 0 ,
    "B-Term ": 1 ,
    "I-Term ": 2
}

In [11]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import os
os.environ['WANDB_DISABLED'] = 'true'
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="/kaggle/working/abte-restaurants-distilbert-base-uncased",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=None, 
    save_total_limit=1,
    metric_for_best_model = "eval_accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessing_ds["train"],
    eval_dataset=preprocessing_ds["test"],
    tokenizer=tokenizer, # Corrected: changed 'processing_class' to 'tokenizer'
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.171038,0.932828
2,No log,0.169214,0.938708
3,No log,0.180905,0.942142
4,No log,0.213122,0.941107
5,0.114100,0.231221,0.940308
6,0.114100,0.225715,0.941154
7,0.114100,0.230868,0.941342
8,0.114100,0.244866,0.941719
9,0.020600,0.247952,0.94233
10,0.020600,0.253245,0.942754




TrainOutput(global_step=1695, training_loss=0.04350998774390657, metrics={'train_runtime': 275.7567, 'train_samples_per_second': 195.934, 'train_steps_per_second': 6.147, 'total_flos': 677233705553100.0, 'train_loss': 0.04350998774390657, 'epoch': 15.0})

In [13]:
from transformers import pipeline

token_classifier = pipeline(
    model= model, 
    task="ner",
    tokenizer = tokenizer,
    aggregation_strategy="simple"
)

test_sentence = 'The bread is top notch as well '
results = token_classifier(test_sentence)
print(results)

Device set to use cuda:0


[{'entity_group': 'Term', 'score': 0.8947163, 'word': 'bread', 'start': 4, 'end': 9}]
