## Download datasets and import lib

In [1]:
from datasets import load_dataset
ds = load_dataset("thainq107/abte-restaurants")

README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [2]:
train_dataset = ds['train']
test_dataset = ds['test']

print("Description features:", train_dataset.features)
print("Number of trainning samples:", train_dataset.num_rows)
print("First trainning sample:", train_dataset[0])

Description features: {'Tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'Polarities': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
Number of trainning samples: 3602
First trainning sample: {'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'], 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'], 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}


## Tokenization

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
def tokenize_and_align_labels(examples):
    sentences, sentence_tags = [], []
    labels = []

    for tokens, pols in zip(examples['Tokens'], examples['Polarities']):

        bert_tokens = [] # toàn bộ tokens của câu
        bert_att = [] # token muốn chú ý trong training (có polarity khác -1)
        pols_label = 0 # một câu chỉ có 1 polarity cần chú ý , nên khởi tạo bằng int

        for i in range(len(tokens)):
            t = tokenizer.tokenize(tokens[i]) # tách thành sub-tokens
            bert_tokens += t
            if int(pols[i]) != -1:
                bert_att += t
                pols_label = int(pols[i])

        sentences.append(" ".join(bert_tokens))
        sentence_tags.append(" ".join(bert_att))
        labels.append(pols_label)
    tokenized_inputs = tokenizer(
        sentences, sentence_tags, padding=True, truncation=True, return_tensors="pt")

    tokenized_inputs['labels'] = labels
    return tokenized_inputs


preprocessing_ds = ds.map(tokenize_and_align_labels, batched=True)
preprocessing_ds['train'][5]

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

{'Tokens': ['Not',
  'only',
  'was',
  'the',
  'food',
  'outstanding',
  ',',
  'but',
  'the',
  'little',
  '`',
  'perks',
  '""',
  'were',
  'great',
  '.'],
 'Tags': ['0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0'],
 'Polarities': ['-1',
  '-1',
  '-1',
  '-1',
  '2',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1',
  '-1'],
 'input_ids': [101,
  2025,
  2069,
  2001,
  1996,
  2833,
  5151,
  1010,
  2021,
  1996,
  2210,
  1036,
  2566,
  1001,
  1001,
  29535,
  1000,
  1000,
  2020,
  2307,
  1012,
  102,
  2833,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

## Evaluate

In [5]:
! pip install -q evaluate==0.4.3

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import evaluate
import torch
import numpy as np
# Giả sử predictions là đầu ra của mô hình
# predictions (sau khi qua softmax hoặc sigmoid) có dạng: (batch_size, num_classes)
predictions = torch.randn(5, 3)  # 5 mẫu, 3 lớp
labels = torch.tensor([0, 2, 1, 0, 2])  # Nhãn đúng cho 5 mẫu

# Chuyển predictions về dạng mảng numpy và tính toán chỉ số lớp có xác suất cao nhất
predictions = np.argmax(predictions.detach().numpy(), axis=1)
print(predictions)
print(labels)
# Tính độ chính xác
accuracy = evaluate.load("accuracy")
accuracy_score = accuracy.compute(
    predictions=predictions, references=labels.numpy())

print(f"Accuracy: {accuracy_score['accuracy']}")

[2 2 2 1 0]
tensor([0, 2, 1, 0, 2])


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Accuracy: 0.2


In [7]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


## Model

In [8]:
from transformers import AutoModelForSequenceClassification

id2label = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
label2id = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments, Trainer
import os
os.environ['WANDB_DISABLED'] = 'true'

training_args = TrainingArguments(
    output_dir="/kaggle/working/abte-restaurants-distilbert-base-uncased",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=None,
    save_total_limit=1,
    metric_for_best_model="eval_accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessing_ds["train"],
    eval_dataset=preprocessing_ds["test"],
    tokenizer=tokenizer, 
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.610823,0.756032
2,No log,0.537879,0.774799
3,No log,0.512587,0.811439
4,No log,0.531839,0.816801
5,0.495500,0.531097,0.818588
6,0.495500,0.635601,0.821269
7,0.495500,0.714545,0.815907
8,0.495500,0.747484,0.822163
9,0.142900,0.816834,0.82395
10,0.142900,0.812743,0.818588




TrainOutput(global_step=2260, training_loss=0.15932120686083767, metrics={'train_runtime': 700.8587, 'train_samples_per_second': 102.788, 'train_steps_per_second': 3.225, 'total_flos': 2758558582923840.0, 'train_loss': 0.15932120686083767, 'epoch': 20.0})

In [10]:
from transformers import pipeline

classifier = pipeline(
    model=model,
    task="text-classification",
    tokenizer=tokenizer,

)

test_sentence = 'The bread is top notch as well'
label = "bread"
results = classifier(f'{test_sentence} [SEP] {label}')
print(results)

Device set to use cuda:0


[{'label': 'Positive', 'score': 0.9996583461761475}]
