In [None]:
# Step 0: Mount Google Drive
# Makes the Drive contents reachable under /content/drive; the corpus paths used
# later in this notebook point into that mount.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load each file
def read_txt(filepath):
    """Return the lines of a UTF-8 text file as a list of stripped strings.

    Every line of the file yields exactly one list entry (blank lines become
    empty strings), with leading/trailing whitespace and the newline removed.
    """
    with open(filepath, encoding='utf-8') as handle:
        return list(map(str.strip, handle))

# Root folder of the corpus on the mounted Drive. Each split (train/dev/test)
# has its own sub-folder holding sents.txt (sentences) and topics.txt (labels).
DATA_DIR = "/content/drive/MyDrive/Hugging Face NLP Course/Classify Vietnamese Students’ Feedback Corpus"

# Paths to the txt files of every split
train_sents_path = f"{DATA_DIR}/train/sents.txt"
train_labels_path = f"{DATA_DIR}/train/topics.txt"

dev_sents_path = f"{DATA_DIR}/dev/sents.txt"
dev_labels_path = f"{DATA_DIR}/dev/topics.txt"

test_sents_path = f"{DATA_DIR}/test/sents.txt"
test_labels_path = f"{DATA_DIR}/test/topics.txt"

# Read the data
train_texts = read_txt(train_sents_path)
train_labels = read_txt(train_labels_path)

dev_texts = read_txt(dev_sents_path)
dev_labels = read_txt(dev_labels_path)

test_texts = read_txt(test_sents_path)
test_labels = read_txt(test_labels_path)

# The sentence and label lists are paired position-by-position later on, so a
# length mismatch means the files are misaligned; fail here with a clear message.
for _split, _texts, _labels in (
    ("train", train_texts, train_labels),
    ("dev", dev_texts, dev_labels),
    ("test", test_texts, test_labels),
):
    if len(_texts) != len(_labels):
        raise ValueError(
            f"{_split}: {len(_texts)} sentences but {len(_labels)} labels; "
            "sents.txt and topics.txt must have one line per example"
        )

In [None]:
from datasets import Dataset

# Build one Dataset per split from the parallel lists; labels are parsed to int
train_dataset = Dataset.from_dict({"text": train_texts, "label": list(map(int, train_labels))})
dev_dataset = Dataset.from_dict({"text": dev_texts, "label": list(map(int, dev_labels))})
test_dataset = Dataset.from_dict({"text": test_texts, "label": list(map(int, test_labels))})

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the "vinai/phobert-base" checkpoint;
# use_fast=False requests the slow (Python) tokenizer implementation.
# NOTE(review): the PhoBERT model card recommends word-segmented input text —
# confirm whether the corpus sentences are already segmented.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

# Tokenization function
def tokenize_fn(example):
    """Encode the "text" field with the module-level tokenizer.

    Sequences are truncated to, and padded up to, 256 tokens, so every encoded
    example has the same length.
    """
    encoding_options = {"truncation": True, "padding": 'max_length', "max_length": 256}
    return tokenizer(example["text"], **encoding_options)

# Tokenize every split (batched mode), in train -> dev -> test order
train_dataset, dev_dataset, test_dataset = [
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

In [None]:
# Specify the data format for the Trainer (PyTorch tensors), limited to the
# three columns listed below
torch_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, dev_dataset, test_dataset):
    split.set_format(type="torch", columns=list(torch_columns))


In [None]:
# Show the container type, then the first example — compare it with what the
# dataset returned before set_format was applied above
print(type(train_dataset))
first_example = train_dataset[0]
print(first_example)

<class 'datasets.arrow_dataset.Dataset'>
{'label': tensor(1), 'input_ids': tensor([    0, 48090,  4368,  1893,   545,   312,     5,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,    

In [None]:
#Tạo một hàm để tính toán metrics
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_preds):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        eval_preds: a pair ``(logits, labels)``; the predicted class of each
            row is the argmax of its logits along the last axis.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    logits, labels = eval_preds
    # Highest-scoring class per example
    predicted_ids = np.argmax(logits, axis=-1)

    return {
        'accuracy': accuracy_score(labels, predicted_ids),
        # 'weighted' averaging; 'macro' is the alternative mentioned by the author
        'f1': f1_score(labels, predicted_ids, average='weighted'),
    }

In [None]:
import torch
# Double-check whether a GPU is being used
has_cuda = torch.cuda.is_available()
print("CUDA:", has_cuda)
# Report the device name when CUDA is present, otherwise say we are on CPU
device_report = ("GPU:", torch.cuda.get_device_name(0)) if has_cuda else ("Đang dùng CPU",)
print(*device_report)


CUDA: True
GPU: Tesla T4


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Seed the RNGs *before* the model is built: the classification head is not in
# the checkpoint and is randomly initialised, so without a seed every run
# starts from different weights and the reported metrics are not reproducible.
from transformers import set_seed

SEED = 42
set_seed(SEED)

# Load the pretrained encoder with a fresh 4-way classification head
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=4)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    # After training, reload the checkpoint with the best dev-set score
    load_best_model_at_end=True,
    metric_for_best_model='f1',  # 'accuracy' or 'f1' (keys returned by compute_metrics)
    report_to="none",
    seed=SEED,  # same value as the library default, stated explicitly
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,  # metrics reported at every evaluation
)


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [None]:
# Train the model; the returned TrainOutput is displayed as the cell result
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3582,0.40097,0.86355,0.86677
2,0.2877,0.332427,0.891977,0.889959
3,0.1889,0.37222,0.891346,0.890436


TrainOutput(global_step=2145, training_loss=0.310000403166373, metrics={'train_runtime': 1592.3886, 'train_samples_per_second': 21.526, 'train_steps_per_second': 1.347, 'total_flos': 4509541355065344.0, 'train_loss': 0.310000403166373, 'epoch': 3.0})

In [None]:
# Put the model in evaluation mode (dropout layers become inactive) before the
# manual inference below. As the last expression of the cell, the returned
# module is also displayed.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Evaluate the model on the test data
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

{'eval_loss': 0.3689563274383545, 'eval_accuracy': 0.892293114339861, 'eval_f1': 0.8914759795823366, 'eval_runtime': 38.9868, 'eval_samples_per_second': 81.207, 'eval_steps_per_second': 2.539, 'epoch': 3.0}


In [None]:
# Predict the topic of a single sentence
text = "thầy giao nhiều bài tập quá."

# Use the GPU when one is available and make sure the model lives on that device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Preprocess the text, then move every encoded tensor to the model's device
encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

# The predicted class is the index of the largest logit
logits = outputs.logits
predicted_class = logits.argmax(dim=-1)

# Print the predicted label
print(predicted_class.item())  # 0, 1, 2,3


0


In [None]:
# Save the model to a Drive folder; the tokenizer files are written to the same
# directory. The tokenizer call is the cell's last expression, so its return
# value is displayed.
model_save_path = "/content/drive/MyDrive/Hugging Face NLP Course/Classify Vietnamese Students’ Feedback Corpus/phoBERT_topic_model_HFpipeline"

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('/content/drive/MyDrive/Hugging Face NLP Course/Classify Vietnamese Students’ Feedback Corpus/phoBERT_topic_model_HFpipeline/tokenizer_config.json',
 '/content/drive/MyDrive/Hugging Face NLP Course/Classify Vietnamese Students’ Feedback Corpus/phoBERT_topic_model_HFpipeline/special_tokens_map.json',
 '/content/drive/MyDrive/Hugging Face NLP Course/Classify Vietnamese Students’ Feedback Corpus/phoBERT_topic_model_HFpipeline/vocab.txt',
 '/content/drive/MyDrive/Hugging Face NLP Course/Classify Vietnamese Students’ Feedback Corpus/phoBERT_topic_model_HFpipeline/bpe.codes',
 '/content/drive/MyDrive/Hugging Face NLP Course/Classify Vietnamese Students’ Feedback Corpus/phoBERT_topic_model_HFpipeline/added_tokens.json')