In [None]:
# # Install the required libraries
# %pip install transformers
# %pip install torch
# %pip install scikit-learn
# %pip install numpy
# %pip install pandas

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   --------------- ------------------------ 4.2/11.1 MB 14.0 MB/s eta 0:00:01
   --------------------------------- ------ 9.4/11.1 MB 18.4 MB/s eta 0:00:01
   ---------------------------------------- 11.1/11.1 MB 17.9 MB/s eta 0:00:00
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl (41.2 MB)


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification
from transformers import get_scheduler
from torch.optim import AdamW
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import os

In [8]:
# Location of the training CSV that the following cell reads with pd.read_csv.
# NOTE(review): this is an absolute, machine-specific Windows path; it has to be
# edited before the notebook can run on another machine.
path = "H:/SentimentAnalystComment/BERT-base/NTC_SV/NTC_SV_train.csv"

In [9]:
# Load the CSV and drop every row that has a missing value in any column.
df = pd.read_csv(path).dropna()
# Two parallel Python lists: the review text and its class label, row for row.
texts = df['review'].tolist()
labels = df['label'].tolist()

In [10]:
# Hugging Face model id; used for the tokenizer here and for the classifier
# that is loaded in a later cell.
pretrained_model = 'bert-base-multilingual-cased'
# Tokenizer matching that checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
class ReviewDataset(Dataset):
    """Map-style dataset that pairs review texts with integer class labels.

    Items are tokenized lazily, one at a time, when they are indexed.

    Args:
        texts: Indexable collection of reviews; each entry is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels aligned with ``texts``; each
            entry is passed through ``int()``.
        tokenizer: Callable invoked as ``tokenizer(text, **kwargs)`` that
            returns a mapping containing ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of reviews (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the review at ``idx`` and return it with its label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'label'`` (a ``torch.long`` scalar tensor).
        """
        review = str(self.texts[idx])
        target = int(self.labels[idx])

        # Ask the tokenizer to truncate and to pad with the 'max_length'
        # strategy, returning PyTorch tensors.
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(review, **tokenizer_options)

        # squeeze(0) drops the leading dimension of each tensor (the tokenizer
        # is expected to return a batch of one) so that a DataLoader can stack
        # the samples itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

In [12]:
# Training hyper-parameters.
SEQ_LEN = 256     # passed to ReviewDataset as max_len (tokenizer max_length)
BATCH_SIZE = 16   # batch size of both the training and validation DataLoader
EPOCHS = 10       # number of passes over train_loader; also sizes the LR schedule
LR = 2e-5         # learning rate handed to AdamW

In [13]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split identical between runs.
txt_train, txt_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [14]:
# Wrap each split in a ReviewDataset; both share the tokenizer and SEQ_LEN.
train_dataset = ReviewDataset(txt_train, y_train, tokenizer, SEQ_LEN)
val_dataset = ReviewDataset(txt_val, y_val, tokenizer, SEQ_LEN)

In [15]:
# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [16]:
# One output class per distinct value found in `labels`.
# NOTE(review): this assumes the labels are the integers 0..K-1 — confirm
# against the CSV.
model = BertForSequenceClassification.from_pretrained(pretrained_model, num_labels=len(set(labels)))
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the chosen device. Being the cell's last expression, this
# also echoes the module's repr below the cell.
model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [17]:
# AdamW over all model parameters with the configured learning rate.
optimizer = AdamW(model.parameters(), lr=LR)
# train_model steps the scheduler once per batch, so the schedule has to span
# every batch of every epoch.
total_steps = len(train_loader) * EPOCHS
# Linear schedule with no warm-up steps.
scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# NOTE(review): loss_fn is not referenced by train_model or evaluate in the
# visible cells; training uses the loss returned by the model (outputs.loss).
loss_fn = torch.nn.CrossEntropyLoss()

In [22]:
# Directory that receives the per-epoch checkpoints and the final model.
SAVE_PATH = "./bertbase_checkpoints"
# exist_ok=True lets this cell be re-run without failing on an existing folder.
os.makedirs(SAVE_PATH, exist_ok=True)

In [23]:
def train_model():
    """Fine-tune the notebook's global ``model`` for ``EPOCHS`` epochs.

    Relies on notebook-level globals defined in earlier cells: ``model``,
    ``train_loader``, ``device``, ``optimizer``, ``scheduler``, ``EPOCHS`` and
    ``SAVE_PATH``.

    After every epoch a checkpoint holding the epoch number, the model,
    optimizer and scheduler state dicts and the mean batch loss is written to
    ``{SAVE_PATH}/checkpoint_epoch_{n}.pt``, and the mean loss is printed.

    Returns:
        None
    """
    model.train()
    for epoch in range(EPOCHS):
        loop = tqdm(train_loader, leave=True)
        # The description is constant for the whole epoch, so set it once
        # instead of on every batch.
        loop.set_description(f"Epoch {epoch+1}")
        total_loss = 0
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Clear gradients BEFORE the backward pass. If a previous call was
            # interrupted (e.g. KeyboardInterrupt) between backward() and the
            # trailing zero_grad(), stale gradients would otherwise be
            # accumulated into the first step of this run.
            optimizer.zero_grad()

            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            # Read the scalar once and reuse it for the running total and the
            # progress bar.
            batch_loss = loss.item()
            total_loss += batch_loss

            loss.backward()
            optimizer.step()
            # The schedule is defined per optimisation step, not per epoch.
            scheduler.step()

            loop.set_postfix(loss=batch_loss)

        avg_loss = total_loss / len(train_loader)

        # ✅ Save checkpoint at the end of each epoch
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': avg_loss,
        }, f"{SAVE_PATH}/checkpoint_epoch_{epoch+1}.pt")

        print(f"Epoch {epoch+1} average loss: {avg_loss:.4f}")


In [24]:
# Start fine-tuning. NOTE(review): the saved output below shows this run was
# stopped by a KeyboardInterrupt at 0/2038 batches, so no epoch completed here.
train_model()

  0%|          | 0/2038 [00:29<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# ✅ Save final model
# The model and the tokenizer are written to the same folder,
# {SAVE_PATH}/final_model.
model.save_pretrained(f"{SAVE_PATH}/final_model")
tokenizer.save_pretrained(f"{SAVE_PATH}/final_model")


In [19]:
# 7. Evaluation
@torch.no_grad()
def evaluate():
    """Print a classification report for the model on the validation loader.

    Relies on the notebook-level globals ``model``, ``val_loader`` and
    ``device``. Puts the model into eval mode, predicts the arg-max class for
    every validation batch and prints sklearn's ``classification_report`` of
    true versus predicted labels. Gradient tracking is disabled by the
    decorator.

    Returns:
        None
    """
    model.eval()
    all_preds, all_labels = [], []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # No labels are passed: only the logits are needed here.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        # The labels are only consumed on the host by the report, so they are
        # not copied to the device and back.
        all_labels.extend(batch['label'].cpu().numpy())

    print(classification_report(all_labels, all_preds))

In [None]:
# Print the classification report for the validation split.
evaluate()