In [None]:
# %pip install torch
# %pip install transformers
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn
# %pip install gradio
# %pip install safetensors

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import os
from safetensors.torch import save_file

In [None]:
# Choose the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print("Using device:", device)

In [None]:
# Seed PyTorch's global random number generator before any model or loader is
# created, so weight initialisation, dropout and shuffled batch order are
# repeatable from one run to the next.
SEED = 42
torch.manual_seed(SEED)

# Training hyperparameters, gathered in one place for easy tuning.
SEQ_LEN = 256     # tokens per review after truncation / padding
BATCH_SIZE = 16   # examples per batch
EPOCHS = 10       # passes over the training split
LR = 2e-5         # learning rate handed to the optimizer

In [None]:
# Load the training reviews; rows containing any missing value are dropped.
df = pd.read_csv("/kaggle/input/sentimentdataset/NTC_SV/NTC_SV_train.csv").dropna()
texts = df['review'].tolist()

# Keep the fitted encoder instead of discarding it, so the original-label ->
# integer-id mapping can be reused for other splits and inverted
# (label_encoder.classes_ / inverse_transform) when reporting results.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['label'])

In [None]:
# Tokenizer for the 'bert-base-multilingual-cased' checkpoint, the same
# checkpoint name the classifier loads for its BERT encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
class FoodyDataset(Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Every item is a dict holding ``input_ids`` and ``attention_mask`` (the
    tokenizer output with ``squeeze(0)`` applied) and ``label`` (the target as
    a ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # texts and labels are indexed with the same position in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize lazily: only the requested review is encoded, truncated and
        # padded to exactly max_len tokens.
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        # squeeze(0) strips the leading size-one axis from each encoded tensor
        # so the DataLoader can stack items into a batch.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Hold out 10% of the training reviews for validation; the fixed random_state
# keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
)

# Training batches are reshuffled; validation batches keep their order.
train_dataset = FoodyDataset(X_train, y_train, tokenizer, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = FoodyDataset(X_val, y_val, tokenizer, SEQ_LEN)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# Load the held-out test reviews; rows containing any missing value are dropped.
df_test = pd.read_csv("/kaggle/input/sentimentdataset/NTC_SV/NTC_SV_test.csv").dropna()
X_test = df_test['review'].tolist()

# Encode the test labels with an encoder fitted on the TRAINING labels (df).
# Fitting a fresh encoder on the test labels can assign different integer ids
# whenever the two splits do not contain exactly the same set of classes, which
# would silently corrupt the test metrics. A test label that never occurs in
# the training data now raises ValueError instead of being mis-numbered.
y_test = LabelEncoder().fit(df['label']).transform(df_test['label'])

test_dataset = FoodyDataset(X_test, y_test, tokenizer, SEQ_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
class BERTLSTMClassifier(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a linear head.

    Args:
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of logits produced per example.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint this notebook has always used.
    """

    def __init__(self, hidden_dim=128, num_classes=3,
                 pretrained_name='bert-base-multilingual-cased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        # Take the LSTM input width from the loaded checkpoint's config rather
        # than hard-coding 768, so other BERT sizes work without editing the class.
        self.lstm = nn.LSTM(
            self.bert.config.hidden_size,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(0.3)
        # A bidirectional LSTM emits forward and backward features side by
        # side, hence 2 * hidden_dim inputs to the head.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of tokenized reviews."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_out, _ = self.lstm(outputs.last_hidden_state)
        # Average over dim=1, the sequence axis given batch_first=True.
        # NOTE(review): attention_mask is only passed to BERT, so this mean
        # also includes the LSTM outputs at padded positions. Left unchanged
        # here so existing checkpoints and any separate inference code keep
        # the same behaviour; consider masked pooling if retraining.
        pooled = torch.mean(lstm_out, dim=1)
        out = self.dropout(pooled)
        return self.fc(out)

In [None]:
# One output logit per distinct encoded label.
num_classes = len(set(labels))
model = BERTLSTMClassifier(num_classes=num_classes)
model = model.to(device)

# A single AdamW optimizer over all of the model's parameters, using LR.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [None]:
def train_epoch(model, data_loader):
    """Run one optimisation pass over ``data_loader``.

    Relies on the notebook-level ``device``, ``optimizer`` and ``criterion``.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the loss averaged over the number of
        batches, and the accuracy of the predictions made during the pass.
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_targets = [], []

    for batch in tqdm(data_loader):
        input_ids, attention_mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

        # Predictions come from the logits computed before this step's update.
        epoch_preds.extend(logits.argmax(dim=1).cpu().numpy())
        epoch_targets.extend(targets.cpu().numpy())

    accuracy = accuracy_score(epoch_targets, epoch_preds)
    return running_loss / len(data_loader), accuracy

In [None]:
SAVE_EVERY = 2  # Save the model every 2 epochs; change as desired


In [None]:
# Checkpoints go to ./checkpoints; exist_ok makes re-running this cell safe.
# (os is already imported in the notebook's import cell.)
os.makedirs("checkpoints", exist_ok=True)

for epoch in range(EPOCHS):
    loss, acc = train_epoch(model, train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss:.4f} - Accuracy: {acc:.4f}")

    # Only every SAVE_EVERY-th epoch (counted from 1) is written to disk.
    is_checkpoint_epoch = (epoch + 1) % SAVE_EVERY == 0
    if not is_checkpoint_epoch:
        continue

    save_path = f"checkpoints/bert_lstm_epoch{epoch+1}.safetensors"
    save_file(model.state_dict(), save_path)
    print(f"✔️ Saved checkpoint at {save_path}")


In [None]:
# 11. Evaluate on the test set
def evaluate(model, data_loader):
    """Print a classification report for ``model`` over ``data_loader``.

    Runs the model in eval mode with gradients disabled, collects the argmax
    prediction for every example and prints scikit-learn's
    ``classification_report``. Relies on the notebook-level ``device``.

    Returns:
        None. The report is printed, not returned.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            # Move inputs to the notebook-level `device` (as train_epoch does)
            # instead of calling .cuda(), which raises on CPU-only machines.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they are not moved to the device.
            true_labels.extend(batch['label'].cpu().numpy())

    print(classification_report(true_labels, predictions))

# Print the classification report for the model on the test loader.
print("=== Evaluation on Test Set ===")
evaluate(model, test_loader)



In [None]:
# Persist the final weights (the model's state_dict) as a safetensors file
# in the current working directory.
save_file(model.state_dict(), "bert_lstm_foody.safetensors")