In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
import re

nltk.download('punkt')

# Config
GLOVE_PATH = "/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt"
IMDB_CSV_PATH = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
EMBEDDING_DIM = 200
MAX_VOCAB_SIZE = 20000
MAX_LEN = 320
BATCH_SIZE = 64
EPOCHS = 10
PATIENCE = 5


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Thông tin liên quan đến notebook và dataset 

link dataset : https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

link glove : https://www.kaggle.com/datasets/rtatman/glove-global-vectors-for-word-representation

link notebook in kaggle : https://www.kaggle.com/code/tqkhanh05/ml-uet

In [None]:

# Load dataset
df = pd.read_csv(IMDB_CSV_PATH)
df['label'] = df['sentiment'].map({'positive':1, 'negative':0})

def tokenizer(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return word_tokenize(text)

# Chia train-val 80-20
train_size = int(len(df)*0.8)
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:]

# Build vocab
freq = {}
for text in tqdm(train_df['review'], desc="Building vocab"):
    for token in tokenizer(text):
        freq[token] = freq.get(token, 0) + 1

sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:MAX_VOCAB_SIZE-2]
vocab = {"<pad>":0, "<unk>":1}
for i, (word, _) in enumerate(sorted_freq, 2):
    vocab[word] = i

# Encode text -> ids + padding
def encode(text):
    tokens = tokenizer(text)
    ids = [vocab.get(t, vocab["<unk>"]) for t in tokens]
    if len(ids) > MAX_LEN:
        ids = ids[:MAX_LEN]
    else:
        ids += [vocab["<pad>"]] * (MAX_LEN - len(ids))
    return ids

# Dataset wrapper
class IMDbDataset(Dataset):
    def __init__(self, df):
        self.texts = df['review'].values
        self.labels = df['label'].values
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return torch.tensor(encode(self.texts[idx]), dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

train_dataset = IMDbDataset(train_df)
val_dataset = IMDbDataset(val_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Load pretrained GloVe embeddings
def load_glove_embeddings(path, vocab, embedding_dim=100):
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(path, "r", encoding="utf8") as f:
        for line in tqdm(f, total=400000):
            values = line.strip().split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector

    embedding_matrix = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    embedding_matrix[vocab["<pad>"]] = np.zeros(embedding_dim)
    found = 0
    for word, idx in vocab.items():
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[idx] = vec
            found += 1
    print(f"Found {found} vectors for vocab words.")
    return torch.tensor(embedding_matrix, dtype=torch.float32)

embedding_matrix = load_glove_embeddings(GLOVE_PATH, vocab, EMBEDDING_DIM)


Building vocab: 100%|██████████| 40000/40000 [00:26<00:00, 1511.57it/s]


Loading GloVe embeddings...


100%|██████████| 400000/400000 [00:16<00:00, 24556.30it/s]


Found 18813 vectors for vocab words.


In [None]:

# Attention module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim*2, 1)
    def forward(self, lstm_output):
        attn_weights = torch.softmax(self.attn(lstm_output).squeeze(-1), dim=1)
        context = torch.bmm(attn_weights.unsqueeze(1), lstm_output).squeeze(1)
        return context

# Model: CNN + BiLSTM + Attention + GRU
class CNN_BiLSTM_GRU_Attention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_dim=128, output_dim=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=0)
        self.conv = nn.Conv1d(embedding_dim, 128, kernel_size=5, padding=2)
        self.bilstm = nn.LSTM(128, hidden_dim, batch_first=True, bidirectional=True)
        self.gru = nn.GRU(hidden_dim*2, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, x):
        emb = self.embedding(x)                   
        emb = emb.transpose(1, 2)                
        conv_out = torch.relu(self.conv(emb))   
        conv_out = conv_out.transpose(1, 2)     
        lstm_out, _ = self.bilstm(conv_out)      
        gru_out, _ = self.gru(lstm_out)          
        attn_out = self.attention(gru_out)      
        dropped = self.dropout(attn_out)
        out = self.fc(dropped)
        return out



In [10]:
# Setup device, model, loss, optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN_BiLSTM_GRU_Attention(len(vocab), EMBEDDING_DIM, embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1)

# Train and eval functions
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    total_loss, total_correct = 0, 0
    for x, y in tqdm(loader, leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * y.size(0)
        preds = out.argmax(dim=1)
        total_correct += (preds == y).sum().item()
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset)

def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            loss = criterion(out, y)
            total_loss += loss.item() * y.size(0)
            preds = out.argmax(dim=1)
            total_correct += (preds == y).sum().item()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    return total_loss / len(loader.dataset), total_correct / len(loader.dataset), precision, recall, f1

# Train loop with early stopping
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc, val_prec, val_rec, val_f1 = eval_epoch(model, val_loader, criterion)
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} Acc: {train_acc:.4f}")
    print(f" Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} Precision: {val_prec:.4f} Recall: {val_rec:.4f} F1: {val_f1:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pt")
        print(" Model saved.")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping.")
            break

                                                 

Epoch 1/10 | Train Loss: 0.3425 Acc: 0.8442
 Val Loss: 0.2401 Acc: 0.9051 Precision: 0.8909 Recall: 0.9235 F1: 0.9069
 Model saved.


                                                 

Epoch 2/10 | Train Loss: 0.1911 Acc: 0.9264
 Val Loss: 0.2280 Acc: 0.9076 Precision: 0.9179 Recall: 0.8955 F1: 0.9066
 Model saved.


                                                 

Epoch 3/10 | Train Loss: 0.1257 Acc: 0.9550
 Val Loss: 0.2553 Acc: 0.9050 Precision: 0.8940 Recall: 0.9193 F1: 0.9065


                                                 

Epoch 4/10 | Train Loss: 0.0681 Acc: 0.9773
 Val Loss: 0.3347 Acc: 0.8961 Precision: 0.8689 Recall: 0.9333 F1: 0.9000


                                                 

Epoch 5/10 | Train Loss: 0.0187 Acc: 0.9953
 Val Loss: 0.4312 Acc: 0.8968 Precision: 0.9097 Recall: 0.8814 F1: 0.8953


                                                 

Epoch 6/10 | Train Loss: 0.0113 Acc: 0.9978
 Val Loss: 0.4879 Acc: 0.8954 Precision: 0.9020 Recall: 0.8876 F1: 0.8947


                                                 

Epoch 7/10 | Train Loss: 0.0079 Acc: 0.9988
 Val Loss: 0.4929 Acc: 0.8961 Precision: 0.8966 Recall: 0.8957 F1: 0.8962
Early stopping.


In [11]:

# Load best model for final eval
model.load_state_dict(torch.load("best_model.pt"))
val_loss, val_acc, val_prec, val_rec, val_f1 = eval_epoch(model, val_loader, criterion)
print(f"Final Eval | Loss: {val_loss:.4f} Acc: {val_acc:.4f} Precision: {val_prec:.4f} Recall: {val_rec:.4f} F1: {val_f1:.4f}")

Final Eval | Loss: 0.2280 Acc: 0.9076 Precision: 0.9179 Recall: 0.8955 F1: 0.9066
