In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Giải nén và loại bỏ thư mục gốc UD_English-EWT/
!tar -xzf "/content/drive/MyDrive/nlp/UD_English-EWT.tar.gz" -C "/content/drive/MyDrive/nlp/" --strip-components=1

print("GIẢI NÉN XONG! 3 file .conllu nằm NGAY trong MyDrive/nlp:")
!ls -l "/content/drive/MyDrive/nlp/" | grep conllu

GIẢI NÉN XONG! 3 file .conllu nằm NGAY trong MyDrive/nlp:
-rw------- 1 root root  1756983 Nov  7  2023 en_ewt-ud-dev.conllu
-rw------- 1 root root  1758286 Nov  7  2023 en_ewt-ud-test.conllu
-rw------- 1 root root 13846707 Nov  7  2023 en_ewt-ud-train.conllu


In [3]:
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(42)
print("Device:", DEVICE)

Device: cpu


Task 1: Tải và Tiền xử lý Dữ liệu

In [12]:
def load_conllu(path):
    sents = []
    sent = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if sent: sents.append(sent); sent = []
            elif line.startswith('#'):
                continue
            else:
                parts = line.split('\t')
                if len(parts) >= 10 and parts[0].isdigit():
                    word = parts[1].lower()
                    tag = parts[3]  # UPOS
                    sent.append((word, tag))
        if sent: sents.append(sent)
    return sents

train_sents = load_conllu('/content/drive/MyDrive/nlp/en_ewt-ud-train.conllu')
dev_sents   = load_conllu('/content/drive/MyDrive/nlp/en_ewt-ud-dev.conllu')

word_to_ix = {'<PAD>': 0, '<UNK>': 1}
tag_to_ix  = {'<PAD>': 0}

for sent in train_sents:
    for word, tag in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

ix_to_tag = {i: t for t, i in tag_to_ix.items()}

print(f"Train: {len(train_sents)} câu | Dev: {len(dev_sents)} câu")
print(f"Vocab size: {len(word_to_ix)} | Tagset size: {len(tag_to_ix)}")

Train: 12544 câu | Dev: 2001 câu
Vocab size: 16656 | Tagset size: 18


Task 2: Tạo PyTorch Dataset và DataLoader

In [6]:
class POSDataset(Dataset):
    def __init__(self, data): self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, i):
        w, t = zip(*self.data[i])
        return torch.tensor([word_to_ix.get(x,1) for x in w]), torch.tensor([tag_to_ix[x] for x in t])

def collate(b):
    x, y = zip(*b)
    return pad_sequence(x, batch_first=True, padding_value=0), \
           pad_sequence(y, batch_first=True, padding_value=0)

train_loader = DataLoader(POSDataset(train_sents), 32, shuffle=True,  collate_fn=collate)
dev_loader   = DataLoader(POSDataset(dev_sents),   32, shuffle=False, collate_fn=collate)

Task 3: Xây dựng Mô hình RNN

In [8]:
class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, pad_idx):
        super().__init__()
        # nn.Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.RNN
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        # nn.Linear
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

Task 4: Huấn luyện Mô hình

In [9]:
model = SimpleRNNForTokenClassification(
    vocab_size=len(word_to_ix),
    embedding_dim=128,
    hidden_dim=256,
    tagset_size=len(tag_to_ix),
    pad_idx=word_to_ix['<PAD>']
).to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=tag_to_ix['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("BẮT ĐẦU TASK 4.2 – HUẤN LUYỆN MÔ HÌNH")

for epoch in range(1, 11):
    model.train()
    total_loss = 0.0
    for i, (sentences, tags) in enumerate(train_loader, 1):
        sentences, tags = sentences.to(DEVICE), tags.to(DEVICE)

        optimizer.zero_grad()                                   # 1
        outputs = model(sentences)                              # 2
        loss = criterion(outputs.view(-1, len(tag_to_ix)),      # 3
                         tags.view(-1))
        loss.backward()                                         # 4
        optimizer.step()                                        # 5

        total_loss += loss.item()

        if i % 100 == 0:
            print(f"   Epoch {epoch} | Batch {i}/{len(train_loader)} | Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"\nEPOCH {epoch:02d} HOÀN THÀNH – Average Loss: {avg_loss:.4f}\n")

BẮT ĐẦU TASK 4.2 – HUẤN LUYỆN MÔ HÌNH
   Epoch 1 | Batch 100/392 | Loss: 1.0300
   Epoch 1 | Batch 200/392 | Loss: 0.7257
   Epoch 1 | Batch 300/392 | Loss: 0.6316

EPOCH 01 HOÀN THÀNH – Average Loss: 0.9291

   Epoch 2 | Batch 100/392 | Loss: 0.5103
   Epoch 2 | Batch 200/392 | Loss: 0.5818
   Epoch 2 | Batch 300/392 | Loss: 0.5004

EPOCH 02 HOÀN THÀNH – Average Loss: 0.5243

   Epoch 3 | Batch 100/392 | Loss: 0.4235
   Epoch 3 | Batch 200/392 | Loss: 0.4822
   Epoch 3 | Batch 300/392 | Loss: 0.4621

EPOCH 03 HOÀN THÀNH – Average Loss: 0.3899

   Epoch 4 | Batch 100/392 | Loss: 0.2027
   Epoch 4 | Batch 200/392 | Loss: 0.3096
   Epoch 4 | Batch 300/392 | Loss: 0.3455

EPOCH 04 HOÀN THÀNH – Average Loss: 0.3063

   Epoch 5 | Batch 100/392 | Loss: 0.2487
   Epoch 5 | Batch 200/392 | Loss: 0.2571
   Epoch 5 | Batch 300/392 | Loss: 0.2223

EPOCH 05 HOÀN THÀNH – Average Loss: 0.2440

   Epoch 6 | Batch 100/392 | Loss: 0.1251
   Epoch 6 | Batch 200/392 | Loss: 0.1211
   Epoch 6 | Batch 300/

Task 5: Đánh giá Mô hình

In [10]:
def evaluate(loader):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for sentences, tags in loader:
            sentences, tags = sentences.to(DEVICE), tags.to(DEVICE)
            outputs = model(sentences)
            predictions = outputs.argmax(dim=-1)
            mask = tags != tag_to_ix['<PAD>']
            correct += (predictions == tags)[mask].sum().item()
            total += mask.sum().item()
    return correct / total if total > 0 else 0

In [11]:
best_dev_acc = 0.0
for epoch in range(1, 11):
    model.train()
    for sentences, tags in train_loader:
        sentences, tags = sentences.to(DEVICE), tags.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(sentences)
        loss = criterion(outputs.view(-1, len(tag_to_ix)), tags.view(-1))
        loss.backward()
        optimizer.step()

    train_acc = evaluate(train_loader)
    dev_acc   = evaluate(dev_loader)

    print(f"EPOCH {epoch:02d}/10")
    print(f"   Train Accuracy: {train_acc:.4%}")
    print(f"   Dev Accuracy  : {dev_acc:.4%}")

    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        torch.save(model.state_dict(), "/content/drive/MyDrive/nlp/best_model_lab5.pth")
        print(f"   → ĐÃ LƯU MÔ HÌNH TỐT NHẤT (Dev = {dev_acc:.4%})")

# Hàm predict
def predict_sentence(sentence):
    model.eval()
    words = sentence.lower().split()
    ids = [word_to_ix.get(w, word_to_ix['<UNK>']) for w in words]
    x = torch.tensor([ids], device=DEVICE)
    with torch.no_grad():
        pred = model(x).argmax(-1)[0].cpu().tolist()
    print(f"\nCâu: \"{sentence}\"")
    for w, p in zip(words, pred):
        print(f"   {w:12} → {ix_to_tag[p]}")

EPOCH 01/10
   Train Accuracy: 98.7403%
   Dev Accuracy  : 88.1148%
   → ĐÃ LƯU MÔ HÌNH TỐT NHẤT (Dev = 88.1148%)
EPOCH 02/10
   Train Accuracy: 99.0160%
   Dev Accuracy  : 87.9279%
EPOCH 03/10
   Train Accuracy: 99.3010%
   Dev Accuracy  : 87.8683%
EPOCH 04/10
   Train Accuracy: 99.3650%
   Dev Accuracy  : 87.9439%
EPOCH 05/10
   Train Accuracy: 99.4907%
   Dev Accuracy  : 87.9279%
EPOCH 06/10
   Train Accuracy: 99.4965%
   Dev Accuracy  : 87.5820%
EPOCH 07/10
   Train Accuracy: 99.5478%
   Dev Accuracy  : 87.9081%
EPOCH 08/10
   Train Accuracy: 99.5674%
   Dev Accuracy  : 87.8922%
EPOCH 09/10
   Train Accuracy: 99.5381%
   Dev Accuracy  : 87.9717%
EPOCH 10/10
   Train Accuracy: 99.3773%
   Dev Accuracy  : 87.5621%


In [13]:
print("KẾT QUẢ THỰC HIỆN")
print(f"• Độ chính xác trên tập dev: {best_dev_acc:.4%}")
print("\n• Ví dụ dự đoán câu mới:")
predict_sentence("Tuan is very handsome")
predict_sentence("My teacher is not as handsome as me")

KẾT QUẢ THỰC HIỆN (copy nguyên vào report)
• Độ chính xác trên tập dev: 88.1148%

• Ví dụ dự đoán câu mới:

Câu: "Tuan is very handsome"
   tuan         → X
   is           → AUX
   very         → ADV
   handsome     → ADJ

Câu: "My teacher is not as handsome as me"
   my           → PRON
   teacher      → NOUN
   is           → AUX
   not          → PART
   as           → ADV
   handsome     → ADJ
   as           → ADP
   me           → PRON
