<a href="https://colab.research.google.com/github/Nimrat4/POSTaggingusingLSTM-GRU/blob/main/postaggingusinglstm%2Bgru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
import torch
import torch.nn as nn
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import numpy as np
from collections import defaultdict


In [4]:
# Fetch the Penn Treebank sample through NLTK's downloader.
nltk.download('treebank')

# Sentences come back as sequences of (word, tag) pairs.
sentences = treebank.tagged_sents()

# Pin the shuffle seed. The vocabulary is built from the training split only,
# so an unseeded split changes the vocabulary, the loss curve and the reported
# test accuracy on every run of the notebook.
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    sentences, test_size=0.2, random_state=SPLIT_SEED
)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [16]:
# Auto-numbering vocabularies: looking up a key that is not present yet stores
# it with the next free integer id (the table's current size).
word_to_ix = defaultdict(lambda: len(word_to_ix))
tag_to_ix = defaultdict(lambda: len(tag_to_ix))

# Look up "<PAD>" first so that padding receives id 0 in both tables.
for vocab in (word_to_ix, tag_to_ix):
    vocab["<PAD>"]

# A bare lookup is enough to register each training word and tag.
for tagged_sentence in train_data:
    for token, pos_tag in tagged_sentence:
        word_to_ix[token]
        tag_to_ix[pos_tag]


In [17]:
# Convert to plain dicts so that later lookups of unseen keys no longer add
# new entries to the vocabularies.
word_to_ix, tag_to_ix = dict(word_to_ix), dict(tag_to_ix)

# Reverse table: tag id -> tag string, used to decode model predictions.
ix_to_tag = {index: tag_name for tag_name, index in tag_to_ix.items()}

In [18]:
class PosDataset(Dataset):
    """Fixed-length, index-encoded view over a collection of tagged sentences.

    Every item is a pair of 1-D ``torch.long`` tensors ``(word_ids, tag_ids)``.
    Sentences longer than ``max_len`` are truncated; shorter ones are
    right-padded with the ``"<PAD>"`` id of the respective vocabulary.

    Args:
        data: Indexable collection of sentences, each an iterable of
            ``(word, tag)`` pairs.
        word_to_ix: Mapping from word to integer id. Must contain ``"<PAD>"``
            whenever a sentence is shorter than ``max_len``. If it contains
            ``"<UNK>"``, unknown words are encoded with that id.
        tag_to_ix: Mapping from tag to integer id; same ``"<PAD>"`` requirement.
        max_len: Length every returned tensor is padded / truncated to.
    """

    PAD_TOKEN = "<PAD>"
    UNK_TOKEN = "<UNK>"

    def __init__(self, data, word_to_ix, tag_to_ix, max_len=50):
        self.data = data
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.max_len = max_len

    def __len__(self):
        """Number of sentences in the underlying collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx``.

        Raises:
            KeyError: if padding is required and a vocabulary lacks ``"<PAD>"``.
        """
        sentence = self.data[idx]

        # Out-of-vocabulary words use a dedicated "<UNK>" id when the
        # vocabulary provides one. Without it they keep the previous fallback
        # of id 0, so vocabularies that have no "<UNK>" entry behave exactly
        # as before.
        unk_word = self.word_to_ix.get(self.UNK_TOKEN, 0)
        words = [self.word_to_ix.get(word, unk_word) for word, _ in sentence]
        # Tags missing from the tag vocabulary map to id 0 (the "<PAD>" id in
        # the vocabularies this notebook builds).
        tags = [self.tag_to_ix.get(tag, 0) for _, tag in sentence]

        # Pad in one step instead of appending element by element.
        shortfall = self.max_len - len(words)
        if shortfall > 0:
            words.extend([self.word_to_ix[self.PAD_TOKEN]] * shortfall)
            tags.extend([self.tag_to_ix[self.PAD_TOKEN]] * shortfall)

        # An explicit dtype keeps the result integer even for an empty list,
        # which torch.tensor would otherwise create as float32.
        return (
            torch.tensor(words[:self.max_len], dtype=torch.long),
            torch.tensor(tags[:self.max_len], dtype=torch.long),
        )


In [19]:
class GRULSTMTagger(nn.Module):
    """Sequence tagger: embedding -> GRU -> LSTM -> per-token linear scorer.

    Args:
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of scores produced for every token.
        embedding_dim: Width of each word embedding.
        hidden_dim: Hidden size shared by the GRU and the LSTM.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=128, hidden_dim=64):
        super().__init__()
        # Both recurrent layers are batch-first: (batch, sequence, features).
        recurrent_opts = dict(batch_first=True)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, **recurrent_opts)
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, **recurrent_opts)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, x):
        """Return tag scores for every position of the id tensor ``x``.

        With the batch-first layers configured in ``__init__``, a
        ``(batch, seq_len)`` input yields ``(batch, seq_len, tagset_size)``.
        """
        token_vectors = self.embedding(x)
        gru_states = self.gru(token_vectors)[0]   # keep per-step outputs, drop final state
        lstm_states = self.lstm(gru_states)[0]    # keep per-step outputs, drop final state
        return self.fc(lstm_states)


In [20]:
# Training configuration.
EPOCHS = 10            # passes over the training loader
BATCH_SIZE = 32        # sentences per training batch
LEARNING_RATE = 1e-3   # learning rate passed to the optimizer
MAX_LEN = 50           # tokens per sentence after padding / truncation

In [21]:
# Encode both splits with the same vocabularies and the same padded length.
train_dataset = PosDataset(
    data=train_data, word_to_ix=word_to_ix, tag_to_ix=tag_to_ix, max_len=MAX_LEN
)
test_dataset = PosDataset(
    data=test_data, word_to_ix=word_to_ix, tag_to_ix=tag_to_ix, max_len=MAX_LEN
)

# Training batches are shuffled; the test loader yields one sentence at a time.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [22]:
# Seed PyTorch's global RNG before any weights are created. Weight
# initialisation and the default shuffling of a DataLoader both draw from it,
# so without a seed every run starts from different weights, sees batches in a
# different order, and reports a different loss curve and accuracy.
TORCH_SEED = 42
torch.manual_seed(TORCH_SEED)

model = GRULSTMTagger(vocab_size=len(word_to_ix), tagset_size=len(tag_to_ix))
# Positions whose target is the "<PAD>" tag id contribute nothing to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=tag_to_ix["<PAD>"])
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [23]:
for epoch_number in range(1, EPOCHS + 1):
    model.train()  # make sure the model is in training mode for this epoch
    running_loss = 0.0
    for token_ids, gold_tags in train_loader:
        optimizer.zero_grad()
        logits = model(token_ids)
        # Merge all leading axes so that every token position is one sample
        # for the loss: logits -> (N, num_tags), targets -> (N,).
        num_tags = logits.size(-1)
        batch_loss = loss_fn(logits.view(-1, num_tags), gold_tags.view(-1))
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    # Reported value is the sum of the per-batch losses, not their mean.
    print(f"Epoch {epoch_number} Loss: {running_loss:.4f}")


Epoch 1 Loss: 285.7796
Epoch 2 Loss: 161.4240
Epoch 3 Loss: 107.1666
Epoch 4 Loss: 77.9148
Epoch 5 Loss: 59.4022
Epoch 6 Loss: 46.6446
Epoch 7 Loss: 37.2377
Epoch 8 Loss: 30.1681
Epoch 9 Loss: 24.6363
Epoch 10 Loss: 20.3403


In [24]:
model.eval()
pad_tag = tag_to_ix["<PAD>"]
correct = total = 0
with torch.no_grad():  # no gradients needed while scoring the test split
    for token_ids, gold_tags in test_loader:
        predicted_tags = model(token_ids).argmax(dim=-1)
        # Only positions whose gold tag is not padding are scored.
        is_real_token = gold_tags != pad_tag
        correct += (predicted_tags[is_real_token] == gold_tags[is_real_token]).sum().item()
        total += is_real_token.sum().item()

# Token-level accuracy over all non-padding positions.
accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 88.70%


In [27]:
import random

# Pick one sentence from the test split and show it with its gold tags.
random_sent = random.choice(test_data)
print("Original sentence with gold tags:", random_sent, sep="\n")


Original sentence with gold tags:
[('While', 'IN'), ('the', 'DT'), ('small', 'JJ'), ('deals', 'NNS'), ('are', 'VBP'), ('far', 'RB'), ('less', 'RBR'), ('conspicuous', 'JJ'), (',', ','), ('they', 'PRP'), ('add', 'VBP'), ('to', 'TO'), ('Japanese', 'JJ'), ('penetration', 'NN'), ('of', 'IN'), ('the', 'DT'), ('U.S.', 'NNP'), ('market', 'NN'), ('.', '.')]


In [28]:
# Keep only the tokens; the gold tags are dropped before prediction.
words_only = [token for token, _gold_tag in random_sent]
def predict_on_tokens(words):
    """Tag a tokenised sentence with the trained model.

    Reads the notebook globals ``model``, ``word_to_ix``, ``ix_to_tag`` and
    ``MAX_LEN``, and switches ``model`` to eval mode as a side effect.

    Args:
        words: Iterable of token strings. It is copied to a list, so
            generators are accepted.

    Returns:
        List of ``(word, predicted_tag)`` pairs, one per input token.

    Raises:
        KeyError: if padding is required and ``word_to_ix`` lacks ``"<PAD>"``.
    """
    # The tokens are traversed several times below (encoding, length, zip);
    # materialise them so a one-shot iterable does not come back empty.
    words = list(words)
    model.eval()

    # Unknown words use a dedicated "<UNK>" id when the vocabulary has one;
    # otherwise they keep the previous fallback of id 0.
    unk_id = word_to_ix.get("<UNK>", 0)
    word_ids = [word_to_ix.get(w, unk_id) for w in words]

    # Right-pad short inputs up to MAX_LEN; longer inputs are passed whole.
    shortfall = MAX_LEN - len(word_ids)
    if shortfall > 0:
        word_ids.extend([word_to_ix["<PAD>"]] * shortfall)

    # A leading batch axis of size 1; explicit integer dtype for the embedding.
    input_tensor = torch.tensor([word_ids], dtype=torch.long)
    with torch.no_grad():
        outputs = model(input_tensor)

    # Best-scoring tag id per position, cut back to the real tokens.
    pred_ids = outputs.argmax(dim=-1)[0][:len(words)].tolist()
    tags = [ix_to_tag[tag_id] for tag_id in pred_ids]
    return list(zip(words, tags))

# Tag the sampled sentence and show the (word, predicted_tag) pairs.
predicted = predict_on_tokens(words_only)
print("\nPredicted POS Tags:", predicted, sep="\n")



Predicted POS Tags:
[('While', 'IN'), ('the', 'DT'), ('small', 'JJ'), ('deals', 'NNS'), ('are', 'VBP'), ('far', 'RB'), ('less', 'JJR'), ('conspicuous', 'JJ'), (',', ','), ('they', 'PRP'), ('add', 'VBP'), ('to', 'TO'), ('Japanese', 'JJ'), ('penetration', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('U.S.', 'NNP'), ('market', 'NN'), ('.', '.')]
