In [None]:
# Pin the NumPy/gensim pair this notebook runs against. `%pip` installs into
# the environment of the running kernel, which `!pip` does not guarantee.
# NOTE: numpy 1.24.3 is older than what the preinstalled jax / tensorflow
# builds require (pip reports the conflict), so do not import those here.
# Restart the kernel after this cell if numpy or gensim was already imported.
%pip uninstall -y numpy gensim
%pip install numpy==1.24.3 gensim==4.3.3

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting numpy==1.24.3
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but

Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Installing collected packages: gensim
Successfully installed gensim-4.3.3


In [None]:
# import libraries
# Package installation belongs to the first cell (numpy is pinned there), so
# this cell only imports modules and fetches the NLTK data it needs.
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gensim.downloader as api
from nltk.corpus import conll2000, brown, treebank
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# The three corpora read below, plus the mapping needed for tagset='universal'.
nltk.download('conll2000')
nltk.download('brown')
nltk.download('treebank')
nltk.download('universal_tagset')




[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [None]:
# Concatenate the CoNLL-2000, Brown and Treebank corpora (universal tagset),
# in that order, into a single collection of tagged sentences.
tagged_sents = conll2000.tagged_sents(tagset='universal')
for extra_corpus in (brown, treebank):
    tagged_sents = tagged_sents + extra_corpus.tagged_sents(tagset='universal')

In [None]:
# Give every lower-cased word and every tag an integer id in order of first
# appearance, then build the reverse lookup for tags.
word_to_ix = {}
tag_to_ix = {}

for sentence in tagged_sents:
    for token, label in sentence:
        # setdefault inserts only when the key is new, so ids stay contiguous.
        word_to_ix.setdefault(token.lower(), len(word_to_ix))
        tag_to_ix.setdefault(label, len(tag_to_ix))

ix_to_tag = dict(zip(tag_to_ix.values(), tag_to_ix.keys()))


In [None]:
# Build the embedding table from pretrained GloVe vectors.
glove_model = api.load("glove-wiki-gigaword-100")
# Read the width from the loaded vectors so it cannot drift from the model name.
embedding_dim = glove_model.vector_size

# Words without a GloVe vector keep a random row; drawing them from a seeded
# generator makes the table identical across kernel restarts.
rng = np.random.default_rng(42)
embedding_matrix = rng.normal(scale=0.6, size=(len(word_to_ix), embedding_dim))
for word, idx in word_to_ix.items():
    if word in glove_model:
        embedding_matrix[idx] = glove_model[word]

# torch.tensor copies, so the tensor does not alias embedding_matrix.
pretrained_embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)




In [None]:
# Convert each tagged sentence into a (lower-cased words, tags) pair, then hold
# out 10% of the sentences for testing with a fixed random_state.
data = []
for sent in tagged_sents:
    lowered_words = [w.lower() for w, _ in sent]
    sent_tags = [t for _, t in sent]
    data.append((lowered_words, sent_tags))

train_data, test_data = train_test_split(data, random_state=42, test_size=0.1)


In [None]:
# model definition
class BiLSTMTagger(nn.Module):
    """Bidirectional LSTM tagger on top of frozen pretrained word embeddings.

    Args:
        vocab_size: Number of rows in ``pretrained_embeddings``. Kept for
            interface compatibility; the table size is taken from the tensor.
        tagset_size: Number of output scores produced per token.
        embedding_dim: Width of one embedding row (the LSTM input size).
        hidden_dim: Hidden size of each LSTM direction.
        pretrained_embeddings: 2-D float tensor used as the embedding table.
        dropout: Dropout probability applied to the embeddings and to the
            LSTM outputs.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim,
                 pretrained_embeddings, dropout=0.3):
        super().__init__()
        # from_pretrained adopts the given table and freezes it, rather than
        # allocating and randomly initialising a vocab_size x embedding_dim
        # matrix only to replace it on the next line.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        # Forward and backward hidden states are concatenated, hence * 2.
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, sentence):
        """Return tag scores of shape ``(len(sentence), tagset_size)``.

        Args:
            sentence: 1-D LongTensor holding the word indices of one sentence.
        """
        seq_len = len(sentence)
        embeds = self.dropout(self.embedding(sentence))
        # The LSTM is not batch_first, so feed (seq_len, batch=1, features).
        lstm_out, _ = self.lstm(embeds.view(seq_len, 1, -1))
        lstm_out = self.dropout(lstm_out)
        return self.fc(lstm_out.view(seq_len, -1))


In [None]:
# model training
# Seed torch before the model is built so that weight initialisation and the
# dropout masks drawn during training are repeatable across kernel restarts.
torch.manual_seed(42)

hidden_dim = 64
model = BiLSTMTagger(len(word_to_ix), len(tag_to_ix), embedding_dim, hidden_dim, pretrained_embeddings)
loss_fn = nn.CrossEntropyLoss()
# Only parameters that require gradients go to Adam; the embedding table is frozen.
optimizer = optim.Adam((p for p in model.parameters() if p.requires_grad), lr=0.01)

def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and return the ids as a 1-D LongTensor.

    Raises:
        KeyError: if an item of ``seq`` is not a key of ``to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)


In [None]:

# Five passes over train_data, one sentence per optimisation step.
for epoch_num in range(1, 6):
    model.train()  # training mode, so dropout is active
    total_loss = 0
    for words, tags in train_data:
        # Leave out any sentence containing a word or tag that has no index.
        if not all(w in word_to_ix and t in tag_to_ix for w, t in zip(words, tags)):
            continue
        inputs = prepare_sequence(words, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        model.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # The printed figure is the sum of the per-sentence losses for this epoch.
    print(f"Epoch {epoch_num}: Loss = {total_loss:.2f}")


Epoch 1: Loss = 28164.05
Epoch 2: Loss = 24883.98
Epoch 3: Loss = 24004.04
Epoch 4: Loss = 23341.24
Epoch 5: Loss = 22897.83


In [None]:
# Token-level accuracy on the held-out sentences, with dropout switched off.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for words, tags in test_data:
        # Leave out any sentence containing a word or tag that has no index.
        if not all(w in word_to_ix and t in tag_to_ix for w, t in zip(words, tags)):
            continue
        inputs = prepare_sequence(words, word_to_ix)
        gold = prepare_sequence(tags, tag_to_ix)
        scores = model(inputs)
        # Highest-scoring tag index for each token.
        y_true += gold.tolist()
        y_pred += scores.argmax(dim=1).tolist()

accuracy = accuracy_score(y_true, y_pred)
print(f"POS Tagging Accuracy: {accuracy:.4f}")


POS Tagging Accuracy: 0.9313
