In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from TorchCRF import CRF
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report
import numpy as np
import time


In [15]:
# -----------------------
# 1. Load CoNLL file
# -----------------------
def load_conll(file_path):
    """
    Read a tab-separated CoNLL file into parallel token / tag sequences.

    Blank lines separate sentences. A non-blank line is used only when it
    splits into exactly three tab-separated columns; the first column is the
    token and the third is the NER tag (the middle column is ignored). Lines
    with any other column count are skipped.

    Args:
        file_path (str): Path to the CoNLL file (read as UTF-8).

    Returns:
        Tuple[List[List[str]], List[List[str]]]:
            - sentences: one list of tokens per sentence.
            - labels: one list of NER tags per sentence, aligned with the tokens.
    """
    all_tokens, all_tags = [], []
    cur_tokens, cur_tags = [], []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped:
                fields = stripped.split("\t")
                if len(fields) == 3:
                    cur_tokens.append(fields[0])
                    cur_tags.append(fields[2])
                continue
            # Blank line: close the sentence collected so far, if any.
            if cur_tokens:
                all_tokens.append(cur_tokens)
                all_tags.append(cur_tags)
                cur_tokens, cur_tags = [], []

    # The file may end without a trailing blank line.
    if cur_tokens:
        all_tokens.append(cur_tokens)
        all_tags.append(cur_tags)
    return all_tokens, all_tags

In [16]:
# -----------------------
# 2. Load FastText embeddings
# -----------------------
# Load the pretrained embeddings from a relative path next to the notebook.
# NOTE(review): later cells access `ft_model.wv`, so the saved object is presumably
# a full FastText model rather than a bare KeyedVectors — confirm how the file was saved.
ft_model = KeyedVectors.load("fasttext_gensim.model") 

In [21]:
# Mean and standard deviation over every value of the pretrained embedding
# matrix, followed by one sample vector drawn from a normal distribution
# with those two parameters.
embedding_matrix = ft_model.wv.vectors
mean = embedding_matrix.mean()
std = embedding_matrix.std()
for stat in (mean, std):
    print(stat)
sample_vector = np.random.normal(mean, std, size=ft_model.vector_size)
print(sample_vector)


0.014775803
0.34179795
[-0.44198282  0.02582968  0.33800261  0.06594879 -0.44878828  0.47088423
 -0.27932665  0.42560581 -0.05572311  0.4001316   0.13055134 -0.11151295
 -0.15306171 -0.14323097 -0.12913796  0.26138182 -0.63468155 -0.11789568
  0.65389852 -0.74062469  0.42172738 -0.11889626  0.23699606 -0.02890524
  0.35524373  0.02966917 -0.43205671 -0.37794963  0.00752425  0.49174917
 -0.47561019 -0.5359647  -0.25842451 -0.54694707  0.25038778  0.22674871
 -0.00955855 -0.27833973  0.34721668 -0.15114986 -0.0874848   0.25236214
 -0.30786855 -0.33711216  0.76460319  0.35511871  0.06550735  0.18226581
  0.58270976 -0.50396422 -0.04041729  0.24810192 -0.51318792  0.22582641
 -0.22378568  0.5351447   0.26858919  0.21304395 -0.18398857 -0.28159121
  0.69626335  0.34860435 -0.09992923  0.19538345 -0.55617664 -0.03142043
  0.30947933  0.03735474 -0.04113931  0.53330784 -0.249353   -0.1070699
  0.03153271 -0.22676216 -0.94700628 -0.65426211 -0.09662536  0.15883235
  0.04005759 -0.42273942 -0.0

In [17]:
# -----------------------
# 3. Prepare vocab and tag mappings
# -----------------------
sentences, labels = load_conll("ner_80train.conll")  # training split

# Token -> integer id table consumed by the embedding layer.
# Ids 0 and 1 are reserved for the padding and unknown-token entries; every
# other token receives the next free id in order of first appearance,
# e.g. {"<PAD>": 0, "<UNK>": 1, "John": 2, "lives": 3, "in": 4, "Yangon": 5}.
vocab = {"<PAD>": 0, "<UNK>": 1}
for sent in sentences:
    for w in sent:
        vocab.setdefault(w, len(vocab))  # no-op when the token already has an id

# NER tag -> id table (id 0 reserved for padding) and its inverse for decoding.
ner_tag_to_ix = {"<PAD>": 0}
for tag_seq in labels:
    for t in tag_seq:
        ner_tag_to_ix.setdefault(t, len(ner_tag_to_ix))
id2tag = dict((ix, tag) for tag, ix in ner_tag_to_ix.items())
print(id2tag)

{0: '<PAD>', 1: 'O', 2: 'B-LOC', 3: 'I-LOC', 4: 'B-DATE', 5: 'I-DATE', 6: 'B-TIME', 7: 'I-TIME'}


In [18]:
# -----------------------
# 4. Dataset with dynamic padding
# -----------------------
class NERDataset(Dataset):
    """
    Thin PyTorch Dataset over pre-tokenized sentences and their NER tags.

    Items are returned as raw strings; conversion to indices and padding is
    left to the DataLoader's collate function.

    Args:
        sentences (List[List[str]]): Tokenized sentences.
        labels (List[List[str]]): NER tag sequences, one per sentence.
        vocab (dict): Token -> index mapping (stored, not used by this class).
        ner_tag_to_ix (dict): NER tag -> index mapping (stored, not used by this class).
    Item format:
        (["token1", "token2", ..., "tokenN"], ["label1", "label2", ..., "labelN"])
    """
    def __init__(self, sentences, labels, vocab, ner_tag_to_ix):
        self.sentences, self.labels = sentences, labels
        self.vocab, self.ner_tag_to_ix = vocab, ner_tag_to_ix

    def __len__(self):
        # One item per sentence.
        n_sentences = len(self.sentences)
        return n_sentences

    def __getitem__(self, idx):
        tokens = self.sentences[idx]
        tags = self.labels[idx]
        return tokens, tags

In [19]:
def collate_fn(batch):
    """
    Turn a batch of (tokens, tags) pairs into two padded index tensors.

    Every sequence is right-padded with "<PAD>" up to the longest sentence in
    the batch. Tokens are then mapped through the module-level `vocab`
    (tokens missing from it fall back to the "<UNK>" index) and tags through
    the module-level `ner_tag_to_ix`.

    Args:
        batch (List[Tuple[List[str], List[str]]]): Sentences and their NER tags.

    Returns:
        Tuple[Tensor, Tensor]: token indices and tag indices, both of dtype
        long and shape (batch_size, max_len).
    """
    token_seqs, tag_seqs = zip(*batch)
    width = max(map(len, token_seqs))

    token_rows = [
        [vocab.get(tok, vocab["<UNK>"]) for tok in seq + ["<PAD>"] * (width - len(seq))]
        for seq in token_seqs
    ]
    tag_rows = [
        [ner_tag_to_ix[tag] for tag in seq + ["<PAD>"] * (width - len(seq))]
        for seq in tag_seqs
    ]

    token_tensor = torch.tensor(token_rows, dtype=torch.long)
    tag_tensor = torch.tensor(tag_rows, dtype=torch.long)
    return token_tensor, tag_tensor

In [None]:
# check dataset sample
batch_size = 4

dataset = NERDataset(sentences, labels, vocab, ner_tag_to_ix)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
sentence, label = dataset[6]
print(f"Sentence: {sentence}")
print(f"Label: {label}")

# Unpack into batch-specific names. Unpacking into `sentences, labels` would
# replace the corpus lists with one batch of tensors, and the later cells that
# rebuild datasets / splits from `sentences` and `labels` would then fail.
for batch_sentences, batch_labels in loader:
    print("Sentences batch:", batch_sentences)
    print("Labels batch:", batch_labels)
    break   # stop after first batch


Sentence: ['သူ', 'တို့', 'သည်', 'သေဒဏ်', 'ကို', 'ရင်', 'ဆိုင်', 'ရ', 'မည်', 'ကို', 'သိ', 'ခဲ့', 'သော', 'နေ', 'ရာ', '၊', 'အင်ဒိုးနီးရှား', 'သို့', 'ခရီးသွား', 'ရန်', 'ဘာလီ', 'နိုင်း', 'ကို', 'ခွင့်ပြု', 'ခြင်း', 'အတွက်', 'ဩစတြေးလျ', 'ဖယ်ဒရယ်', 'ရဲတပ်ဖွဲ့', 'သည်', 'ဝေဖန်', 'ခံ', 'ခဲ့', 'ရ', 'သည်', '။']
Label: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sentences batch: tensor([[  623,   339,  9956,    18,  2664,     3,  2464,  1243,    95,   170,
            27,    29,  2464,   249,    86,    95,    44,   146,  1994,    92,
            46,   496,  5692,    58, 19262,  7241,    58, 10078,   208,   325,
          1003,    63,    22,  9448,   293,    95,  1374,   116,   525,  1114,
          1521, 17736,    39,    89,   227,   250,  1748,   125,     3,  1096,
          2518,  1811,  2340,  1602,    28,     3,    91,    53,     6,     8],
        [ 

In [None]:
# -----------------------
# 5. BiLSTM-CRF Model
# -----------------------
class BiLSTM_CRF(nn.Module):
    """
    BiLSTM-CRF model for sequence labeling with FastText-initialised embeddings.

    Args:
        vocab_size (int): Number of rows in the embedding table.
        embedding_dim (int): Dimension of embeddings.
        hidden_dim (int): Size of the concatenated BiLSTM output. Each direction
            uses ``hidden_dim // 2`` units, so an even value is expected for the
            output to match the linear layer's input size.
        tagset_size (int): Number of NER tag ids scored by the model.
        ft_model: Pretrained FastText object; only ``ft_model.wv`` is used, for
            membership tests and vector lookup.
        word_to_ix (dict, optional): Token -> embedding-row mapping used to place
            the pretrained vectors. Defaults to the notebook-level ``vocab`` so
            existing calls keep working; pass it explicitly to avoid depending
            on hidden notebook state.
    Steps:
        1. Create an embedding layer to map word indices → vectors.
        2. Initialize embeddings from pretrained FastText:
        - Use FastText vector if available.
        - Otherwise, use a random vector.
        3. Create a bidirectional LSTM to capture context from both directions.
        4. Add a linear layer to map LSTM outputs → tag scores.
        5. Add a CRF layer to model tag transitions and sequence-level predictions.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, ft_model, word_to_ix=None):
        super(BiLSTM_CRF, self).__init__()
        if word_to_ix is None:
            # Backward-compatible fallback to the module-level vocabulary.
            word_to_ix = vocab
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Initialize embedding weights from FastText
        emb_weights = np.zeros((vocab_size, embedding_dim))
        for w, idx in word_to_ix.items():
            if w in ft_model.wv:
                emb_weights[idx] = ft_model.wv[w]
            else:
                # scale = standard deviation (spread) of the distribution.
                emb_weights[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
        self.embedding.weight.data.copy_(torch.tensor(emb_weights, dtype=torch.float32))

        # batch_first=True: tensors are laid out as (batch_size, seq_len, features).
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                              num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)


    def forward(self, x, tags=None, mask=None):
        """
        Forward pass for training or inference.

        Args:
            x (Tensor): Input token indices.
            tags (Tensor, optional): NER tag indices for training.
            mask (Tensor, optional): Mask for padding, forwarded to the CRF layer.

        Returns:
            If tags is provided: loss (Tensor), the negated CRF output computed
            with ``reduction='mean'``.
            Else: the result of ``self.crf.decode`` (decoded tag sequences).
        Steps:
            1. Look up embeddings for input token indices.
            2. Pass embeddings through the BiLSTM to get contextualized token representations.
            3. Use the linear layer to compute emission scores for each tag.
            4. If training (tags provided):
            - Compute the CRF loss using emissions and true tags.
            5. If inference (tags not provided):
            - Use CRF to decode the most likely tag sequence for each sentence.
        """
        embeds = self.embedding(x)
        lstm_out, _ = self.bilstm(embeds)
        emissions = self.hidden2tag(lstm_out)
        if tags is not None:
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:
            return self.crf.decode(emissions, mask=mask)

# -----------------------
# 6. Hyperparameters
# -----------------------
embedding_dim = ft_model.vector_size
hidden_dim = 128
vocab_size = len(vocab)
tagset_size = len(ner_tag_to_ix)
batch_size = 32
n_epochs = 10
lr = 0.001
seed = 42

# Seed NumPy (random vectors for tokens without a FastText entry) and PyTorch
# (layer initialisation, DataLoader shuffling) so a re-run repeats the same draws.
np.random.seed(seed)
torch.manual_seed(seed)

# -----------------------
# 7. DataLoader
# -----------------------
dataset = NERDataset(sentences, labels, vocab, ner_tag_to_ix)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# -----------------------
# 8. Initialize model
# -----------------------
model = BiLSTM_CRF(vocab_size, embedding_dim, hidden_dim, tagset_size, ft_model)
optimizer = optim.Adam(model.parameters(), lr=lr)

# -----------------------
# 9. Training
# -----------------------
start_time = time.time()
for epoch in range(1, n_epochs + 1):
    model.train()
    total_loss = 0  # sum of the per-batch loss values for this epoch
    for X, y in loader:
        mask = (X != vocab["<PAD>"])  # True at real tokens, False at padding
        optimizer.zero_grad() # Clears old gradients before computing new ones.
        loss = model(X, tags=y, mask=mask)
        loss.backward()
        optimizer.step() # Applies the optimizer update to the model parameters
        total_loss += loss.item()
    print(f"Epoch {epoch}/{n_epochs}, Loss: {total_loss:.4f}")
print(f"Training completed in {time.time() - start_time:.2f} seconds")


Epoch 1/10, Loss: 4325.9877
Epoch 2/10, Loss: 1569.9689
Epoch 3/10, Loss: 1145.7503
Epoch 4/10, Loss: 920.1621
Epoch 5/10, Loss: 761.4495
Epoch 6/10, Loss: 641.9766
Epoch 7/10, Loss: 538.4350
Epoch 8/10, Loss: 449.4353
Epoch 9/10, Loss: 379.2052
Epoch 10/10, Loss: 319.4638
Training completed in 3808.49 seconds


In [7]:
# -----------------------
# 10. Save model
# -----------------------
# Only the parameters (state dict) are written, not the module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, "bilstm_crf_fasttext_epoch10.pth")
print("Model saved as bilstm_crf_fasttext_epoch10.pth")

# -----------------------
# 11. Evaluation
# -----------------------
# Token-level report over the batches of `loader`; padded positions are
# excluded by truncating each sequence to its number of unmasked tokens.
model.eval()
all_true, all_pred = [], []
with torch.no_grad():
    for X, y in loader:
        mask = (X != vocab["<PAD>"])
        preds = model(X, mask=mask)
        for i, pred_seq in enumerate(preds):
            length = mask[i].sum().item()
            all_pred.extend(id2tag[p] for p in pred_seq[:length])
            all_true.extend(id2tag[t] for t in y[i][:length].tolist())

print("NER Classification Report:")
print(classification_report(all_true, all_pred, digits=4, zero_division=0))

Model saved as bilstm_crf_fasttext_epoch10.pth
NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.9874    0.9846    0.9860     10549
       B-LOC     0.9910    0.9838    0.9874     43951
      B-TIME     0.9774    0.9730    0.9752      2263
      I-DATE     0.9866    0.9796    0.9831     17095
       I-LOC     0.9787    0.9812    0.9799     31973
      I-TIME     0.9763    0.9763    0.9763      2872
           O     0.9992    0.9994    0.9993   1983478

    accuracy                         0.9985   2092181
   macro avg     0.9852    0.9826    0.9839   2092181
weighted avg     0.9985    0.9985    0.9985   2092181



In [None]:
# -----------------------
# 1. Load test set
# -----------------------
test_sentences, test_labels = load_conll("ner_20test.conll")
test_dataset = NERDataset(test_sentences, test_labels, vocab, ner_tag_to_ix)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=32)
# -----------------------
# 2. Token-level evaluation on the test set
# -----------------------
model.eval()  # switch the model to evaluation mode
all_true, all_pred = [], []
with torch.no_grad():  # no gradients are computed
    for X, y in test_loader:
        mask = (X != vocab["<PAD>"])
        preds = model(X, mask=mask)
        for i, pred_seq in enumerate(preds):
            # Number of real (non-padding) tokens in sentence i.
            length = mask[i].sum().item()
            all_pred.extend(id2tag[p] for p in pred_seq[:length])
            all_true.extend(id2tag[t] for t in y[i][:length].tolist())

print("NER Classification Report:")
print(classification_report(all_true, all_pred, digits=4, zero_division=0))

NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.9570    0.9417    0.9493      2744
       B-LOC     0.9094    0.8910    0.9001     10967
      B-TIME     0.9479    0.9465    0.9472       635
      I-DATE     0.9687    0.9500    0.9592      4361
       I-LOC     0.8443    0.8362    0.8403      8158
      I-TIME     0.9146    0.9530    0.9334       809
           O     0.9946    0.9954    0.9950    500371

    accuracy                         0.9900    528045
   macro avg     0.9338    0.9305    0.9321    528045
weighted avg     0.9899    0.9900    0.9899    528045



In [44]:
# -----------------------
# 1. Load test set
# -----------------------
test_sentences, test_labels = load_conll("ner_20test.conll")
test_dataset = NERDataset(test_sentences, test_labels, vocab, ner_tag_to_ix)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=32)

# -----------------------
# 2. Evaluation with seqeval
# -----------------------
from seqeval.metrics import (
    accuracy_score,
    classification_report as seq_classification_report,
    f1_score,
)

# seqeval expects one tag list per sentence, so sequences are appended
# individually instead of being flattened into a single token stream.
model.eval()
all_true, all_pred = [], []
with torch.no_grad():
    for X, y in test_loader:
        mask = (X != vocab["<PAD>"])
        preds = model(X, mask=mask)
        lengths = mask.sum(dim=1).tolist()
        for pred_seq, gold_row, length in zip(preds, y.tolist(), lengths):
            all_pred.append([id2tag[p] for p in pred_seq[:length]])
            all_true.append([id2tag[t] for t in gold_row[:length]])

print("Seqeval NER Classification Report on Test Set:")
print(seq_classification_report(all_true, all_pred, digits=4))
print("F1-score:", f1_score(all_true, all_pred))
print("Accuracy:", accuracy_score(all_true, all_pred))


Seqeval NER Classification Report on Test Set:
              precision    recall  f1-score   support

        DATE     0.9470    0.9031    0.9246      2829
         LOC     0.8772    0.8585    0.8678     11374
        TIME     0.8775    0.8378    0.8571       641

   micro avg     0.8903    0.8661    0.8780     14844
   macro avg     0.9005    0.8665    0.8832     14844
weighted avg     0.8905    0.8661    0.8781     14844

F1-score: 0.8780304582394318
Accuracy: 0.9902148491132384


In [None]:
class tuned_BiLSTM_CRF(nn.Module):
    """
    Tuned BiLSTM-CRF model with dropout and two stacked LSTM layers.

    Args:
        vocab_size (int): Number of rows in the embedding table.
        embedding_dim (int): Dimension of embeddings.
        hidden_dim (int): Size of the concatenated BiLSTM output. Each direction
            uses ``hidden_dim // 2`` units, so an even value is expected for the
            output to match the linear layer's input size.
        tagset_size (int): Number of NER tag ids scored by the model.
        ft_model: Pretrained FastText object; only ``ft_model.wv`` is used, for
            membership tests and vector lookup.
        dropout (float): Dropout rate, applied between the LSTM layers and
            again on the LSTM output before the linear layer.
        word_to_ix (dict, optional): Token -> embedding-row mapping used to place
            the pretrained vectors. Defaults to the notebook-level ``vocab`` so
            existing calls keep working; pass it explicitly to avoid depending
            on hidden notebook state.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, ft_model, dropout=0.2,
                 word_to_ix=None):
        super(tuned_BiLSTM_CRF, self).__init__()
        if word_to_ix is None:
            # Backward-compatible fallback to the module-level vocabulary.
            word_to_ix = vocab
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Load FastText embeddings; tokens without a vector get a random one.
        emb_weights = np.zeros((vocab_size, embedding_dim))
        for w, idx in word_to_ix.items():
            if w in ft_model.wv:
                emb_weights[idx] = ft_model.wv[w]
            else:
                emb_weights[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
        self.embedding.weight.data.copy_(torch.tensor(emb_weights, dtype=torch.float32))

        self.bilstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,
            num_layers=2,              # 2 layers
            bidirectional=True,
            batch_first=True,
            dropout=dropout              # dropout between layers
        )

        self.dropout = nn.Dropout(dropout)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        """
        Forward pass for training or inference.

        Args:
            x (Tensor): Input token indices.
            tags (Tensor, optional): NER tag indices for training.
            mask (Tensor, optional): Mask for padding, forwarded to the CRF layer.

        Returns:
            If tags is provided: loss (Tensor), the negated CRF output computed
            with ``reduction='mean'``.
            Else: the result of ``self.crf.decode`` (decoded tag sequences).
        """
        embeds = self.embedding(x)
        lstm_out, _ = self.bilstm(embeds)
        lstm_out = self.dropout(lstm_out)        # dropout before linear
        emissions = self.hidden2tag(lstm_out)
        if tags is not None:
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:
            return self.crf.decode(emissions, mask=mask)


In [None]:
# NOTE: from this cell on, `classification_report` refers to seqeval's
# sequence-level version, replacing the sklearn one imported at the top.
from seqeval.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
import torch.optim as optim

# Hold out 10% of the training sentences for validation (fixed split seed).
train_sents, val_sents, train_labels, val_labels = train_test_split(
    sentences, labels, test_size=0.1, random_state=42
) # train 90% / val 10%

train_dataset = NERDataset(train_sents, train_labels, vocab, ner_tag_to_ix)
val_dataset = NERDataset(val_sents, val_labels, vocab, ner_tag_to_ix)

embedding_dim = ft_model.vector_size
vocab_size = len(vocab)
tagset_size = len(ner_tag_to_ix)
batch_size = 32
n_epochs = 5
lr = 0.001
seed = 42

# Seed NumPy (random vectors for tokens without a FastText entry) and PyTorch
# (layer initialisation, dropout, DataLoader shuffling) so a re-run repeats
# the same draws.
np.random.seed(seed)
torch.manual_seed(seed)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

tuned_model = tuned_BiLSTM_CRF(vocab_size, embedding_dim, hidden_dim=256,  # larger hidden_dim
                   tagset_size=tagset_size, ft_model=ft_model, dropout=0.2)

optimizer = optim.Adam(tuned_model.parameters(), lr=lr)

# 'max' mode: the scheduler is stepped with a score that should increase
# (validation F1); the LR is halved after `patience` epochs without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5)

id2tag = {v: k for k, v in ner_tag_to_ix.items()}

def evaluate(model, loader):
    """
    Evaluate a NER model with seqeval and print its classification report.

    The model is switched to evaluation mode and left in that mode; callers
    that continue training must call ``model.train()`` again.

    Args:
        model (nn.Module): Trained NER model; called as ``model(X, mask=mask)``
            and expected to return one predicted tag-id sequence per sentence.
        loader (DataLoader): DataLoader yielding (token_ids, tag_ids) batches.

    Returns:
        float: seqeval F1 score of the predictions.
    """
    # Import under private aliases inside the function: other cells rebind the
    # global `classification_report` between sklearn's and seqeval's versions,
    # and this function needs seqeval's (one tag list per sentence) regardless
    # of which cell ran last.
    from seqeval.metrics import classification_report as _seq_report
    from seqeval.metrics import f1_score as _seq_f1

    model.eval()
    all_true, all_pred = [], []
    with torch.no_grad():
        for X, y in loader:
            mask = (X != vocab["<PAD>"])
            preds = model(X, mask=mask)

            # Convert ids → tags; gold tags at masked (padding) positions are dropped.
            for true_seq, pred_seq, m in zip(y.tolist(), preds, mask.tolist()):
                true_tags = [id2tag[t] for t, mk in zip(true_seq, m) if mk]
                pred_tags = [id2tag[t] for t in pred_seq]
                all_true.append(true_tags)
                all_pred.append(pred_tags)

    f1 = _seq_f1(all_true, all_pred)
    print(_seq_report(all_true, all_pred, digits=4))
    return f1

# -----------------------
# Training Loop
# -----------------------
start_time = time.time()
for epoch in range(1, n_epochs + 1):
    tuned_model.train()
    total_loss = 0
    for X, y in train_loader:
        optimizer.zero_grad()
        mask = X.ne(vocab["<PAD>"])  # True at real tokens, False at padding
        loss = tuned_model(X, tags=y, mask=mask)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch}/{n_epochs}, Train Loss: {total_loss:.4f}")

    # Validation F1 for this epoch drives the learning-rate schedule.
    f1 = evaluate(tuned_model, val_loader)
    scheduler.step(f1)
print(f"Training completed in {time.time() - start_time:.2f} seconds")


Epoch 1/5, Train Loss: 3537.0331
              precision    recall  f1-score   support

        DATE     0.8848    0.8918    0.8883      1128
         LOC     0.8506    0.7972    0.8230      4591
        TIME     0.7923    0.8408    0.8158       245

   micro avg     0.8547    0.8169    0.8354      5964
   macro avg     0.8426    0.8433    0.8424      5964
weighted avg     0.8546    0.8169    0.8351      5964

Epoch 2/5, Train Loss: 1363.0186
              precision    recall  f1-score   support

        DATE     0.9217    0.9184    0.9201      1128
         LOC     0.8789    0.8271    0.8522      4591
        TIME     0.8024    0.8122    0.8073       245

   micro avg     0.8840    0.8437    0.8634      5964
   macro avg     0.8677    0.8526    0.8599      5964
weighted avg     0.8839    0.8437    0.8632      5964

Epoch 3/5, Train Loss: 1006.4389
              precision    recall  f1-score   support

        DATE     0.9517    0.9078    0.9292      1128
         LOC     0.8684    0.8

In [17]:
# -----------------------
# 1. Load test set
# -----------------------
test_sentences, test_labels = load_conll("ner_20test.conll")
test_dataset = NERDataset(test_sentences, test_labels, vocab, ner_tag_to_ix)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=32)

# -----------------------
# 2. Evaluation with seqeval
# -----------------------
from seqeval.metrics import (
    accuracy_score,
    classification_report as seq_classification_report,
    f1_score,
)

# seqeval expects one tag list per sentence, so sequences are appended
# individually instead of being flattened into a single token stream.
tuned_model.eval()
all_true, all_pred = [], []
with torch.no_grad():
    for X, y in test_loader:
        mask = (X != vocab["<PAD>"])
        preds = tuned_model(X, mask=mask)
        lengths = mask.sum(dim=1).tolist()
        for pred_seq, gold_row, length in zip(preds, y.tolist(), lengths):
            all_pred.append([id2tag[p] for p in pred_seq[:length]])
            all_true.append([id2tag[t] for t in gold_row[:length]])

print("Seqeval NER Classification Report on Test Set:")
print(seq_classification_report(all_true, all_pred, digits=4))
print("F1-score:", f1_score(all_true, all_pred))
print("Accuracy:", accuracy_score(all_true, all_pred))


Seqeval NER Classification Report on Test Set:
              precision    recall  f1-score   support

        DATE     0.9265    0.9226    0.9245      2829
         LOC     0.8867    0.8538    0.8699     11374
        TIME     0.8451    0.8424    0.8438       641

   micro avg     0.8926    0.8664    0.8793     14844
   macro avg     0.8861    0.8729    0.8794     14844
weighted avg     0.8925    0.8664    0.8792     14844

F1-score: 0.8793244906331191
Accuracy: 0.9902110615572537


In [19]:

# -----------------------
# 11. Evaluation
# -----------------------
# Rebind `classification_report` to sklearn's token-level version for this cell.
from sklearn.metrics import classification_report
tuned_model.eval()
all_true, all_pred = [], []
with torch.no_grad():
    for X, y in test_loader:  # test split
        mask = (X != vocab["<PAD>"])
        preds = tuned_model(X, mask=mask)
        for i, pred_seq in enumerate(preds):
            # Number of real (non-padding) tokens in sentence i.
            length = mask[i].sum().item()
            all_pred.extend(id2tag[p] for p in pred_seq[:length])
            all_true.extend(id2tag[t] for t in y[i][:length].tolist())

print("NER Classification Report:")
print(classification_report(all_true, all_pred, digits=4, zero_division=0))

NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.9488    0.9457    0.9473      2744
       B-LOC     0.9212    0.8885    0.9046     10967
      B-TIME     0.9407    0.9244    0.9325       635
      I-DATE     0.9424    0.9688    0.9555      4361
       I-LOC     0.8549    0.8335    0.8441      8158
      I-TIME     0.8948    0.9567    0.9247       809
           O     0.9947    0.9956    0.9951    500371

    accuracy                         0.9902    528045
   macro avg     0.9282    0.9305    0.9291    528045
weighted avg     0.9901    0.9902    0.9902    528045



In [2]:
# Print the installed PyTorch version string.
import torch
print(torch.__version__)

2.8.0+cpu
