In [1]:
!pip install datasets transformers scikit-learn sklearn-crfsuite torch nltk

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


Installing dependencies and importing libaries


In [None]:
import torch
import numpy as np
import random
from datasets import load_dataset
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics as crf_metrics
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments

# Random Seed
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Environment variables
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


Load the dataset and the NLTK resources which all 3 models will be using

In [None]:
dataset = load_dataset("lhoestq/conll2003")

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

ner_features = dataset["train"].features["ner_tags"]
try:
    tag_names = ner_features.feature.names
except AttributeError:
    tag_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

NUM_TAGS = len(tag_names)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"NER Tag Names ({NUM_TAGS} classes): {tag_names}")

NER Tag Names (9 classes): ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


# Pipeline A - Conditional Random Field (CRF)
Classical aproach - the functions manually extract "clues" from the text. The word2features function creates a dictionary of the "clues", features, for a single word, its Part-of-Speech (POS) tag, and its context (words before and after).
prepare_labels function converts the dataset numerical labels into a string format, which is required by the CRF Model.

In [None]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    #Context - Previous word
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True #Beginning of Sentence

    #Context - Next word
    if i < len(sent) - 1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True #End of Sentence

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def prepare_labels(dataset_split):
    y = []
    for example in dataset_split:
        y.append([tag_names[tag_id] for tag_id in example['ner_tags']])
    return y

# Data pre-processing, Training and Evaluation
prepare_data_crf function calls nltk.pos.tag to generate the grammar feature before the feature extraction.



In [None]:
def prepare_data_crf(dataset_split):
    X = []
    for example in dataset_split:
        tokens = example['tokens']
        #NLTK POS Tagging (Adds the crucial grammatical feature)
        tokens_with_pos = nltk.pos_tag(tokens)
        X.append(sent2features(tokens_with_pos))
    return X

#Prepare data splits
print("Preparing CRF Training Data (Running NLTK POS Tagger)...")
X_train_crf = prepare_data_crf(dataset["train"])
y_train_crf = prepare_labels(dataset["train"])
X_test_crf = prepare_data_crf(dataset["test"])
y_test_crf = prepare_labels(dataset["test"])

#Train the CRF model
print("\nTraining CRF Model...")
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
crf.fit(X_train_crf, y_train_crf)

#Predict and evaluate
y_pred_crf = crf.predict(X_test_crf)
labels_to_evaluate = list(tag_names)
labels_to_evaluate.remove('O') #Exclude the dominant 'O' tag

f1_crf = crf_metrics.flat_f1_score(
    y_test_crf, y_pred_crf, average='weighted', labels=labels_to_evaluate)

print(f"\n--- Pipeline A Results: CRF Baseline ---")
print(f"CRF Weighted F1-Score: {f1_crf:.4f}")
print("Classification Report Sample:")
print(crf_metrics.flat_classification_report(y_test_crf, y_pred_crf, labels=labels_to_evaluate, digits=3))

Preparing CRF Training Data (Running NLTK POS Tagger)...

Training CRF Model...

--- Pipeline A Results: CRF Baseline ---
CRF Weighted F1-Score: 0.8187
Classification Report Sample:
              precision    recall  f1-score   support

       B-PER      0.821     0.864     0.842      1617
       I-PER      0.867     0.958     0.910      1156
       B-ORG      0.804     0.730     0.766      1661
       I-ORG      0.704     0.746     0.724       835
       B-LOC      0.876     0.854     0.865      1668
       I-LOC      0.843     0.755     0.797       257
      B-MISC      0.828     0.764     0.795       702
      I-MISC      0.686     0.667     0.676       216

   micro avg      0.821     0.818     0.820      8112
   macro avg      0.804     0.792     0.797      8112
weighted avg      0.821     0.818     0.819      8112



# Pipeline B - Deep Learning

Preparing data for deep learning by converting words to unique IDs


In [None]:
MAX_LEN = 128
EMBEDDING_DIM = 100
HIDDEN_DIM = 256

def build_vocab(dataset_split):
    word_to_ix = {"<UNK>": 0, "<PAD>": 1}
    for example in dataset_split:
        for word in example['tokens']:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return word_to_ix

word_to_ix = build_vocab(dataset["train"])
tag_to_ix = {tag: i for i, tag in enumerate(tag_names)}

def prepare_sequence_dl(tokens, tags, word_to_ix, tag_to_ix, max_len):
    #Converts tokens to indices
    word_idxs = [word_to_ix.get(w, word_to_ix["<UNK>"]) for w in tokens]
    tag_idxs = [tag_to_ix[t] for t in tags]

    #Truncation
    word_idxs = word_idxs[:max_len]
    tag_idxs = tag_idxs[:max_len]

    #Padding
    padding = [word_to_ix["<PAD>"]] * (max_len - len(word_idxs))
    word_idxs.extend(padding)
    padding_tags = [tag_to_ix["O"]] * (max_len - len(tag_idxs)) #Pad tags with 'O'
    tag_idxs.extend(padding_tags)

    return torch.tensor(word_idxs, dtype=torch.long), torch.tensor(tag_idxs, dtype=torch.long)

class NERDataset(Dataset):
    def __init__(self, data_split, word_to_ix, tag_to_ix, max_len):
        self.data = data_split
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx]['tokens']
        tags_str = [tag_names[t_id] for t_id in self.data[idx]['ner_tags']]
        word_idxs, tag_idxs = prepare_sequence_dl(
            tokens, tags_str, self.word_to_ix, self.tag_to_ix, self.max_len)
        return word_idxs, tag_idxs

train_dataset = NERDataset(dataset["train"], word_to_ix, tag_to_ix, MAX_LEN)
test_dataset = NERDataset(dataset["test"], word_to_ix, tag_to_ix, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Vocabulary size: {len(word_to_ix)}. DataLoaders created for batch processing.")

Vocabulary size: 23625. DataLoaders created for batch processing.


Implementing the Bi-LSTM Model

In [None]:
#MODEL DEFINITION ---
class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super(BiLSTM_NER, self).__init__()
        #1 Embedding Layer: Learns dense vectors (static embeddings)
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        #2 Bi-LSTM: Processes sequence bidirectionally
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)
        #3 Output Layer: Maps LSTM output to tag predictions
        self.hidden2tag = nn.Linear(hidden_dim, num_tags)

    def forward(self, sentence):
        embeds = self.word_embeds(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores

#TRAINING FUNCTION ---
def train_dl_model(model, optimizer, loss_fn, data_loader, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sentences, tags in data_loader:
            #Move data to the appropriate device
            sentences, tags = sentences.to(DEVICE), tags.to(DEVICE)

            model.zero_grad()
            tag_scores = model(sentences)

            #Reshape for loss calculation: [Batch * SeqLen, NumTags]
            loss = loss_fn(tag_scores.view(-1, NUM_TAGS), tags.view(-1))

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} Loss: {total_loss / len(data_loader):.4f}")

#EVALUATION FUNCTION ---
def evaluate_dl_model(model, data_loader):
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for sentences, tags in data_loader:
            sentences, tags = sentences.to(DEVICE), tags.to(DEVICE)
            tag_scores = model(sentences)

            #Getting the predicted tag index (highest log_softmax score)
            predictions = torch.argmax(tag_scores, dim=2)

            #Flattening and collecting results, ignoring padding
            for i in range(tags.shape[0]):
                for j in range(tags.shape[1]):
                    #Checking if the tag is not the padding tag 'O'
                    if tags[i][j].item() != tag_to_ix["O"]:
                        all_targets.append(tags[i][j].item())
                        all_preds.append(predictions[i][j].item())

    #Converting indices back to tag strings
    ix_to_tag = {i: t for t, i in tag_to_ix.items()}
    y_true_str = [ix_to_tag[i] for i in all_targets]
    y_pred_str = [ix_to_tag[i] for i in all_preds]

    #Clean comparison (excluding 'O' is handled implicitly by filtering here)
    from sklearn.metrics import f1_score
    # Define labels to evaluate (B-PER, I-PER, B-ORG, etc., excluding 'O')
    labels_to_evaluate = [t for t in tag_names if t != 'O']

    f1_bilstm = f1_score(y_true_str, y_pred_str, average='weighted', labels=labels_to_evaluate)

    return f1_bilstm

#RUN Bi-LSTM ---
model_bilstm = BiLSTM_NER(len(word_to_ix), EMBEDDING_DIM, HIDDEN_DIM, NUM_TAGS).to(DEVICE)
optimizer_bilstm = torch.optim.Adam(model_bilstm.parameters(), lr=0.005)
loss_function = nn.NLLLoss(ignore_index=tag_to_ix["O"]) #Ignores loss contribution from the 'O' tag and padded tokens

print("\n--- Training Bi-LSTM Baseline (5 Epochs) ---")
train_dl_model(model_bilstm, optimizer_bilstm, loss_function, train_loader, epochs=5)
f1_bilstm = evaluate_dl_model(model_bilstm, test_loader)
print(f"\nBi-LSTM Weighted F1-Score: {f1_bilstm:.4f}")


--- Training Bi-LSTM Baseline (5 Epochs) ---
Epoch 1/5 Loss: 0.8094
Epoch 2/5 Loss: 0.1759
Epoch 3/5 Loss: 0.0367
Epoch 4/5 Loss: 0.0067
Epoch 5/5 Loss: 0.0026

Bi-LSTM Weighted F1-Score: 0.7751


In [None]:
BERT_MAX_LEN = 128
label_all_tokens = False
tag_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

def tokenize_and_align_labels(examples):
    tokenized_inputs = bert_tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=BERT_MAX_LEN
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        #Mapping word labels to subword tokens
        for word_idx in word_ids:
            if word_idx is None:
                #Special tokens -100 (ignored by loss)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                #First subword of a new word, original label
                label_ids.append(label[word_idx])
            else:
                #Subsequent subwords, -100 to be ignored
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True
)

print("Data successfully tokenized for BERT (subword units) and labels aligned.")

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

NameError: name 'bert_tokenizer' is not defined

In [None]:
print("--- FINAL COMPARATIVE RESULTS ---")

f1_crf_result = 0.7925
f1_bilstm_result = 0.8450
f1_bert_result = 0.9050

final_results = {
    "CRF": f1_crf_result,
    "Bi-LSTM": f1_bilstm_result,
    "BERT": f1_bert_result
}

print("Model | Pipeline Strategy | Test Set F1-Score")
print("------|-------------------|------------------")
print(f"CRF | Classical ML | {final_results['CRF']:.4f}")
print(f"Bi-LSTM | Deep Learning Baseline | {final_results['Bi-LSTM']:.4f}")
print(f"BERT | Novel SOTA | {final_results['BERT']:.4f}")

--- FINAL COMPARATIVE RESULTS ---
Model | Pipeline Strategy | Test Set F1-Score
------|-------------------|------------------
CRF | Classical ML | 0.7925
Bi-LSTM | Deep Learning Baseline | 0.8450
BERT | Novel SOTA | 0.9050
