In [1]:
!pip install spacy sklearn-crfsuite seqeval datasets transformers --quiet

!pip install torch torchcrf --quiet


In [2]:
import re
import time
import random
import spacy
import sklearn_crfsuite
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score
from transformers import pipeline
from spacy import displacy
import pandas as pd


In [3]:
custom_sentences = [
    "Berlin hosted the Data Science Summit on 12/09/2024.",
    "Bayern Munich signed Manuel Neuer in Munich.",
    "TCS opened a new office in Pune.",
    "Mercedes-Benz unveiled an EV in Stuttgart.",
    "Rohit Sharma scored a century in Mumbai.",
    "Google acquired DeepMind in London.",
    "Elon Musk visited Bengaluru to discuss Tesla plans.",
    "The Olympics will be held in Paris in 2024.",
    "Apple announced the iPhone 15 in California.",
    "Amazon opened a new warehouse in Hyderabad.",
    "Cristiano Ronaldo joined Al Nassr in Riyadh.",
    "Meta plans to build a data center in Singapore.",
    "Neeraj Chopra won gold in Tokyo Olympics.",
    "Infosys partnered with Microsoft in Seattle.",
    "Wimbledon final was played in London.",
    "Virat Kohli will captain the team in Delhi.",
    "NASA launched Artemis mission from Florida.",
    "The G20 Summit was hosted in New Delhi.",
    "Manchester United signed Casemiro from Real Madrid.",
    "ISRO launched Chandrayaan-3 from Sriharikota.",
    "Novak Djokovic won the Australian Open in Melbourne.",
    "Tesla opened its new Gigafactory in Texas.",
    "Barcelona defeated Real Madrid in El Clasico.",
    "Microsoft launched Copilot AI in New York.",
    "Serena Williams retired after the US Open in New York.",
    "Twitter rebranded to X Corp in San Francisco.",
    "Liverpool signed Darwin Nunez from Benfica.",
    "India won the Cricket World Cup in Ahmedabad.",
    "OpenAI released GPT-5 in San Francisco.",
    "Chelsea defeated Arsenal at Stamford Bridge in London.",
    "Zomato acquired Blinkit in Gurugram.",
    "The FIFA World Cup final was held in Doha, Qatar.",
    "Sony launched PlayStation 6 in Tokyo.",
    "Pakistan beat India in Lahore.",
    "SpaceX launched Starship from Boca Chica, Texas.",
    "Google opened a research lab in Zurich.",
    "Amazon acquired MGM Studios in Los Angeles.",
    "England won the Ashes in Sydney.",
    "Byju's opened a new office in Dubai.",
    "Ferrari unveiled a new model in Maranello, Italy.",
    "Hyundai launched a new EV in Seoul.",
    "Kylian Mbappe scored a hat-trick in Paris.",
    "Intel opened a chip factory in Ohio.",
    "Portugal won the Nations League in Lisbon.",
    "Boeing unveiled a new aircraft in Seattle.",
    "Nike opened a flagship store in Shanghai.",
    "Uber launched flying taxis in Los Angeles.",
    "Djokovic won Wimbledon in London.",
    "Bharat Biotech announced a new vaccine in Hyderabad.",
    "Germany won the Euro Cup in Berlin."
]

gold_labels = [
    [("Berlin", "LOC"), ("Data Science Summit", "ORG"), ("12/09/2024", "DATE")],
    [("Bayern Munich", "ORG"), ("Manuel Neuer", "PER"), ("Munich", "LOC")],
    [("TCS", "ORG"), ("Pune", "LOC")],
    [("Mercedes-Benz", "ORG"), ("Stuttgart", "LOC")],
    [("Rohit Sharma", "PER"), ("Mumbai", "LOC")],
    [("Google", "ORG"), ("DeepMind", "ORG"), ("London", "LOC")],
    [("Elon Musk", "PER"), ("Bengaluru", "LOC"), ("Tesla", "ORG")],
    [("Olympics", "EVENT"), ("Paris", "LOC"), ("2024", "DATE")],
    [("Apple", "ORG"), ("iPhone 15", "PRODUCT"), ("California", "LOC")],
    [("Amazon", "ORG"), ("Hyderabad", "LOC")],
    [("Cristiano Ronaldo", "PER"), ("Al Nassr", "ORG"), ("Riyadh", "LOC")],
    [("Meta", "ORG"), ("Singapore", "LOC")],
    [("Neeraj Chopra", "PER"), ("Tokyo Olympics", "EVENT")],
    [("Infosys", "ORG"), ("Microsoft", "ORG"), ("Seattle", "LOC")],
    [("Wimbledon", "EVENT"), ("London", "LOC")],
    [("Virat Kohli", "PER"), ("Delhi", "LOC")],
    [("NASA", "ORG"), ("Artemis", "MISSION"), ("Florida", "LOC")],
    [("G20 Summit", "EVENT"), ("New Delhi", "LOC")],
    [("Manchester United", "ORG"), ("Casemiro", "PER"), ("Real Madrid", "ORG")],
    [("ISRO", "ORG"), ("Chandrayaan-3", "MISSION"), ("Sriharikota", "LOC")],
    [("Novak Djokovic", "PER"), ("Australian Open", "EVENT"), ("Melbourne", "LOC")],
    [("Tesla", "ORG"), ("Gigafactory", "FACILITY"), ("Texas", "LOC")],
    [("Barcelona", "ORG"), ("Real Madrid", "ORG"), ("El Clasico", "EVENT")],
    [("Microsoft", "ORG"), ("Copilot AI", "PRODUCT"), ("New York", "LOC")],
    [("Serena Williams", "PER"), ("US Open", "EVENT"), ("New York", "LOC")],
    [("Twitter", "ORG"), ("X Corp", "ORG"), ("San Francisco", "LOC")],
    [("Liverpool", "ORG"), ("Darwin Nunez", "PER"), ("Benfica", "ORG")],
    [("India", "LOC"), ("Cricket World Cup", "EVENT"), ("Ahmedabad", "LOC")],
    [("OpenAI", "ORG"), ("GPT-5", "PRODUCT"), ("San Francisco", "LOC")],
    [("Chelsea", "ORG"), ("Arsenal", "ORG"), ("Stamford Bridge", "FACILITY"), ("London", "LOC")],
    [("Zomato", "ORG"), ("Blinkit", "ORG"), ("Gurugram", "LOC")],
    [("FIFA World Cup", "EVENT"), ("Doha", "LOC"), ("Qatar", "LOC")],
    [("Sony", "ORG"), ("PlayStation 6", "PRODUCT"), ("Tokyo", "LOC")],
    [("Pakistan", "LOC"), ("India", "LOC"), ("Lahore", "LOC")],
    [("SpaceX", "ORG"), ("Starship", "PRODUCT"), ("Boca Chica", "LOC"), ("Texas", "LOC")],
    [("Google", "ORG"), ("Zurich", "LOC")],
    [("Amazon", "ORG"), ("MGM Studios", "ORG"), ("Los Angeles", "LOC")],
    [("England", "LOC"), ("Ashes", "EVENT"), ("Sydney", "LOC")],
    [("Byju's", "ORG"), ("Dubai", "LOC")],
    [("Ferrari", "ORG"), ("Maranello", "LOC"), ("Italy", "LOC")],
    [("Hyundai", "ORG"), ("Seoul", "LOC")],
    [("Kylian Mbappe", "PER"), ("Paris", "LOC")],
    [("Intel", "ORG"), ("Ohio", "LOC")],
    [("Portugal", "LOC"), ("Nations League", "EVENT"), ("Lisbon", "LOC")],
    [("Boeing", "ORG"), ("Seattle", "LOC")],
    [("Nike", "ORG"), ("Shanghai", "LOC")],
    [("Uber", "ORG"), ("Los Angeles", "LOC")],
    [("Novak Djokovic", "PER"), ("Wimbledon", "EVENT"), ("London", "LOC")],
    [("Bharat Biotech", "ORG"), ("Hyderabad", "LOC")],
    [("Germany", "LOC"), ("Euro Cup", "EVENT"), ("Berlin", "LOC")]
]


In [4]:
# -------------------------------
# 2️⃣ RULE-BASED NER
# -------------------------------
cities = ["Berlin", "Munich", "Pune", "Stuttgart", "Mumbai", "London", "Bengaluru", "Paris", "California",
          "Hyderabad", "Riyadh", "Singapore", "Tokyo", "Seattle", "Delhi", "Florida", "New Delhi", "Madrid", "Sriharikota",
          "Melbourne", "Texas", "New York", "San Francisco", "Ahmedabad", "Gurugram", "Doha", "Qatar", "Lahore",
          "Boca Chica", "Zurich", "Los Angeles", "Sydney", "Dubai", "Maranello", "Italy", "Seoul", "Ohio", "Lisbon", "Shanghai"]

teams = ["Bayern Munich", "Manchester United", "Real Madrid", "Al Nassr", "India", "Barcelona", "Liverpool", "Pakistan", "Chelsea", "Arsenal"]

def rule_based_ner(sentence):
    entities = []
    for match in re.finditer(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b", sentence):
        entities.append((match.group(), "DATE"))
    for city in cities:
        if city in sentence:
            entities.append((city, "LOC"))
    for team in teams:
        if team in sentence:
            entities.append((team, "ORG"))
    for match in re.finditer(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b", sentence):
        token = match.group()
        if token not in [c for c, _ in entities]:
            entities.append((token, "MISC"))
    return entities

print("Rule-based NER example:")
for s in custom_sentences[:3]:
    print(s, "->", rule_based_ner(s))


Rule-based NER example:
Berlin hosted the Data Science Summit on 12/09/2024. -> [('12/09/2024', 'DATE'), ('Berlin', 'LOC'), ('Data Science Summit', 'MISC')]
Bayern Munich signed Manuel Neuer in Munich. -> [('Munich', 'LOC'), ('Bayern Munich', 'ORG'), ('Manuel Neuer', 'MISC')]
TCS opened a new office in Pune. -> [('Pune', 'LOC')]


In [5]:
# -------------------------------
# 3️⃣ spaCy NER
# -------------------------------
nlp = spacy.load("en_core_web_sm")

print("\nspaCy NER example:")
doc = nlp(custom_sentences[0])
for ent in doc.ents:
    print(ent.text, ent.label_)

displacy.render(doc, style="ent", jupyter=True)



spaCy NER example:
Berlin GPE
the Data Science Summit ORG
12/09/2024 DATE


In [6]:
from collections import defaultdict
def train_hmm(sentences, tags):
    transition_counts = defaultdict(lambda: defaultdict(int))
    emission_counts = defaultdict(lambda: defaultdict(int))
    tag_counts = defaultdict(int)

    for words, tgs in zip(sentences, tags):
        prev_tag = "<START>"
        for w, tg in zip(words, tgs):
            transition_counts[prev_tag][tg] += 1
            emission_counts[tg][w] += 1
            tag_counts[tg] += 1
            prev_tag = tg
        transition_counts[prev_tag]["<END>"] += 1

    # Convert to probabilities
    transition_probs = {t: {t2: c/sum(d.values()) for t2, c in d.items()} for t, d in transition_counts.items()}
    emission_probs = {t: {w: c/sum(d.values()) for w, c in d.items()} for t, d in emission_counts.items()}

    return transition_probs, emission_probs, list(tag_counts.keys())

def viterbi(words, transition_probs, emission_probs, tags):
    V = [{}]
    path = {}

    for t in tags:
        V[0][t] = np.log(transition_probs.get("<START>", {}).get(t, 1e-6)) + np.log(emission_probs.get(t, {}).get(words[0], 1e-6))
        path[t] = [t]

    for i in range(1, len(words)):
        V.append({})
        new_path = {}
        for t in tags:
            (prob, state) = max(
                (V[i-1][pt] + np.log(transition_probs.get(pt, {}).get(t, 1e-6)) + np.log(emission_probs.get(t, {}).get(words[i], 1e-6)), pt)
                for pt in tags
            )
            V[i][t] = prob
            new_path[t] = path[state] + [t]
        path = new_path

    (prob, state) = max((V[len(words)-1][t] + np.log(transition_probs.get(t, {}).get("<END>", 1e-6)), t) for t in tags)
    return path[state]


In [7]:
from google.colab import files
uploaded_files=files.upload()

In [8]:
!pip install datasets seqeval sklearn-crfsuite spacy==3.7.2 --quiet
!python -m spacy download en_core_web_sm



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
import json
from datasets import DatasetDict, Dataset

# Function to load JSON Lines
def load_jsonl(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f]

# Load dataset files
train_data = load_jsonl("/content/train.json")
test_data = load_jsonl("/content/test.json")
valid_data = load_jsonl("/content/valid.json")

# Load label file
with open("/content/label.json") as f:
    labels = json.load(f)

# Convert to Hugging Face Dataset format
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data),
    "validation": Dataset.from_list(valid_data)
})

# Extract small subset for speed
train_tokens = [item["tokens"] for item in dataset["train"]][:100]
train_tags = [item["tags"] for item in dataset["train"]][:100]
test_tokens = [item["tokens"] for item in dataset["test"]][:50]
test_tags = [item["tags"] for item in dataset["test"]][:50]

# Determine label mapping format
if isinstance(labels, list):
    # Case 1: List of labels
    id2label = {i: label for i, label in enumerate(labels)}
elif all(k.isdigit() for k in labels.keys()):
    # Case 2: Dict with string numbers as keys
    id2label = {int(k): v for k, v in labels.items()}
elif all(isinstance(v, int) for v in labels.values()):
    # Case 3: Dict with label names as keys
    id2label = {v: k for k, v in labels.items()}
else:
    raise ValueError("Unrecognized label.json format")

# Map tag IDs to label strings
train_tags = [[id2label[tag] for tag in seq] for seq in train_tags]
test_tags = [[id2label[tag] for tag in seq] for seq in test_tags]

print(train_tokens[:2])
print(train_tags[:2])

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']]
[['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'], ['B-PER', 'I-PER']]


In [10]:
#CRF
import sklearn_crfsuite
from seqeval.metrics import classification_report

# Feature extractor for one token
def word2features(sent, i):
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'prefix2': word[:2],
        'prefix3': word[:3],
        'suffix2': word[-2:],
        'suffix3': word[-3:]
    }
    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent)-1:
        word1 = sent[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# Convert sentences to feature dicts
X_train = [[word2features(s, i) for i in range(len(s))] for s in train_tokens]
y_train = train_tags
X_test = [[word2features(s, i) for i in range(len(s))] for s in test_tokens]
y_test = test_tags

# Train CRF
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 reg
    c2=0.1,  # L2 reg
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# Predict and evaluate
y_pred = crf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         LOC       0.67      0.28      0.39        43
        MISC       1.00      0.44      0.62        18
         ORG       0.00      0.00      0.00         2
         PER       0.88      0.72      0.79        83

   micro avg       0.77      0.55      0.64       146
   macro avg       0.64      0.36      0.45       146
weighted avg       0.82      0.55      0.64       146



In [11]:
#HMM:
import numpy as np
from collections import defaultdict

def train_hmm(sentences, tags):
    transition_counts = defaultdict(lambda: defaultdict(int))
    emission_counts = defaultdict(lambda: defaultdict(int))
    tag_counts = defaultdict(int)

    for words, tgs in zip(sentences, tags):
        prev_tag = "<START>"
        for w, tg in zip(words, tgs):
            transition_counts[prev_tag][tg] += 1
            emission_counts[tg][w] += 1
            tag_counts[tg] += 1
            prev_tag = tg
        transition_counts[prev_tag]["<END>"] += 1

    transition_probs = {t: {t2: c/sum(d.values()) for t2, c in d.items()}
                        for t, d in transition_counts.items()}
    emission_probs = {t: {w: c/sum(d.values()) for w, c in d.items()}
                      for t, d in emission_counts.items()}

    return transition_probs, emission_probs, list(tag_counts.keys())

def viterbi(words, transition_probs, emission_probs, tags):
    V = [{}]
    path = {}
    for t in tags:
        V[0][t] = np.log(transition_probs.get("<START>", {}).get(t, 1e-6)) + \
                  np.log(emission_probs.get(t, {}).get(words[0], 1e-6))
        path[t] = [t]

    for i in range(1, len(words)):
        V.append({})
        new_path = {}
        for t in tags:
            (prob, state) = max(
                (V[i-1][pt] + np.log(transition_probs.get(pt, {}).get(t, 1e-6)) +
                 np.log(emission_probs.get(t, {}).get(words[i], 1e-6)), pt)
                for pt in tags
            )
            V[i][t] = prob
            new_path[t] = path[state] + [t]
        path = new_path

    (prob, state) = max(
        (V[-1][t] + np.log(transition_probs.get(t, {}).get("<END>", 1e-6)), t)
        for t in tags
    )
    return path[state]

# Train and evaluate
transition_probs, emission_probs, tag_list = train_hmm(train_tokens, train_tags)
pred_tags_hmm = [viterbi(sent, transition_probs, emission_probs, tag_list) for sent in test_tokens]

print(classification_report(test_tags, pred_tags_hmm))


              precision    recall  f1-score   support

         LOC       1.00      0.23      0.38        43
        MISC       1.00      0.44      0.62        18
         ORG       0.00      0.00      0.00         2
         PER       0.00      0.00      0.00        83

   micro avg       1.00      0.12      0.22       146
   macro avg       0.50      0.17      0.25       146
weighted avg       0.42      0.12      0.19       146



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
!pip install transformers --quiet

from transformers import pipeline
import time

# Load pipeline
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

# Example sentence
sentence = "Berlin hosted the Data Science Summit on 12/09/2024."

# Time inference
start_time = time.time()
entities = ner_pipeline(sentence)
elapsed_time = time.time() - start_time

# Print results
for ent in entities:
    print(f"{ent['word']} -> {ent['entity_group']} (score: {ent['score']:.4f})")
print(f"Inference time: {elapsed_time:.4f} seconds")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

Berlin -> LOC (score: 0.9998)
Data Science Summit -> MISC (score: 0.9977)
Inference time: 0.3137 seconds


In [13]:
!pip install torch --quiet
!pip install git+https://github.com/kmkurn/pytorch-crf.git --quiet
!pip install gensim

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [16]:
import torch
import torch.nn as nn
from torchcrf import CRF
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from seqeval.metrics import classification_report
import numpy as np
import gensim.downloader as api

# ===== Device =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ===== Create vocab =====
word2idx = {"<PAD>": 0, "<UNK>": 1}
for sent in train_tokens:
    for w in sent:
        if w not in word2idx:
            word2idx[w] = len(word2idx)

tag2idx = {tag: i for i, tag in enumerate(sorted({t for seq in train_tags for t in seq}))}
idx2tag = {i: tag for tag, i in tag2idx.items()}

# ===== Load pretrained GloVe (100d) =====
print("Loading GloVe vectors...")
glove_vectors = api.load("glove-wiki-gigaword-100")
embedding_dim = glove_vectors.vector_size

embedding_matrix = np.zeros((len(word2idx), embedding_dim))
for word, idx in word2idx.items():
    if word in glove_vectors:
        embedding_matrix[idx] = glove_vectors[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

# ===== Dataset =====
class NERDataset(Dataset):
    def __init__(self, tokens, tags):
        self.tokens = tokens
        self.tags = tags
    def __len__(self):
        return len(self.tokens)
    def __getitem__(self, idx):
        words = [word2idx.get(w, 1) for w in self.tokens[idx]]
        labels = [tag2idx[t] for t in self.tags[idx]]
        return torch.tensor(words, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

# ===== Collate with padding =====
def collate_fn(batch):
    words, labels = zip(*batch)
    words_padded = pad_sequence(words, batch_first=True, padding_value=word2idx["<PAD>"])
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-1)  # keep -1 for padding
    return words_padded, labels_padded

# ===== DataLoader =====
train_dataset = NERDataset(train_tokens[:5000], train_tags[:5000])
test_dataset = NERDataset(test_tokens, test_tags)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn)

# ===== Model =====
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128, embedding_matrix=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx["<PAD>"])
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix, dtype=torch.float))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1,
                            bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        emb = self.embedding(x)
        lstm_out, _ = self.lstm(emb)
        emissions = self.hidden2tag(lstm_out)
        if tags is not None:
            return -self.crf(emissions, tags, mask=mask, reduction='mean')
        else:
            return self.crf.decode(emissions, mask=mask)

# ===== Initialize model =====
model = BiLSTM_CRF(len(word2idx), len(tag2idx), embedding_dim=embedding_dim,
                   embedding_matrix=embedding_matrix).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ===== Training =====
for epoch in range(30):  # train longer for better convergence
    model.train()
    total_loss = 0
    for words, labels in train_loader:
        words, labels = words.to(device), labels.to(device)
        mask = labels != -1  # True where token is not padding
        optimizer.zero_grad()
        loss = model(words, labels, mask=mask)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Evaluate on test set each epoch
    model.eval()
    pred_tags, true_tags = [], []
    with torch.no_grad():
        for words, labels in test_loader:
            words, labels = words.to(device), labels.to(device)
            mask = labels != -1
            preds = model(words, mask=mask)
            pred_tags.append([idx2tag[i] for i in preds[0]])
            true_tags.append([idx2tag[i.item()] for i in labels[0] if i.item() != -1])
    f1 = classification_report(true_tags, pred_tags, digits=4, output_dict=True)["macro avg"]["f1-score"]

    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f} | Test Macro F1: {f1:.4f}")

# ===== Final evaluation =====
print("\nFinal Classification Report:")
print(classification_report(true_tags, pred_tags))


Using device: cpu
Loading GloVe vectors...
Epoch 1 Loss: 31.9060 | Test Macro F1: 0.0275
Epoch 2 Loss: 24.2866 | Test Macro F1: 0.0204
Epoch 3 Loss: 14.8689 | Test Macro F1: 0.0205
Epoch 4 Loss: 11.4342 | Test Macro F1: 0.0167
Epoch 5 Loss: 12.7844 | Test Macro F1: 0.0231
Epoch 6 Loss: 11.1784 | Test Macro F1: 0.0294
Epoch 7 Loss: 9.5986 | Test Macro F1: 0.0208
Epoch 8 Loss: 9.3545 | Test Macro F1: 0.0205
Epoch 9 Loss: 8.7823 | Test Macro F1: 0.0217
Epoch 10 Loss: 8.2198 | Test Macro F1: 0.0221
Epoch 11 Loss: 9.6734 | Test Macro F1: 0.0145
Epoch 12 Loss: 8.6714 | Test Macro F1: 0.0192
Epoch 13 Loss: 7.3518 | Test Macro F1: 0.0281
Epoch 14 Loss: 7.6544 | Test Macro F1: 0.0500
Epoch 15 Loss: 6.8247 | Test Macro F1: 0.0522
Epoch 16 Loss: 6.7938 | Test Macro F1: 0.0530
Epoch 17 Loss: 6.1416 | Test Macro F1: 0.0463
Epoch 18 Loss: 6.0993 | Test Macro F1: 0.0433
Epoch 19 Loss: 5.4954 | Test Macro F1: 0.0404
Epoch 20 Loss: 5.5408 | Test Macro F1: 0.0366
Epoch 21 Loss: 5.4866 | Test Macro F1: 0

In [18]:
import time
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# =============================
# BIO Conversion Helper
# =============================
def to_bio_tags(tokens, entities):
    """
    Convert a list of entities into BIO tags for the given tokens.
    entities: list of (entity_text, entity_label) tuples
    """
    tags = ["O"] * len(tokens)
    for ent_text, ent_label in entities:
        ent_tokens = ent_text.split()
        for i in range(len(tokens)):
            if tokens[i:i+len(ent_tokens)] == ent_tokens:
                tags[i] = f"B-{ent_label}"
                for j in range(1, len(ent_tokens)):
                    tags[i+j] = f"I-{ent_label}"
    return tags

# =============================
# Evaluation Helper Functions
# =============================
def evaluate_method(name, y_true, y_pred, train_time=None, inference_time=None):
    print(f"\n=== {name} ===")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_true, y_pred):.4f}")
    print(f"F1-score:  {f1_score(y_true, y_pred):.4f}")
    if train_time is not None:
        print(f"Training time: {train_time:.4f} sec")
    if inference_time is not None:
        print(f"Inference time: {inference_time:.4f} sec")
    print("\nDetailed report:")
    print(classification_report(y_true, y_pred))

def get_misclassifications(y_true, y_pred, tokens, n=5):
    errors = []
    for i, (true_seq, pred_seq, tok_seq) in enumerate(zip(y_true, y_pred, tokens)):
        if true_seq != pred_seq:
            errors.append((tok_seq, true_seq, pred_seq))
        if len(errors) >= n:
            break
    return errors

# =============================
# 1️⃣ Rule-based Evaluation (BIO)
# =============================
start_time = time.time()
rule_preds = []
for sent in test_tokens:
    ents = rule_based_ner(" ".join(sent))  # returns [(text, label), ...]
    pred_tags = to_bio_tags(sent, ents)
    rule_preds.append(pred_tags)
rule_inference_time = time.time() - start_time

evaluate_method("Rule-based", test_tags, rule_preds, inference_time=rule_inference_time)
print("Sample Misclassifications (Rule-based):")
for tok, true, pred in get_misclassifications(test_tags, rule_preds, test_tokens):
    print("TOKENS:", tok)
    print("TRUE:  ", true)
    print("PRED:  ", pred)
    print()

# =============================
# 2️⃣ spaCy Evaluation (BIO)
# =============================
start_time = time.time()
spacy_preds = []
for sent in test_tokens:
    doc = nlp(" ".join(sent))
    ents = [(ent.text, ent.label_) for ent in doc.ents]
    pred_tags = to_bio_tags(sent, ents)
    spacy_preds.append(pred_tags)
spacy_inference_time = time.time() - start_time

evaluate_method("spaCy", test_tags, spacy_preds, inference_time=spacy_inference_time)
print("Sample Misclassifications (spaCy):")
for tok, true, pred in get_misclassifications(test_tags, spacy_preds, test_tokens):
    print("TOKENS:", tok)
    print("TRUE:  ", true)
    print("PRED:  ", pred)
    print()

# =============================
# 3️⃣ CRF Evaluation (unchanged)
# =============================
start_time = time.time()
crf_preds = crf.predict(X_test)
crf_inference_time = time.time() - start_time

evaluate_method("CRF", y_test, crf_preds, inference_time=crf_inference_time)
print("Sample Misclassifications (CRF):")
for tok, true, pred in get_misclassifications(y_test, crf_preds, test_tokens):
    print("TOKENS:", tok)
    print("TRUE:  ", true)
    print("PRED:  ", pred)
    print()

# =============================
# 4️⃣ HMM Evaluation (unchanged)
# =============================
start_time = time.time()
hmm_preds = [viterbi(sent, transition_probs, emission_probs, tag_list) for sent in test_tokens]
hmm_inference_time = time.time() - start_time

evaluate_method("HMM", test_tags, hmm_preds, inference_time=hmm_inference_time)
print("Sample Misclassifications (HMM):")
for tok, true, pred in get_misclassifications(test_tags, hmm_preds, test_tokens):
    print("TOKENS:", tok)
    print("TRUE:  ", true)
    print("PRED:  ", pred)
    print()

# =============================
# 5️⃣ Transformer (BERT) Evaluation (BIO)
# =============================
start_time = time.time()
bert_preds = []
for sent in test_tokens:
    ents_raw = ner_pipeline(" ".join(sent))  # list of dicts
    ents = [(ent["word"], ent["entity_group"]) for ent in ents_raw]
    pred_tags = to_bio_tags(sent, ents)
    bert_preds.append(pred_tags)
bert_inference_time = time.time() - start_time

evaluate_method("Transformer (BERT)", test_tags, bert_preds, inference_time=bert_inference_time)
print("Sample Misclassifications (BERT):")
for tok, true, pred in get_misclassifications(test_tags, bert_preds, test_tokens):
    print("TOKENS:", tok)
    print("TRUE:  ", true)
    print("PRED:  ", pred)
    print()

# =============================
# 6️⃣ BiLSTM-CRF Evaluation (unchanged)
# =============================
start_time = time.time()
pred_tags_bilstm_final = []
true_tags_bilstm_final = []
for words, labels in test_loader:
    preds = model(words)
    pred_tags_bilstm_final.append([list(tag2idx.keys())[i] for i in preds[0]])
    true_tags_bilstm_final.append([list(tag2idx.keys())[i.item()] for i in labels[0]])
bilstm_inference_time = time.time() - start_time

evaluate_method("BiLSTM-CRF", true_tags_bilstm_final, pred_tags_bilstm_final, inference_time=bilstm_inference_time)
print("Sample Misclassifications (BiLSTM-CRF):")
for tok, true, pred in get_misclassifications(true_tags_bilstm_final, pred_tags_bilstm_final, test_tokens):
    print("TOKENS:", tok)
    print("TRUE:  ", true)
    print("PRED:  ", pred)
    print()



=== Rule-based ===
Precision: 0.1074
Recall:    0.1096
F1-score:  0.1085
Inference time: 0.0024 sec

Detailed report:
              precision    recall  f1-score   support

         LOC       1.00      0.05      0.09        43
        MISC       0.10      0.78      0.17        18
         ORG       0.00      0.00      0.00         2
         PER       0.00      0.00      0.00        83

   micro avg       0.11      0.11      0.11       146
   macro avg       0.27      0.21      0.06       146
weighted avg       0.31      0.11      0.05       146

Sample Misclassifications (Rule-based):
TOKENS: ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
TRUE:   ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O']
PRED:   ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

TOKENS: ['Nadim', 'Ladki']
TRUE:   ['B-PER', 'I-PER']
PRED:   ['B-MISC', 'I-MISC']

TOKENS: ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06']
TRUE

In [19]:
# ================================================================
# Named Entity Recognition (NER) Method Comparison
# ================================================================

# 1. Rule-Based NER
# ------------------------------------------------
# Pros:
# - Extremely fast and requires no training data.
# - Easy to understand and maintain for small, fixed domains.
# - Works offline without heavy dependencies.
#
# Cons:
# - Poor adaptability to new patterns or unseen entities.
# - Requires constant manual updates to maintain accuracy.
# - Limited generalization; struggles with complex linguistic variations.

# 2. spaCy Pretrained Model
# ------------------------------------------------
# Pros:
# - Good out-of-the-box performance for general-purpose NER.
# - Fast inference speed with efficient implementation.
# - Easy integration into Python pipelines.
#
# Cons:
# - Requires retraining or fine-tuning for domain-specific terminology.
# - May not capture niche entities without additional training data.
# - Model size can still be relatively large for constrained environments.

# 3. BiLSTM-CRF (from scratch)
# ------------------------------------------------
# Pros:
# - Can be trained from scratch for any domain or language.
# - CRF layer improves sequence labeling consistency.
# - Highly customizable model architecture.
#
# Cons:
# - Training requires large, labeled datasets.
# - Slower training and inference compared to transformer-based models.
# - Performance heavily depends on quality and size of training data.

# 4. Transformer-Based Model (Hugging Face, e.g., BERT-based)
# ------------------------------------------------
# Pros:
# - State-of-the-art accuracy in NER tasks across domains.
# - Strong generalization due to large-scale pretraining.
# - Easily fine-tuned for domain-specific use cases.
#
# Cons:
# - Larger model size leads to higher memory and compute requirements.
# - Slower inference compared to lightweight models.
# - Requires GPU for efficient fine-tuning on large datasets.

# ================================================================
# Recommendation for Production Use:
# ------------------------------------------------
# If high accuracy and adaptability to various domains are priorities,
# a transformer-based model (like BERT fine-tuned for NER) is recommended.
# However, for resource-constrained environments or small-scale rule-bound tasks,
# spaCy or rule-based approaches may be more practical.
# ================================================================
