In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from TorchCRF import CRF
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report
import time
import json

In [3]:
# -----------------------
# 1. Load CoNLL file
# -----------------------
def load_conll(file_path):
    """Read a tab-separated CoNLL-style file into parallel token/tag lists.

    Every non-blank line must consist of exactly three tab-separated fields;
    the first field is used as the token and the third as the NER tag (the
    middle field is ignored). Lines with any other number of fields are
    skipped. A line that is empty after stripping whitespace closes the
    sentence currently being collected.

    Args:
        file_path: Path to the UTF-8 encoded input file.

    Returns:
        A pair ``(sentences, labels)`` of equally long lists, where
        ``sentences[i]`` is the list of tokens of sentence ``i`` and
        ``labels[i]`` the list of its tags.
    """
    all_tokens, all_tags = [], []
    cur_tokens, cur_tags = [], []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                fields = stripped.split("\t")
                # Only well-formed "token<TAB>x<TAB>tag" rows are kept.
                if len(fields) == 3:
                    cur_tokens.append(fields[0])
                    cur_tags.append(fields[2])
                continue

            # Blank line: close the sentence in progress (if there is one).
            if cur_tokens:
                all_tokens.append(cur_tokens)
                all_tags.append(cur_tags)
                cur_tokens, cur_tags = [], []

    # The file may end without a trailing blank line.
    if cur_tokens:
        all_tokens.append(cur_tokens)
        all_tags.append(cur_tags)

    return all_tokens, all_tags

In [4]:
# -----------------------
# 2. Load FastText embeddings
# -----------------------
ft_model = KeyedVectors.load("fasttext_gensim.model")  # pretrained FastText

# -----------------------
# 3. Build vocabs
# -----------------------
# Three lookup tables are built from the training split only:
#   - word vocab: word      -> index (embedding lookup)
#   - char vocab: character -> index (CharCNN embeddings)
#   - tag vocab:  NER tag   -> index (CRF)
# Index 0 is reserved for "<PAD>" in all three; words and characters also
# reserve index 1 for "<UNK>".
train_sents, train_labels = load_conll("ner_80train.conll")
test_sents, test_labels = load_conll("ner_20test.conll")

# Word and char vocabularies, indexed in order of first appearance.
vocab = {"<PAD>": 0, "<UNK>": 1}
char_vocab = {"<PAD>": 0, "<UNK>": 1}
for sentence in train_sents:
    for token in sentence:
        vocab.setdefault(token, len(vocab))
        for ch in token:
            char_vocab.setdefault(ch, len(char_vocab))

# Tag vocabulary plus its inverse for turning predictions back into strings.
ner_tag_to_ix = {"<PAD>": 0}
for tag_sequence in train_labels:
    for tag in tag_sequence:
        ner_tag_to_ix.setdefault(tag, len(ner_tag_to_ix))
id2tag = {ix: tag for tag, ix in ner_tag_to_ix.items()}

In [None]:
# -----------------------
# 4. Dataset and collate_fn
# -----------------------
# Module-level character width per word. NERDataset's `max_word_len`
# argument has the same default; it truncates each word's character indices
# to that length and zero-pads shorter words up to it.
max_word_len = 10  # max chars per word

class NERDataset(Dataset):
    """Sentence-level NER dataset yielding word, character and tag indices.

    Each item corresponds to one sentence and is returned as three
    ``torch.long`` tensors:

        * word indices, one per token (unknown words map to ``"<UNK>"``),
        * character indices, one row of ``max_word_len`` entries per token
          (unknown characters map to ``"<UNK>"``; rows are truncated or
          right-padded with 0),
        * tag indices, one per token.

    Sentences are NOT padded to a common length here; that is left to the
    collate function used by the DataLoader.

    Args:
        sentences: List of token lists.
        labels: List of tag lists, parallel to ``sentences``.
        vocab: Mapping word -> index; must contain ``"<UNK>"``.
        char_vocab: Mapping character -> index; must contain ``"<UNK>"``.
        ner_tag_to_ix: Mapping tag -> index; a tag missing from it raises
            ``KeyError`` when the item is fetched.
        max_word_len: Number of character slots kept per word.
    """

    def __init__(self, sentences, labels, vocab, char_vocab, ner_tag_to_ix, max_word_len=10):
        self.sentences = sentences
        self.labels = labels
        self.vocab = vocab
        self.char_vocab = char_vocab
        self.ner_tag_to_ix = ner_tag_to_ix
        self.max_word_len = max_word_len

    def __len__(self):
        return len(self.sentences)

    def _encode_chars(self, word):
        """Return exactly ``max_word_len`` character indices for ``word``."""
        ids = [self.char_vocab.get(ch, self.char_vocab["<UNK>"])
               for ch in word[:self.max_word_len]]
        # Right-pad short words with 0 so every row has the same width.
        ids.extend([0] * (self.max_word_len - len(ids)))
        return ids

    def __getitem__(self, idx):
        tokens = self.sentences[idx]
        tags = self.labels[idx]

        word_ids = [self.vocab.get(tok, self.vocab["<UNK>"]) for tok in tokens]
        char_ids = [self._encode_chars(tok) for tok in tokens]
        tag_ids = [self.ner_tag_to_ix[tag] for tag in tags]

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(char_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )

def collate_fn(batch):
    """Pad a list of per-sentence tensors into rectangular batch tensors.

    Every sentence is right-padded with 0 up to the length of the longest
    sentence in the batch. The per-word character width is taken from the
    character tensors themselves, so the function does not depend on any
    module-level setting and works for whatever ``max_word_len`` the dataset
    was built with.

    Args:
        batch: Sequence of ``(word_idx, char_idx, tag_idx)`` triples where
            ``word_idx`` and ``tag_idx`` have shape ``(seq_len,)`` and
            ``char_idx`` has shape ``(seq_len, word_len)``. ``word_len`` must
            be the same for every item in the batch.

    Returns:
        Tuple ``(batch_words, batch_chars, batch_tags)`` with shapes
        ``(batch_size, max_seq_len)``,
        ``(batch_size, max_seq_len, word_len)`` and
        ``(batch_size, max_seq_len)``; dtypes follow the inputs.
    """
    words, chars, tags = zip(*batch)
    pad = nn.utils.rnn.pad_sequence

    # pad_sequence pads along the first (sequence) dimension only and keeps
    # trailing dimensions, which is exactly what the 2-D char tensors need.
    batch_words = pad(list(words), batch_first=True, padding_value=0)
    batch_chars = pad(list(chars), batch_first=True, padding_value=0)
    batch_tags = pad(list(tags), batch_first=True, padding_value=0)

    return batch_words, batch_chars, batch_tags

# -----------------------
# 5. CharCNN + BiLSTM + CRF Model
# -----------------------
class BiLSTM_CRF_CharCNN(nn.Module):
    """BiLSTM-CRF tagger whose token features are word embeddings + CharCNN.

    Pipeline:
        1. Word embeddings initialised from pretrained vectors, then dropout.
        2. Character embeddings -> Conv1d (kernel 3, padding 1) -> ReLU ->
           max-pool over the character axis -> dropout.
        3. Concatenate word and character features per token.
        4. Single-layer bidirectional LSTM, then dropout.
        5. Linear projection to per-tag emission scores.
        6. CRF layer: loss when gold tags are given, decoding otherwise.

    NOTE(review): the CRF is used as ``CRF(n, batch_first=True)``,
    ``crf(emissions, tags, mask=..., reduction='mean')`` and
    ``crf.decode(...)``. Confirm that the package the notebook's ``CRF``
    import resolves to exposes this call shape.

    Args:
        vocab_size: Number of rows of the word embedding table.
        embedding_dim: Word embedding size; must match the pretrained vectors.
        hidden_dim: Size of the concatenated BiLSTM output; each direction
            uses ``hidden_dim // 2`` units, so it should be even.
        tagset_size: Number of tags scored by the linear layer and the CRF.
        ft_model: Pretrained embedding source. Either an object exposing the
            vectors under ``.wv`` or the keyed vectors themselves; it must
            support ``word in vectors`` and ``vectors[word]``.
        char_vocab_size: Number of rows of the character embedding table.
        char_embedding_dim: Character embedding size.
        char_out_channels: Number of CharCNN output channels.
        max_word_len: Stored on the module; ``forward`` reads the character
            width from its input instead.
        dropout: Dropout probability used after the word embeddings, the
            CharCNN and the BiLSTM.
        word_vocab: Mapping word -> row index used to fill the embedding
            table. When omitted, the module-level ``vocab`` is used, which
            keeps existing call sites working.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size,
                 ft_model, char_vocab_size, char_embedding_dim=30,
                 char_out_channels=30, max_word_len=10, dropout=0.1,
                 word_vocab=None):
        super().__init__()

        # Prefer an explicitly passed vocabulary over hidden notebook state.
        if word_vocab is None:
            word_vocab = vocab
        # Accept both a full model (vectors under `.wv`) and bare keyed vectors.
        vectors = getattr(ft_model, "wv", ft_model)

        # Word embedding: pretrained vector where available, random otherwise.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        emb_weights = np.zeros((vocab_size, embedding_dim))
        for w, idx in word_vocab.items():
            if w in vectors:
                emb_weights[idx] = vectors[w]
            else:
                emb_weights[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
        with torch.no_grad():
            self.embedding.weight.copy_(torch.tensor(emb_weights, dtype=torch.float32))

        self.word_dropout = nn.Dropout(dropout)

        # Char embedding + CNN. Index 0 is the padding character (its
        # embedding is kept at zero); kernel_size=3 with padding=1 keeps the
        # convolution output as long as the input.
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim, padding_idx=0)
        self.char_cnn = nn.Conv1d(char_embedding_dim, char_out_channels, kernel_size=3, padding=1)
        self.char_dropout = nn.Dropout(dropout)

        # BiLSTM over the concatenated word + char features.
        self.bilstm = nn.LSTM(embedding_dim + char_out_channels,
                              hidden_dim // 2,
                              num_layers=1,
                              bidirectional=True,
                              batch_first=True)
        self.lstm_dropout = nn.Dropout(dropout)

        # Linear + CRF
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)
        self.max_word_len = max_word_len

    def forward(self, words, chars, tags=None, mask=None):
        """Compute the CRF loss (training) or decode tag sequences (inference).

        Args:
            words: Word indices of shape ``[B, L]``.
            chars: Character indices of shape ``[B, L, W]``.
            tags: Optional gold tag indices, passed to the CRF layer.
            mask: Optional mask, passed to the CRF layer unchanged.

        Returns:
            If ``tags`` is given: the negated value returned by the CRF layer
            with ``reduction='mean'`` (the CRF yields a log-likelihood, while
            the optimiser minimises, hence the sign flip).
            Otherwise: whatever ``self.crf.decode(emissions, mask=mask)``
            returns.
        """
        # Word embeddings
        word_embeds = self.word_dropout(self.embedding(words))  # [B, L, E]

        # Char embeddings + CNN: fold batch and sentence dims together so all
        # words are processed by the convolution in one call.
        B, L, W = chars.size()
        chars_flat = chars.reshape(B * L, W)  # reshape also accepts non-contiguous input
        char_embeds = self.char_embedding(chars_flat)  # [B*L, W, C]
        char_embeds = char_embeds.transpose(1, 2)      # [B*L, C, W] as Conv1d expects
        char_cnn_out = F.relu(self.char_cnn(char_embeds))  # [B*L, char_out_channels, W]
        char_cnn_out, _ = torch.max(char_cnn_out, dim=2)   # max over characters
        char_cnn_out = self.char_dropout(char_cnn_out)
        char_cnn_out = char_cnn_out.view(B, L, -1)         # [B, L, char_out_channels]

        # Combine
        combined = torch.cat([word_embeds, char_cnn_out], dim=2)

        # BiLSTM + dropout
        lstm_out, _ = self.bilstm(combined)
        lstm_out = self.lstm_dropout(lstm_out)

        # Linear
        emissions = self.hidden2tag(lstm_out)

        if tags is not None:
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:
            return self.crf.decode(emissions, mask=mask)

In [None]:
# -----------------------
# 6. Hyperparameters
# -----------------------
# Seed NumPy and PyTorch before anything stochastic happens: the model draws
# random vectors with np.random for words without a pretrained embedding,
# PyTorch initialises the remaining parameters, and the training DataLoader
# shuffles. Without a seed no two runs are comparable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

embedding_dim = ft_model.vector_size
hidden_dim = 128
char_embedding_dim = 30
char_out_channels = 30
batch_size = 32
n_epochs = 5
lr = 0.001
max_word_len = 10

# -----------------------
# 7. DataLoader
# -----------------------
train_dataset = NERDataset(train_sents, train_labels, vocab, char_vocab, ner_tag_to_ix, max_word_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# The test loader keeps the default (unshuffled) order.
test_dataset = NERDataset(test_sents, test_labels, vocab, char_vocab, ner_tag_to_ix, max_word_len)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# -----------------------
# 8. Initialize model
# -----------------------
model = BiLSTM_CRF_CharCNN(len(vocab), embedding_dim, hidden_dim, len(ner_tag_to_ix),
                           ft_model, len(char_vocab), char_embedding_dim, char_out_channels,
                           max_word_len, dropout=0.2)
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)  # AdamW with weight decay

# -----------------------
# 9. Training
# -----------------------
start_time = time.time()
for epoch in range(1, n_epochs+1):
    model.train()
    total_loss = 0
    for Xw, Xc, y in train_loader:
        # Word index 0 is "<PAD>", so this is True exactly on real tokens.
        mask = (Xw != 0)
        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss = model(Xw, Xc, tags=y, mask=mask)
        loss.backward()
        # Clip the global gradient norm to keep LSTM updates bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        total_loss += loss.item()
    # The reported value is the SUM of the per-batch losses for the epoch.
    print(f"Epoch {epoch}/{n_epochs}, Loss: {total_loss:.4f}")
print(f"Training completed in {time.time()-start_time:.2f} seconds")

Epoch 1/5, Loss: 3937.7961
Epoch 2/5, Loss: 1666.0485
Epoch 3/5, Loss: 1316.8349
Epoch 4/5, Loss: 1120.6721
Epoch 5/5, Loss: 992.3842
Training completed in 3356.47 seconds


In [None]:
# -----------------------
# 10. Evaluation on test set
# -----------------------
# Token-level evaluation: predicted and gold tags of every non-padded token
# in the test set are flattened into two parallel lists of tag strings.
model.eval()
all_true, all_pred = [], []
with torch.no_grad():
    for Xw, Xc, y in test_loader:
        mask = Xw.ne(0)
        preds = model(Xw, Xc, mask=mask)
        # Number of real (non-padding) tokens in each sentence of the batch.
        lengths = mask.sum(dim=1).tolist()
        for pred_seq, gold_row, length in zip(preds, y, lengths):
            all_pred.extend(id2tag[p] for p in pred_seq[:length])
            all_true.extend(id2tag[t] for t in gold_row[:length].tolist())

print("NER Classification Report:")
print(classification_report(all_true, all_pred, digits=4, zero_division=0))

NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.9675    0.9319    0.9493      2744
       B-LOC     0.9155    0.8880    0.9016     10967
      B-TIME     0.9356    0.9386    0.9371       635
      I-DATE     0.9679    0.9415    0.9546      4361
       I-LOC     0.8332    0.8392    0.8362      8158
      I-TIME     0.9301    0.9543    0.9420       809
           O     0.9945    0.9954    0.9950    500371

    accuracy                         0.9899    528045
   macro avg     0.9349    0.9270    0.9308    528045
weighted avg     0.9898    0.9899    0.9899    528045



In [7]:
# -----------------------
# 11. Save model + vocabs
# -----------------------
# The weights go into a state-dict file; the three lookup tables are written
# as UTF-8 JSON (non-ASCII characters kept as-is) next to it.
torch.save(model.state_dict(), "bilstm_crf_charcnn_fasttext.pth")
for json_path, mapping in (
    ("word_vocab.json", vocab),
    ("char_vocab.json", char_vocab),
    ("tag2id.json", ner_tag_to_ix),
):
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)
print("Model and vocabs saved successfully.")

Model and vocabs saved successfully.
