In [3]:
# ner_pipeline.py (with optional BiLSTM+CRF on top of BERT)

import os
# For precise CUDA errors
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
# Hugging Face tokenizers parallelism
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import logging
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report
from transformers import BertTokenizerFast, BertModel, get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# Optional CRF (Conditional Random Field)
try:
    from torchcrf import CRF
    crf_available = True
except ImportError:
    crf_available = False
    logging.warning("CRF disabled: install torchcrf for CRF integration")

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# ─── CONFIG ─────────────────────────────────────────────────────────────────────

# Hyperparameters for the model and training
DATA_PATH        = 'ner_dataset.csv'              # Path to the NER dataset
PRETRAINED_MODEL = 'bert-base-uncased'            # BERT uncased model
MAX_LEN          = 128                             # Maximum token length for each sentence
BATCH_SIZE       = 32                              # Number of samples per batch
NUM_WORKERS      = 4                               # Number of CPU cores used for loading data
EPOCHS           = 3                               # Number of epochs for training
LEARNING_RATE    = 3e-5                            # Learning rate for optimizer
WARMUP_RATIO     = 0.1                             # Ratio of steps for learning rate warmup
GRAD_ACCUM       = 2                               # Gradient accumulation steps
SAVE_PATH        = 'ner_model4_new.pth'           # Path to save the trained model
IGNORE_INDEX     = -100                            # Label to ignore during training

# Toggle this to add BiLSTM before CRF/classifier
USE_BILSTM = True                                   # Whether to use BiLSTM before CRF
LSTM_HIDDEN = 256                                  # Hidden size for the BiLSTM

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {device}")

# ─── DATA UTILITIES ──────────────────────────────────────────────────────────────

def load_data(path):
    """
    Loads the dataset and processes it by filling missing values, 
    ensuring that all words are strings, and grouping by sentence.
    """
    df = pd.read_csv(path, encoding='latin-1', engine='python')
    
    # Fill NaN values in the 'Sentence #' column by forward filling the sentence number
    df['Sentence #'] = df['Sentence #'].fillna(method='ffill')

    # Fill NaN values in the 'Word' column with a placeholder (e.g., 'UNKNOWN')
    df['Word'] = df['Word'].fillna('UNKNOWN')

    # Ensure all values in 'Word' column are strings
    df['Word'] = df['Word'].astype(str)
    df['Word'] = df['Word'].str.lower()  # Lowercasing words
    
    df = df[['Sentence #', 'Word', 'Tag']].dropna()  # Remove rows with missing values
    sentences, tags = [], []

    # Group by sentence and append to sentences and tags lists
    for _, grp in df.groupby('Sentence #'):
        sentences.append(grp['Word'].astype(str).tolist())
        tags.append(grp['Tag'].tolist())
    
    return sentences, tags  # Return tokenized sentences and tags

def build_maps(tags):
    """
    Builds mappings between tags and their corresponding indices
    """
    unique = sorted({t for doc in tags for t in doc})  # Get unique tags
    t2i = {t: i for i, t in enumerate(unique)}  # Map tags to indices
    i2t = {i: t for t, i in t2i.items()}  # Reverse map of indices to tags
    return t2i, i2t  # Return both mappings

def prepare_dataset(sentences, tags, tokenizer, tag2idx):
    """
    Prepares the dataset by encoding sentences using the tokenizer 
    and converting tags to numerical values.
    """
    enc = tokenizer(
        sentences,
        is_split_into_words=True,
        padding='max_length',  # Pad sequences to max length
        truncation=True,       # Truncate sequences longer than max length
        max_length=MAX_LEN,    # Max length of tokenized sentences
        return_tensors='pt'    # Return PyTorch tensors
    )
    
    all_labels = []
    for i, seq in enumerate(tags):
        word_ids = enc.word_ids(batch_index=i)  # Get word ids for this batch
        label_ids = np.full(len(word_ids), IGNORE_INDEX, dtype=int)  # Initialize label array
        
        for j, wid in enumerate(word_ids):  # Assign tags to the words
            if wid is not None:
                label_ids[j] = tag2idx[seq[wid]]
        
        all_labels.append(torch.tensor(label_ids, dtype=torch.long))  # Convert labels to tensor
    
    return enc, all_labels  # Return the tokenized input and label tensors

def compute_class_weights(flat_labels, num_labels):
    """
    Compute class weights to handle label imbalance in the dataset.
    """
    counts = np.bincount(flat_labels[flat_labels >= 0], minlength=num_labels)  # Count occurrences of each label
    weights = 1.0 / (counts + 1e-6)  # Inverse frequency (with a small epsilon to avoid division by zero)
    weights = weights / weights.sum() * num_labels  # Normalize the weights
    return torch.tensor(weights, dtype=torch.float)  # Return as tensor for use in training

class NERDataset(Dataset):
    """
    Custom dataset class to handle tokenized input and label tensors.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings  # Store tokenized input data
        self.labels = labels       # Store corresponding labels

    def __len__(self):
        return len(self.labels)  # Return the number of samples

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}  # Get the item for this index
        item['labels'] = self.labels[idx]  # Add the corresponding label
        return item

# ─── MODEL ───────────────────────────────────────────────────────────────────────

class BertBiLSTMCRF(torch.nn.Module):
    """
    BERT + BiLSTM + CRF model architecture for NER task
    """
    def __init__(self, model_name, num_labels, lstm_hidden):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)  # Load BERT model
        self.dropout1 = torch.nn.Dropout(0.1)  # Dropout layer after BERT
        self.lstm = torch.nn.LSTM(
            input_size=self.bert.config.hidden_size,  # BERT hidden size
            hidden_size=lstm_hidden,                  # BiLSTM hidden size
            num_layers=1,                             # Single-layer LSTM
            batch_first=True,                         # Batch-first LSTM
            bidirectional=True                        # Bi-directional LSTM
        )
        self.dropout2 = torch.nn.Dropout(0.1)  # Dropout layer after LSTM
        self.classifier = torch.nn.Linear(2 * lstm_hidden, num_labels)  # Final classifier
        if not crf_available:
            raise ImportError("CRF requested but torchcrf not installed")
        self.crf = CRF(num_labels, batch_first=True)  # CRF layer for sequence prediction

    def forward(self, input_ids, attention_mask, labels=None):
        # BERT encoding
        out = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
        out = self.dropout1(out)  # Apply dropout after BERT
        
        # BiLSTM encoding
        out, _ = self.lstm(out)
        out = self.dropout2(out)  # Apply dropout after LSTM
        logits = self.classifier(out)  # Classify the output from LSTM
        
        mask = attention_mask.bool()  # Mask out padding tokens
        if labels is not None:
            lbl = labels.clone()
            lbl[lbl < 0] = 0  # Ignore the labels with index -100
            return -self.crf(logits, lbl, mask=mask, reduction='mean')  # CRF loss
        return self.crf.decode(logits, mask=mask)  # CRF decoding for predictions

class BertCRFOnly(torch.nn.Module):
    """
    BERT + CRF model architecture for NER task (without BiLSTM)
    """
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)  # Load BERT model
        self.dropout = torch.nn.Dropout(0.1)  # Dropout layer after BERT
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)  # Classifier
        if not crf_available:
            raise ImportError("CRF requested but torchcrf not installed")
        self.crf = CRF(num_labels, batch_first=True)  # CRF layer for sequence prediction

    def forward(self, input_ids, attention_mask, labels=None):
        out = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state  # BERT encoding
        logits = self.classifier(self.dropout(out))  # Apply dropout and classify
        mask = attention_mask.bool()  # Mask out padding tokens
        if labels is not None:
            lbl = labels.clone()
            lbl[lbl < 0] = 0  # Ignore the labels with index -100
            return -self.crf(logits, lbl, mask=mask, reduction='mean')  # CRF loss
        return self.crf.decode(logits, mask=mask)  # CRF decoding for predictions

# ─── TRAIN / EVAL ────────────────────────────────────────────────────────────────

def train_epoch(model, loader, optimizer, scheduler, scaler, epoch):
    """
    Train the model for one epoch
    """
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm(loader, desc=f"Train {epoch}")):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.cuda.amp.autocast():
            if crf_available:
                loss = model(batch['input_ids'], batch['attention_mask'], labels=batch['labels'])
            else:
                loss = model(**batch).loss
            loss = loss / GRAD_ACCUM  # Apply gradient accumulation
        scaler.scale(loss).backward()  # Backpropagate the loss
        if (step + 1) % GRAD_ACCUM == 0:
            scaler.step(optimizer)  # Update optimizer
            scaler.update()
            scheduler.step()  # Update learning rate scheduler
            optimizer.zero_grad()  # Zero gradients after each step
        total_loss += loss.item() * GRAD_ACCUM

    avg = total_loss / len(loader)  # Calculate average loss
    print(f"[Epoch {epoch}] Train Loss: {avg:.4f}")
    return avg

def eval_epoch(model, loader, idx2tag, epoch, split="Val"):
    """
    Evaluate the model after each epoch
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            if crf_available:
                _ = model(batch['input_ids'], batch['attention_mask'], labels=batch['labels'])
                preds = model(batch['input_ids'], batch['attention_mask'])
            else:
                out = model(**batch)
                preds = torch.argmax(out.logits, dim=-1).cpu().tolist()  # Get predictions
            labels = batch['labels'].cpu().tolist()  # Get true labels
            for p_seq, l_seq in zip(preds, labels):
                valid = [i for i, t in enumerate(l_seq) if t != IGNORE_INDEX]
                all_labels.append([idx2tag[l_seq[i]] for i in valid])  # Append true labels
                all_preds.append([idx2tag[p_seq[i]] for i in valid])   # Append predicted labels

    report = classification_report(all_labels, all_preds)  # Generate classification report
    print(f"[Epoch {epoch}] {split} Report:\n{report}")
    return report

# ─── MAIN ───────────────────────────────────────────────────────────────────────

if __name__ == '__main__':
    # Load data and tokenize
    sentences, tags = load_data(DATA_PATH)
    tag2idx, idx2tag = build_maps(tags)
    tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)
    encodings, labels_all = prepare_dataset(sentences, tags, tokenizer, tag2idx)

    # 60/20/20 split
    idxs = list(range(len(labels_all)))
    train_val, test_idx = train_test_split(idxs, test_size=0.2, random_state=42)
    train_idx, val_idx  = train_test_split(train_val, test_size=0.25, random_state=42)

    enc_train = {k: v[train_idx] for k, v in encodings.items()}
    enc_val   = {k: v[val_idx] for k, v in encodings.items()}
    enc_test  = {k: v[test_idx] for k, v in encodings.items()}

    labels_tr = [labels_all[i] for i in train_idx]
    labels_vl = [labels_all[i] for i in val_idx]
    labels_ts = [labels_all[i] for i in test_idx]

    # Compute class weights to handle class imbalance
    flat = torch.cat([l.view(-1) for l in labels_tr]).numpy()
    weights = compute_class_weights(flat, len(tag2idx)).to(device)

    # Select model
    if USE_BILSTM:
        model = BertBiLSTMCRF(PRETRAINED_MODEL, len(tag2idx), LSTM_HIDDEN).to(device)
    else:
        model = BertCRFOnly(PRETRAINED_MODEL, len(tag2idx)).to(device)

    # Optimizer, scheduler, and scaler setup
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = (len(labels_tr) // BATCH_SIZE + 1) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(WARMUP_RATIO * total_steps),
        num_training_steps=total_steps
    )
    scaler = torch.cuda.amp.GradScaler()

    # Dataloaders for training, validation, and testing
    train_loader = DataLoader(NERDataset(enc_train, labels_tr), batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
    val_loader   = DataLoader(NERDataset(enc_val, labels_vl), batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
    test_loader  = DataLoader(NERDataset(enc_test, labels_ts), batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)

    # Training and validation loop
    for epoch in range(1, EPOCHS + 1):
        train_epoch(model, train_loader, optimizer, scheduler, scaler, epoch)
        eval_epoch(model, val_loader, idx2tag, epoch, split="Val")

    # Final test evaluation
    print("=== Final Test Evaluation ===")
    eval_epoch(model, test_loader, idx2tag, epoch="final", split="Test")

    # Save the trained model
    torch.save(model.state_dict(), SAVE_PATH)
    print(f"Saved model to {SAVE_PATH}")


2025-04-25 17:31:38,536 INFO: Using device: cuda
  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Train 1: 100%|██████████| 900/900 [07:37<00:00,  1.97it/s]


[Epoch 1] Train Loss: 20.8549


  _warn_prf(average, modifier, msg_start, len(result))


[Epoch 1] Val Report:
              precision    recall  f1-score   support

         art       0.00      0.00      0.00       147
         eve       0.00      0.00      0.00        62
         geo       0.73      0.85      0.79     11376
         gpe       0.87      0.90      0.88      3346
         nat       0.00      0.00      0.00        50
         org       0.60      0.57      0.58      6486
         per       0.71      0.76      0.73      5226
         tim       0.58      0.69      0.63      4368

   micro avg       0.69      0.75      0.72     31061
   macro avg       0.44      0.47      0.45     31061
weighted avg       0.69      0.75      0.72     31061



Train 2: 100%|██████████| 900/900 [07:33<00:00,  1.99it/s]

[Epoch 2] Train Loss: 4.5335





[Epoch 2] Val Report:
              precision    recall  f1-score   support

         art       0.00      0.00      0.00       147
         eve       0.00      0.00      0.00        62
         geo       0.83      0.89      0.85     11376
         gpe       0.91      0.93      0.92      3346
         nat       0.00      0.00      0.00        50
         org       0.67      0.62      0.65      6486
         per       0.77      0.78      0.78      5226
         tim       0.81      0.81      0.81      4368

   micro avg       0.79      0.80      0.80     31061
   macro avg       0.50      0.50      0.50     31061
weighted avg       0.78      0.80      0.79     31061



Train 3: 100%|██████████| 900/900 [07:29<00:00,  2.00it/s]

[Epoch 3] Train Loss: 3.3455





[Epoch 3] Val Report:
              precision    recall  f1-score   support

         art       0.75      0.06      0.11       147
         eve       0.00      0.00      0.00        62
         geo       0.84      0.88      0.86     11376
         gpe       0.90      0.93      0.92      3346
         nat       0.00      0.00      0.00        50
         org       0.68      0.66      0.67      6486
         per       0.76      0.80      0.78      5226
         tim       0.86      0.83      0.84      4368

   micro avg       0.80      0.81      0.81     31061
   macro avg       0.60      0.52      0.52     31061
weighted avg       0.80      0.81      0.80     31061

=== Final Test Evaluation ===
[Epoch final] Test Report:
              precision    recall  f1-score   support

         art       0.00      0.00      0.00       149
         eve       0.00      0.00      0.00        78
         geo       0.84      0.89      0.86     11596
         gpe       0.92      0.93      0.93      3412

In [4]:
sentences, tags       = load_data(DATA_PATH)  # Load data and split into sentences and tags
tag2idx, idx2tag      = build_maps(tags)  # Build mapping from tags to indices
tokenizer             = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)  # Load BERT tokenizer
encodings, labels_all = prepare_dataset(sentences, tags, tokenizer, tag2idx)  # Prepare the tokenized dataset

# 60/20/20 split (train/val/test)
indices        = list(range(len(labels_all)))  # List of indices for the data
train_val_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)  # Split into train and test
train_idx, val_idx      = train_test_split(train_val_idx, test_size=0.25, random_state=42)  # Split train into train and validation

  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


In [5]:
tags

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'I-geo',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-geo',
  'O',
  'O',
  'B-per',
  'I-per',
  'O',
  'B-tim',
  'O',
  'B-geo',
  'O',
  'B-gpe',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-per',
  'I-per',
  'O',
  'O',
  'O',
  'O',


In [6]:
tag2idx

{'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'I-art': 8,
 'I-eve': 9,
 'I-geo': 10,
 'I-gpe': 11,
 'I-nat': 12,
 'I-org': 13,
 'I-per': 14,
 'I-tim': 15,
 'O': 16}

In [7]:
idx2tag 

{0: 'B-art',
 1: 'B-eve',
 2: 'B-geo',
 3: 'B-gpe',
 4: 'B-nat',
 5: 'B-org',
 6: 'B-per',
 7: 'B-tim',
 8: 'I-art',
 9: 'I-eve',
 10: 'I-geo',
 11: 'I-gpe',
 12: 'I-nat',
 13: 'I-org',
 14: 'I-per',
 15: 'I-tim',
 16: 'O'}

In [8]:
labels_all

[tensor([-100,   16,   16,   16,   16,   16,   16,    2,   16,   16,   16,   16,
           16,    2,   16,   16,   16,   16,   16,    3,   16,   16,   16,   16,
           16, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]),
 tensor([-100,    3,   16,   16,   16,   16,   16,   16,   16,   16,   16,   16,
           16,   16,   16,   16,    7,   16,   16,

In [9]:
encodings

{'input_ids': tensor([[ 101, 5190, 1997,  ...,    0,    0,    0],
        [ 101, 7726, 4584,  ...,    0,    0,    0],
        [ 101, 7739, 4409,  ...,    0,    0,    0],
        ...,
        [ 101, 2206, 4238,  ...,    0,    0,    0],
        [ 101, 2144, 2059,  ...,    0,    0,    0],
        [ 101, 1996, 2142,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}