In [10]:
import pandas as pd

# Load the dataset
df = pd.read_csv('ner_dataset.csv', encoding='latin-1', engine='python')

# Check for missing values in any column
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)


Missing values per column:
Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64


In [11]:
df["Tag"].value_counts()

Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [12]:
df["Tag"].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [13]:
missing_values_rows = df[df.isnull().any(axis=1)]

In [15]:
missing_values_rows

Unnamed: 0,Sentence #,Word,POS,Tag
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [16]:
# Check for duplicate rows based on Sentence #, Word, and Tag columns
duplicates = df[df.duplicated(subset=['Sentence #', 'Word', 'Tag'])]
print("Duplicate rows:")
print(duplicates)


Duplicate rows:
        Sentence #       Word  POS Tag
15             NaN        the   DT   O
17             NaN         of   IN   O
25             NaN         of   IN   O
28             NaN         in   IN   O
29             NaN        the   DT   O
...            ...        ...  ...  ..
1048570        NaN       they  PRP   O
1048571        NaN  responded  VBD   O
1048572        NaN         to   TO   O
1048573        NaN        the   DT   O
1048574        NaN     attack   NN   O

[958659 rows x 4 columns]


In [17]:
# Check if sentences are grouped correctly by Sentence #
sentence_counts = df['Sentence #'].value_counts()
print("Sentence counts (how many words per sentence):")
print(sentence_counts)

# Look for sentences with missing words (i.e., empty rows for specific Sentence #)
missing_sentence_words = df[df['Word'].isnull()]['Sentence #']
print("Sentences with missing words:")
print(missing_sentence_words)


Sentence counts (how many words per sentence):
Sentence #
Sentence: 47959    1
Sentence: 1        1
Sentence: 2        1
Sentence: 3        1
Sentence: 4        1
                  ..
Sentence: 12       1
Sentence: 11       1
Sentence: 10       1
Sentence: 9        1
Sentence: 8        1
Name: count, Length: 47959, dtype: int64
Sentences with missing words:
197658      Sentence: 9047
256026     Sentence: 11709
257069     Sentence: 11759
571211     Sentence: 26129
613777     Sentence: 28049
747019     Sentence: 34152
901758     Sentence: 41181
903054     Sentence: 41244
944880     Sentence: 43177
1003438    Sentence: 45887
Name: Sentence #, dtype: object


In [None]:
# ner_pipeline.py (Updated: train/val/test split, no TensorBoard)

import os
# Force synchronous CUDA calls for precise error reporting
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Ensures errors in CUDA are reported properly and synchronously
# Hugging Face tokenizers parallelism
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Disable parallelism in tokenization to prevent potential issues

import logging
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report  # For evaluating the NER model performance
from transformers import BertTokenizerFast, BertModel, get_linear_schedule_with_warmup  # BERT model utilities
from tqdm.auto import tqdm  # For progress bars in training loops

# Optional CRF (Conditional Random Field) support
try:
    from torchcrf import CRF  # Optional CRF layer for better sequence labeling
    crf_available = True
except ImportError:
    crf_available = False
    logging.warning("CRF disabled: install torchcrf for CRF integration")

# Logging configuration
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)  # Setting up a logger for the script

# Hyperparameters for training
DATA_PATH        = 'ner_dataset.csv'  # Path to the NER dataset
PRETRAINED_MODEL = 'bert-base-cased'  # Pretrained BERT model (uncased means lowercased)
MAX_LEN          = 128  # Maximum sequence length for input to BERT
BATCH_SIZE       = 32   # Batch size during training
NUM_WORKERS      = 4    # Number of workers for data loading
EPOCHS           = 3    # Number of training epochs
LEARNING_RATE    = 3e-5 # Learning rate for optimization
PATIENCE         = 2    # Early stopping patience
WARMUP_RATIO     = 0.1  # Ratio of warmup steps for learning rate scheduler
GRAD_ACCUM       = 2    # Gradient accumulation steps
SAVE_PATH        = 'ner_model.pth'  # Path to save the trained model
IGNORE_INDEX     = -100  # Used to ignore certain tokens during loss computation

# Device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Automatically uses GPU if available
logger.info(f"Device set to: {device}")

# --- Data utilities ---

def load_data(path):
    # Load the dataset from CSV file
    df = pd.read_csv(path, encoding='latin-1', engine='python')  
    # Fill NaN values in the 'Sentence #' column by forward filling the sentence number
    df['Sentence #'] = df['Sentence #'].fillna(method='ffill')

    # Fill NaN values in the 'Word' column with a placeholder (e.g., 'UNKNOWN')
    df['Word'] = df['Word'].fillna('UNKNOWN')

    # Ensure all values in 'Word' column are strings
    df['Word'] = df['Word'].astype(str)
    
    
    # Drop any rows with missing 'Tag' values (for completeness)
    df = df[['Sentence #', 'Word', 'Tag']].dropna()
    sentences, tags = [], []
    # Group the dataframe by 'Sentence #' and extract words and their respective tags
    for _, grp in df.groupby('Sentence #'):
        sentences.append(grp['Word'].astype(str).tolist())  # List of words in the sentence
        tags.append(grp['Tag'].tolist())  # Corresponding tags for the sentence
    return sentences, tags  # Return the sentences and their respective tags

def build_maps(tags):
    # Build mappings from tag to index and index to tag
    unique = sorted({t for doc in tags for t in doc})  # Get all unique tags from the dataset
    t2i = {t: i for i, t in enumerate(unique)}  # Map tags to indices
    i2t = {i: t for t, i in t2i.items()}  # Reverse mapping from indices to tags
    return t2i, i2t  # Return both mappings

def prepare_dataset(sentences, tags, tokenizer, tag2idx):
    # Tokenize the sentences and align the labels (tags) with the tokens
    enc = tokenizer(
        sentences,
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='pt'
    )
    all_labels = []
    # Loop through each sentence's tags
    for i, seq in enumerate(tags):
        word_ids = enc.word_ids(batch_index=i)  # Get word-level ids for tokens in the sentence
        label_ids = np.full(len(word_ids), IGNORE_INDEX, dtype=int)  # Initialize label ids with the ignore index
        # Map the tag labels to the token ids
        for j, wid in enumerate(word_ids):
            if wid is not None:
                label_ids[j] = tag2idx[seq[wid]]  # Assign tag ids from the tag2idx mapping
        all_labels.append(torch.tensor(label_ids, dtype=torch.long))  # Add to label list
    return enc, all_labels  # Return the tokenized sentences and their corresponding labels

def compute_class_weights(flat_labels, num_labels):
    # Compute class weights to address label imbalance
    counts = np.bincount(flat_labels[flat_labels >= 0], minlength=num_labels)  # Count occurrences of each label
    weights = 1.0 / (counts + 1e-6)  # Inverse frequency (with a small epsilon to avoid division by zero)
    weights = weights / weights.sum() * num_labels  # Normalize the weights
    return torch.tensor(weights, dtype=torch.float)  # Return as tensor for use in training

def early_stopping(val_losses, patience):
    # Implement early stopping to avoid overfitting
    if len(val_losses) <= patience:
        return False
    recent = val_losses[-patience-1:]
    return all(x <= recent[0] for x in recent[1:])  # Check if the recent validation losses are stable

class NERDataset(Dataset):
    # Custom dataset class for NER tasks
    def __init__(self, encodings, labels):
        self.encodings = encodings  # Encoded tokens
        self.labels    = labels  # Corresponding labels (tags)
    def __len__(self):
        return len(self.labels)  # Return the number of samples
    def __getitem__(self, idx):
        # Return a sample (input ids, attention mask, and labels)
        item = {k: v[idx] for k, v in self.encodings.items()}
        item['labels'] = self.labels[idx]  # Add label to the sample
        return item

# --- Model definitions ---

class BertCRF(torch.nn.Module):
    # Custom model class integrating BERT and CRF (Conditional Random Field) for sequence tagging
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert       = BertModel.from_pretrained(model_name)  # Load pretrained BERT model
        self.dropout    = torch.nn.Dropout(0.1)  # Dropout layer to prevent overfitting
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)  # Classifier layer
        if not crf_available:
            raise ImportError("CRF requested but torchcrf not installed")
        self.crf = CRF(num_labels, batch_first=True)  # CRF layer for sequence labeling

    def forward(self, input_ids, attention_mask, labels=None):
        hidden = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state  # Get hidden states from BERT
        logits = self.classifier(self.dropout(hidden))  # Apply dropout and classify logits
        mask   = attention_mask.bool()  # Attention mask to focus on non-padding tokens
        if labels is not None:
            lbl = labels.clone()
            lbl[lbl < 0] = 0  # Ignore padding
            return -self.crf(logits, lbl, mask=mask, reduction='mean')  # Compute CRF loss
        return self.crf.decode(logits, mask=mask)  # Decode the predictions using CRF

# --- Training & evaluation loops ---

def train_epoch(model, loader, optimizer, scheduler, scaler, epoch):
    model.train()  # Set the model to training mode
    total_loss = 0  # Track total loss during the epoch
    optimizer.zero_grad()  # Zero the gradients
    for step, batch in enumerate(tqdm(loader, desc=f"Train {epoch}")):  # Loop through training batches
        batch = {k: v.to(device) for k, v in batch.items()}  # Move batch to device (GPU/CPU)
        with torch.cuda.amp.autocast():  # Automatic mixed precision for faster training on GPUs
            if crf_available and isinstance(model, BertCRF):
                loss = model(batch['input_ids'], batch['attention_mask'], labels=batch['labels'])
            else:
                loss = model(**batch).loss  # Compute loss
            loss = loss / GRAD_ACCUM  # Gradient accumulation
        scaler.scale(loss).backward()  # Backpropagate loss with mixed precision
        if (step + 1) % GRAD_ACCUM == 0:
            scaler.step(optimizer)  # Update model parameters
            scaler.update()  # Update the scaler for mixed precision
            scheduler.step()  # Update learning rate scheduler
            optimizer.zero_grad()  # Zero gradients for next step
        total_loss += loss.item() * GRAD_ACCUM  # Accumulate loss

    avg_loss = total_loss / len(loader)  # Compute average loss for the epoch
    print(f"[Epoch {epoch}] Train Loss: {avg_loss:.4f}")  # Print the loss
    return avg_loss

def eval_epoch(model, loader, idx2tag, epoch, split="Val"):
    model.eval()  # Set the model to evaluation mode
    all_preds, all_labels = [], []  # Store predictions and true labels
    with torch.no_grad():  # Disable gradients during evaluation
        for batch in loader:  # Loop through the validation/test batches
            batch = {k: v.to(device) for k, v in batch.items()}  # Move batch to device
            if crf_available and isinstance(model, BertCRF):
                _    = model(batch['input_ids'], batch['attention_mask'], labels=batch['labels'])
                preds = model(batch['input_ids'], batch['attention_mask'])
            else:
                output = model(**batch)  # Get the model output
                logits = output.logits  # Get the logits (predictions)
                preds  = torch.argmax(logits, dim=-1).cpu().tolist()  # Get predicted labels

            labels = batch['labels'].cpu().tolist()  # Get true labels
            for p_seq, l_seq in zip(preds, labels):
                valid = [i for i, t in enumerate(l_seq) if t != IGNORE_INDEX]  # Remove ignored indices (padding)
                all_labels.append([idx2tag[t] for i, t in enumerate(l_seq) if i in valid])
                all_preds.append([idx2tag[p] for i, p in enumerate(p_seq) if i in valid])

    report = classification_report(all_labels, all_preds)  # Generate classification report
    print(f"[Epoch {epoch}] {split} Classification Report:\n{report}")  # Print the report
    return report

# --- Main script ---

if __name__ == '__main__':
    # Load & tokenize
    sentences, tags       = load_data(DATA_PATH)  # Load data and split into sentences and tags
    tag2idx, idx2tag      = build_maps(tags)  # Build mapping from tags to indices
    tokenizer             = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)  # Load BERT tokenizer
    encodings, labels_all = prepare_dataset(sentences, tags, tokenizer, tag2idx)  # Prepare the tokenized dataset

    # 60/20/20 split (train/val/test)
    indices        = list(range(len(labels_all)))  # List of indices for the data
    train_val_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)  # Split into train and test
    train_idx, val_idx      = train_test_split(train_val_idx, test_size=0.25, random_state=42)  # Split train into train and validation
    # 0.25 * 0.8 = 0.2 → yields 60% train, 20% val, 20% test

    enc_train = {k: v[train_idx] for k, v in encodings.items()}  # Get training data
    enc_val   = {k: v[val_idx]   for k, v in encodings.items()}  # Get validation data
    enc_test  = {k: v[test_idx]  for k, v in encodings.items()}  # Get test data

    labels_tr = [labels_all[i] for i in train_idx]  # Get training labels
    labels_vl = [labels_all[i] for i in val_idx]  # Get validation labels
    labels_ts = [labels_all[i] for i in test_idx]  # Get test labels

    # Class weights & loss
    flat    = torch.cat([l.view(-1) for l in labels_tr]).numpy()  # Flatten labels for class weighting
    weights = compute_class_weights(flat, len(tag2idx)).to(device)  # Compute class weights
    loss_fn = CrossEntropyLoss(weight=weights, ignore_index=IGNORE_INDEX)  # Define loss function

    # Model
    if crf_available:
        model = BertCRF(PRETRAINED_MODEL, len(tag2idx)).to(device)  # If CRF available, use CRF-based model
    else:
        from transformers import BertForTokenClassification
        model = BertForTokenClassification.from_pretrained(
            PRETRAINED_MODEL, num_labels=len(tag2idx)
        ).to(device)  # Use standard BERT for token classification

    optimizer   = optim.AdamW(model.parameters(), lr=LEARNING_RATE)  # AdamW optimizer
    total_steps = (len(labels_tr) // BATCH_SIZE + 1) * EPOCHS  # Total number of training steps
    scheduler   = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(WARMUP_RATIO * total_steps),
        num_training_steps=total_steps
    )  # Learning rate scheduler
    scaler      = torch.cuda.amp.GradScaler()  # Mixed precision scaling

    # Dataloaders
    train_loader = DataLoader(NERDataset(enc_train, labels_tr),
                              batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=NUM_WORKERS, pin_memory=True)
    val_loader   = DataLoader(NERDataset(enc_val,   labels_vl),
                              batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS, pin_memory=True)
    test_loader  = DataLoader(NERDataset(enc_test,  labels_ts),
                              batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS, pin_memory=True)

    # Training + validation
    val_reports = []
    for epoch in range(1, EPOCHS + 1):
        train_epoch(model, train_loader, optimizer, scheduler, scaler, epoch)  # Train the model for one epoch
        report = eval_epoch(model, val_loader, idx2tag, epoch, split="Val")  # Evaluate on the validation set
        val_reports.append(report)

    # Final test evaluation
    print("=== Final Test Set Evaluation ===")
    _ = eval_epoch(model, test_loader, idx2tag, epoch="final", split="Test")  # Final test evaluation

    # Save the trained model
    torch.save(model.state_dict(), SAVE_PATH)
    print(f"Model saved to {SAVE_PATH}")  # Model saved to disk


  from .autonotebook import tqdm as notebook_tqdm
2025-04-25 15:46:13,328 INFO: Device set to: cuda
  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')
  scaler      = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Train 1: 100%|██████████| 900/900 [14:40<00:00,  1.02it/s]

[Epoch 1] Train Loss: 14.1303



  _warn_prf(average, modifier, msg_start, len(result))


[Epoch 1] Val Classification Report:
              precision    recall  f1-score   support

         art       0.00      0.00      0.00       167
         eve       0.00      0.00      0.00        65
         geo       0.81      0.89      0.85     11859
         gpe       0.91      0.91      0.91      3444
         nat       0.00      0.00      0.00        51
         org       0.71      0.63      0.67      6879
         per       0.76      0.75      0.76      5525
         tim       0.77      0.81      0.79      4403

   micro avg       0.79      0.79      0.79     32393
   macro avg       0.50      0.50      0.50     32393
weighted avg       0.78      0.79      0.79     32393



Train 2: 100%|██████████| 900/900 [07:06<00:00,  2.11it/s]

[Epoch 2] Train Loss: 3.4261





[Epoch 2] Val Classification Report:
              precision    recall  f1-score   support

         art       0.62      0.03      0.06       167
         eve       0.46      0.20      0.28        65
         geo       0.83      0.90      0.86     11859
         gpe       0.92      0.93      0.92      3444
         nat       0.00      0.00      0.00        51
         org       0.73      0.68      0.71      6879
         per       0.79      0.78      0.78      5525
         tim       0.84      0.84      0.84      4403

   micro avg       0.81      0.82      0.82     32393
   macro avg       0.65      0.54      0.56     32393
weighted avg       0.81      0.82      0.81     32393



Train 3: 100%|██████████| 900/900 [07:12<00:00,  2.08it/s]

[Epoch 3] Train Loss: 2.6800





[Epoch 3] Val Classification Report:
              precision    recall  f1-score   support

         art       0.47      0.19      0.27       167
         eve       0.40      0.22      0.28        65
         geo       0.84      0.90      0.87     11859
         gpe       0.94      0.94      0.94      3444
         nat       0.32      0.24      0.27        51
         org       0.74      0.70      0.72      6879
         per       0.79      0.79      0.79      5525
         tim       0.82      0.85      0.83      4403

   micro avg       0.82      0.83      0.82     32393
   macro avg       0.66      0.60      0.62     32393
weighted avg       0.81      0.83      0.82     32393

=== Final Test Set Evaluation ===
[Epoch final] Test Classification Report:
              precision    recall  f1-score   support

         art       0.30      0.09      0.14       166
         eve       0.39      0.20      0.26        80
         geo       0.84      0.91      0.87     12109
         gpe       

In [3]:
# ner_pipeline.py (Updated: train/val/test split, no TensorBoard)

import os
# Force synchronous CUDA calls for precise error reporting
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Hugging Face tokenizers parallelism
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import logging
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report
from transformers import BertTokenizerFast, BertModel, get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# Optional CRF
try:
    from torchcrf import CRF
    crf_available = True
except ImportError:
    crf_available = False
    logging.warning("CRF disabled: install torchcrf for CRF integration")

# Logging
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Hyperparameters
DATA_PATH        = 'ner_dataset.csv'
PRETRAINED_MODEL = 'bert-base-uncased'
MAX_LEN          = 128
BATCH_SIZE       = 32
NUM_WORKERS      = 4
EPOCHS           = 3
LEARNING_RATE    = 3e-5
PATIENCE         = 2
WARMUP_RATIO     = 0.1
GRAD_ACCUM       = 2
SAVE_PATH        = 'ner_model3_new2.pth'
IGNORE_INDEX     = -100

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Device set to: {device}")

# --- Data utilities ---

def load_data(path):
    df = pd.read_csv(path, encoding='latin-1', engine='python')
        # Fill NaN values in the 'Sentence #' column by forward filling the sentence number
    df['Sentence #'] = df['Sentence #'].fillna(method='ffill')

    # Fill NaN values in the 'Word' column with a placeholder (e.g., 'UNKNOWN')
    df['Word'] = df['Word'].fillna('UNKNOWN')

    # Ensure all values in 'Word' column are strings
    df['Word'] = df['Word'].astype(str)
    df['Word'] = df['Word'].str.lower()
    
    df = df[['Sentence #', 'Word', 'Tag']].dropna()
    sentences, tags = [], []
    for _, grp in df.groupby('Sentence #'):
        sentences.append(grp['Word'].astype(str).tolist())
        tags.append(grp['Tag'].tolist())
    return sentences, tags

def build_maps(tags):
    unique = sorted({t for doc in tags for t in doc})
    t2i = {t: i for i, t in enumerate(unique)}
    i2t = {i: t for t, i in t2i.items()}
    return t2i, i2t

def prepare_dataset(sentences, tags, tokenizer, tag2idx):
    enc = tokenizer(
        sentences,
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='pt'
    )
    all_labels = []
    for i, seq in enumerate(tags):
        word_ids = enc.word_ids(batch_index=i)
        label_ids = np.full(len(word_ids), IGNORE_INDEX, dtype=int)
        for j, wid in enumerate(word_ids):
            if wid is not None:
                label_ids[j] = tag2idx[seq[wid]]
        all_labels.append(torch.tensor(label_ids, dtype=torch.long))
    return enc, all_labels

def compute_class_weights(flat_labels, num_labels):
    counts = np.bincount(flat_labels[flat_labels >= 0], minlength=num_labels)
    weights = 1.0 / (counts + 1e-6)
    weights = weights / weights.sum() * num_labels
    return torch.tensor(weights, dtype=torch.float)

def early_stopping(val_losses, patience):
    if len(val_losses) <= patience:
        return False
    recent = val_losses[-patience-1:]
    return all(x <= recent[0] for x in recent[1:])

class NERDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels    = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# --- Model definitions ---

class BertCRF(torch.nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert       = BertModel.from_pretrained(model_name)
        self.dropout    = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)
        if not crf_available:
            raise ImportError("CRF requested but torchcrf not installed")
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        hidden = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
        logits = self.classifier(self.dropout(hidden))
        mask   = attention_mask.bool()
        if labels is not None:
            lbl = labels.clone()
            lbl[lbl < 0] = 0
            return -self.crf(logits, lbl, mask=mask, reduction='mean')
        return self.crf.decode(logits, mask=mask)

# --- Training & evaluation loops ---

def train_epoch(model, loader, optimizer, scheduler, scaler, epoch):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm(loader, desc=f"Train {epoch}")):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.cuda.amp.autocast():
            if crf_available and isinstance(model, BertCRF):
                loss = model(batch['input_ids'], batch['attention_mask'], labels=batch['labels'])
            else:
                loss = model(**batch).loss
            loss = loss / GRAD_ACCUM
        scaler.scale(loss).backward()
        if (step + 1) % GRAD_ACCUM == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
        total_loss += loss.item() * GRAD_ACCUM

    avg_loss = total_loss / len(loader)
    print(f"[Epoch {epoch}] Train Loss: {avg_loss:.4f}")
    return avg_loss

def eval_epoch(model, loader, idx2tag, epoch, split="Val"):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            if crf_available and isinstance(model, BertCRF):
                _    = model(batch['input_ids'], batch['attention_mask'], labels=batch['labels'])
                preds = model(batch['input_ids'], batch['attention_mask'])
            else:
                output = model(**batch)
                logits = output.logits
                preds  = torch.argmax(logits, dim=-1).cpu().tolist()

            labels = batch['labels'].cpu().tolist()
            for p_seq, l_seq in zip(preds, labels):
                valid = [i for i, t in enumerate(l_seq) if t != IGNORE_INDEX]
                all_labels.append([idx2tag[t] for i, t in enumerate(l_seq) if i in valid])
                all_preds.append([idx2tag[p] for i, p in enumerate(p_seq) if i in valid])

    report = classification_report(all_labels, all_preds)
    print(f"[Epoch {epoch}] {split} Classification Report:\n{report}")
    return report

# --- Main script ---

if __name__ == '__main__':
    # Load & tokenize
    sentences, tags       = load_data(DATA_PATH)
    tag2idx, idx2tag      = build_maps(tags)
    tokenizer             = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)
    encodings, labels_all = prepare_dataset(sentences, tags, tokenizer, tag2idx)

    # 60/20/20 split
    indices        = list(range(len(labels_all)))
    train_val_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
    train_idx, val_idx      = train_test_split(train_val_idx, test_size=0.25, random_state=42)
    # 0.25 * 0.8 = 0.2 → yields 60% train, 20% val, 20% test

    enc_train = {k: v[train_idx] for k, v in encodings.items()}
    enc_val   = {k: v[val_idx]   for k, v in encodings.items()}
    enc_test  = {k: v[test_idx]  for k, v in encodings.items()}

    labels_tr = [labels_all[i] for i in train_idx]
    labels_vl = [labels_all[i] for i in val_idx]
    labels_ts = [labels_all[i] for i in test_idx]

    # Class weights & loss
    flat    = torch.cat([l.view(-1) for l in labels_tr]).numpy()
    weights = compute_class_weights(flat, len(tag2idx)).to(device)
    loss_fn = CrossEntropyLoss(weight=weights, ignore_index=IGNORE_INDEX)

    # Model
    if crf_available:
        model = BertCRF(PRETRAINED_MODEL, len(tag2idx)).to(device)
    else:
        from transformers import BertForTokenClassification
        model = BertForTokenClassification.from_pretrained(
            PRETRAINED_MODEL, num_labels=len(tag2idx)
        ).to(device)

    optimizer   = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = (len(labels_tr) // BATCH_SIZE + 1) * EPOCHS
    scheduler   = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(WARMUP_RATIO * total_steps),
        num_training_steps=total_steps
    )
    scaler      = torch.cuda.amp.GradScaler()

    # Dataloaders
    train_loader = DataLoader(NERDataset(enc_train, labels_tr),
                              batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=NUM_WORKERS, pin_memory=True)
    val_loader   = DataLoader(NERDataset(enc_val,   labels_vl),
                              batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS, pin_memory=True)
    test_loader  = DataLoader(NERDataset(enc_test,  labels_ts),
                              batch_size=BATCH_SIZE,
                              num_workers=NUM_WORKERS, pin_memory=True)

    # Training + validation
    val_reports = []
    for epoch in range(1, EPOCHS + 1):
        train_epoch(model, train_loader, optimizer, scheduler, scaler, epoch)
        report = eval_epoch(model, val_loader, idx2tag, epoch, split="Val")
        val_reports.append(report)
        # you could hook up a real val_loss list here; for now we skip early stopping logic

    # Final test evaluation
    print("=== Final Test Set Evaluation ===")
    _ = eval_epoch(model, test_loader, idx2tag, epoch="final", split="Test")

    # Save
    torch.save(model.state_dict(), SAVE_PATH)
    print(f"Model saved to {SAVE_PATH}")


2025-04-25 16:30:56,823 INFO: Device set to: cuda
  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
  scaler      = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Train 1: 100%|██████████| 900/900 [12:48<00:00,  1.17it/s]

[Epoch 1] Train Loss: 14.4532



  _warn_prf(average, modifier, msg_start, len(result))


[Epoch 1] Val Classification Report:
              precision    recall  f1-score   support

         art       0.00      0.00      0.00       147
         eve       0.00      0.00      0.00        62
         geo       0.83      0.86      0.84     11376
         gpe       0.91      0.91      0.91      3346
         nat       0.00      0.00      0.00        50
         org       0.62      0.65      0.63      6486
         per       0.77      0.74      0.75      5226
         tim       0.83      0.78      0.81      4368

   micro avg       0.78      0.78      0.78     31061
   macro avg       0.50      0.49      0.49     31061
weighted avg       0.78      0.78      0.78     31061



Train 2: 100%|██████████| 900/900 [07:09<00:00,  2.09it/s]

[Epoch 2] Train Loss: 3.7218





[Epoch 2] Val Classification Report:
              precision    recall  f1-score   support

         art       0.36      0.05      0.09       147
         eve       0.60      0.19      0.29        62
         geo       0.83      0.89      0.86     11376
         gpe       0.92      0.93      0.92      3346
         nat       0.23      0.14      0.17        50
         org       0.69      0.65      0.67      6486
         per       0.78      0.78      0.78      5226
         tim       0.83      0.83      0.83      4368

   micro avg       0.80      0.81      0.80     31061
   macro avg       0.65      0.56      0.58     31061
weighted avg       0.80      0.81      0.80     31061



Train 3: 100%|██████████| 900/900 [07:13<00:00,  2.07it/s]

[Epoch 3] Train Loss: 2.9698





[Epoch 3] Val Classification Report:
              precision    recall  f1-score   support

         art       0.31      0.06      0.10       147
         eve       0.52      0.23      0.31        62
         geo       0.81      0.90      0.85     11376
         gpe       0.94      0.93      0.94      3346
         nat       0.50      0.26      0.34        50
         org       0.68      0.66      0.67      6486
         per       0.78      0.77      0.78      5226
         tim       0.85      0.84      0.85      4368

   micro avg       0.80      0.82      0.81     31061
   macro avg       0.67      0.58      0.61     31061
weighted avg       0.80      0.82      0.80     31061

=== Final Test Set Evaluation ===
[Epoch final] Test Classification Report:
              precision    recall  f1-score   support

         art       0.12      0.02      0.03       149
         eve       0.44      0.21      0.28        78
         geo       0.81      0.91      0.86     11596
         gpe       

In [4]:
# Load & tokenize
sentences, tags       = load_data(DATA_PATH)
tag2idx, idx2tag      = build_maps(tags)
tokenizer             = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)
encodings, labels_all = prepare_dataset(sentences, tags, tokenizer, tag2idx)

# 60/20/20 split
indices        = list(range(len(labels_all)))
train_val_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
train_idx, val_idx      = train_test_split(train_val_idx, test_size=0.25, random_state=42)

  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


In [2]:
# Loads the dataset and processes it by filling missing values, 
# ensuring that all words are strings, and grouping by sentence.
import pandas as pd
df = pd.read_csv('ner_dataset.csv', encoding='latin-1', engine='python')

# Fill NaN values in the 'Sentence #' column by forward filling the sentence number
df['Sentence #'] = df['Sentence #'].fillna(method='ffill')

# Fill NaN values in the 'Word' column with a placeholder (e.g., 'UNKNOWN')
df['Word'] = df['Word'].fillna('UNKNOWN')

# Ensure all values in 'Word' column are strings
df['Word'] = df['Word'].astype(str)
df['Word'] = df['Word'].str.lower()  # Lowercasing words

df = df[['Sentence #', 'Word', 'Tag']].dropna()  # Remove rows with missing values
sentences, tags = [], []

  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


In [4]:
for _, grp in df.groupby('Sentence #'):
        
        sentences.append(grp['Word'].astype(str).tolist())
        tags.append(grp['Tag'].tolist())
    

In [5]:
tags

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'B-tim',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'I-geo',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-geo',
  'O',
  'O',
  'B-per',
  'I-per',
  'O',
  'B-tim',
  'O',
  'B-geo',
  'O',
  'B-gpe',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-per',
  'I-per',
  'O',
  'O',
  'O',
  'O',


In [6]:
sentences, tags       = load_data(DATA_PATH)  # Load data and split into sentences and tags
tag2idx, idx2tag      = build_maps(tags)  # Build mapping from tags to indices
tokenizer             = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)  # Load BERT tokenizer
encodings, labels_all = prepare_dataset(sentences, tags, tokenizer, tag2idx)  # Prepare the tokenized dataset

# 60/20/20 split (train/val/test)
indices        = list(range(len(labels_all)))  # List of indices for the data
train_val_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)  # Split into train and test
train_idx, val_idx      = train_test_split(train_val_idx, test_size=0.25, random_state=42)  # Split train into train and validation

NameError: name 'load_data' is not defined