In [1]:
#!pip install transformers torchcrf seqeval -q
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
from torchcrf import CRF
import time
from tqdm import tqdm
import numpy as np
from transformers import DistilBertTokenizerFast, DistilBertModel, get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [3]:
# -----------------------
# 1️⃣ Dataset & Tokenization
# -----------------------
def parse_conll(path):
    tokens, labels = [], []
    with open(path, encoding="utf-8") as f:
        s_tok, s_lab = [], []
        for line in f:
            line = line.strip()
            if not line:
                if s_tok:
                    tokens.append(s_tok)
                    labels.append(s_lab)
                    s_tok, s_lab = [], []
                continue
            parts = line.split("\t")
            if len(parts) >= 3:
                tok, _, ner = parts[0], parts[1], parts[2]
                s_tok.append(tok)
                s_lab.append(ner)
        if s_tok:
            tokens.append(s_tok)
            labels.append(s_lab)
    return tokens, labels

dataset_path = "/kaggle/input/myner-mmdt/"
train_tokens, train_labels = parse_conll(dataset_path + "ner_train.conll")
val_tokens, val_labels = parse_conll(dataset_path + "ner_val.conll")
test_tokens, test_labels = parse_conll(dataset_path + "ner_test.conll")

# Combine train + val for full training
train_tokens += val_tokens
train_labels += val_labels

# Build label mapping
uniq_labels = sorted({t for seq in train_labels for t in seq})
if "O" not in uniq_labels: uniq_labels = ["O"] + [l for l in uniq_labels if l != "O"]
label2id = {l: i for i, l in enumerate(uniq_labels)}
id2label = {i: l for l, i in label2id.items()}

num_labels = len(label2id)
print("Labels:", label2id)


Labels: {'B-DATE': 0, 'B-LOC': 1, 'B-TIME': 2, 'I-DATE': 3, 'I-LOC': 4, 'I-TIME': 5, 'O': 6}


In [5]:
# Tokenizer
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
MAX_LEN = 128

def tokenize_and_align(tokens_list, labels_list):
    enc = tokenizer(
        tokens_list,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_attention_mask=True
    )
    all_label_ids = []
    for i, labels in enumerate(labels_list):
        word_ids = enc.word_ids(batch_index=i)
        prev = None
        label_ids = []
        for w in word_ids:
            if w is None:
                label_ids.append(-100)
            elif w != prev:
                label_ids.append(label2id[labels[w]])
            else:
                curr = labels[w]
                if curr.startswith("B-"):
                    curr = "I-" + curr[2:]
                label_ids.append(label2id.get(curr, label2id["O"]))
            prev = w
        all_label_ids.append(label_ids)
    enc["labels"] = all_label_ids
    return {k: torch.tensor(v) for k, v in enc.items()}

train_enc = tokenize_and_align(train_tokens, train_labels)
test_enc  = tokenize_and_align(test_tokens, test_labels)


In [25]:
# -----------------------
# 2️⃣ PyTorch Dataset
# -----------------------
# Suppose your unique labels are
#unique_labels = sorted(set(l for seq in all_labels for l in seq))
# label_ids = [label2id[label] for label in labels]
# label2id = {label: idx for idx, label in enumerate(unique_labels)}
# id2label = {idx: label for label, idx in label2id.items()}

class NERDataset(Dataset):
    def __init__(self, tokens, labels, tokenizer, label2id, max_len=128):
        self.tokens = tokens
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        words = self.tokens[idx]
        labels = self.labels[idx]

        # Convert labels to IDs
        label_ids = [self.label2id[label] for label in labels]

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        word_ids = encoding.word_ids(batch_index=0)
        aligned_labels = []

        for i, word_idx in enumerate(word_ids):
            if word_idx is None or word_idx >= len(label_ids):
                aligned_labels.append(-100)  # ignore
            else:
                aligned_labels.append(label_ids[word_idx])

        encoding['labels'] = torch.tensor(aligned_labels, dtype=torch.long)
        return {k: v.squeeze(0) for k, v in encoding.items()}


# train_ds = NERDataset(train_enc)
# test_ds  = NERDataset(test_enc)

# train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
# test_loader  = DataLoader(test_ds, batch_size=16, shuffle=False)


In [19]:
class DistilBertBiLSTMCRF(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=128):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        self.lstm = nn.LSTM(
            input_size=self.distilbert.config.hidden_size,
            hidden_size=hidden_dim // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )
        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # (batch, seq_len, hidden)

        lstm_out, _ = self.lstm(sequence_output)
        emissions = self.classifier(lstm_out)

        # Ensure labels are >=0 for CRF
        if labels is not None:
            labels = torch.where(labels < 0, torch.tensor(0, device=labels.device), labels)
            mask = attention_mask.bool() if attention_mask is not None else torch.ones_like(labels).bool()
            loss = -self.crf(emissions, labels, mask=mask, reduction='mean')
            return loss
        else:
            mask = attention_mask.bool() if attention_mask is not None else torch.ones(emissions.size()[:2], dtype=torch.bool, device=emissions.device)
            return self.crf.decode(emissions, mask=mask)


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")

# tokens and labels are lists of lists from your .conll files
train_dataset = NERDataset(train_tokens, train_labels, tokenizer, label2id, max_len=128)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)

num_labels = len(label2id)
model = DistilBertBiLSTMCRF("distilbert-base-multilingual-cased", num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 5
grad_accum_steps = 2
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

num_training_steps = epochs * len(train_loader) // grad_accum_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1*num_training_steps),
    num_training_steps=num_training_steps
)

# -----------------------
# 4️⃣ Training Loop
# -----------------------
best_train_loss = float("inf")
start_training = time.time()

for epoch in range(epochs):
    model.train()
    epoch_start = time.time()
    total_loss = 0

    for step, batch in enumerate(tqdm(train_loader)):
        batch = {k:v.to(device) for k,v in batch.items()}
        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            loss = model(**batch) / grad_accum_steps

        scaler.scale(loss).backward()

        if (step+1) % grad_accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() * grad_accum_steps

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Time: {time.time()-epoch_start:.2f} sec")

    if avg_loss < best_train_loss:
        best_train_loss = avg_loss
        torch.save(model.state_dict(), "best_model.pt")

print(f"Total training time: {(time.time()-start_training)/60:.2f} min")

  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  0%|          | 0/3586 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
100%|██████████| 3586/3586 [11:19<00:00,  5.28it/s]


Epoch 1 | Loss: 19.5153 | Time: 679.09 sec


  0%|          | 0/3586 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 3586/3586 [11:20<00:00,  5.27it/s]


Epoch 2 | Loss: 5.4560 | Time: 680.66 sec


  0%|          | 0/3586 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 3586/3586 [11:20<00:00,  5.27it/s]


Epoch 3 | Loss: 3.9854 | Time: 680.17 sec


  0%|          | 0/3586 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 3586/3586 [11:16<00:00,  5.30it/s]


Epoch 4 | Loss: 3.1557 | Time: 676.35 sec


  0%|          | 0/3586 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 3586/3586 [11:16<00:00,  5.30it/s]


Epoch 5 | Loss: 2.5993 | Time: 676.93 sec
Total training time: 56.68 min


In [35]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# print("Tokenizer vocab size:", tokenizer.vocab_size)

# print("Model embedding vocab size:", model.bert.embeddings.word_embeddings.num_embeddings)
torch.save({
    "epoch": epoch,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
    "best_train_loss": best_train_loss,
    "label2id": label2id
}, "best_model_full.pt")


In [37]:
from IPython.display import FileLink

# Make sure current directory is /kaggle/working
%cd /kaggle/working

# Provide the relative path
FileLink('best_model_full.pt')

/kaggle/working


In [33]:
# -----------------------
# 6️⃣ Evaluation
# -----------------------
test_dataset = NERDataset(test_tokens, test_labels, tokenizer, label2id, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

from seqeval.metrics import classification_report

model.eval()
all_true, all_pred = [], []

with torch.no_grad():
    for batch in test_loader:
        # Move tensors to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # REMOVE labels so forward() returns decoded sequences
        batch_eval = {k: v for k, v in batch.items() if k != "labels"}
        preds = model(**batch_eval)  # now returns list of sequences

        labels = batch["labels"]
        mask = labels != -100  # ignore padding tokens

        # Ensure preds is a list of sequences
        if isinstance(preds, torch.Tensor):
            preds = preds.tolist()
        elif isinstance(preds, list) and all(isinstance(p, int) for p in preds):
            preds = [preds]  # single sequence edge case

        batch_size = labels.size(0)
        for i in range(batch_size):
            pred_seq_ids = preds[i] if i < len(preds) else []

            true_seq = [id2label[l.item()] for l, m in zip(labels[i], mask[i]) if m]
            pred_seq = [id2label[p] for p, m in zip(pred_seq_ids, mask[i]) if m]

            all_true.append(true_seq)
            all_pred.append(pred_seq)

print("NER Classification Report:")
print(classification_report(all_true, all_pred, digits=4, zero_division=0))


NER Classification Report:
              precision    recall  f1-score   support

        DATE     0.9402    0.9390    0.9396     10473
         LOC     0.8874    0.9030    0.8951     46219
        TIME     0.8658    0.8976    0.8814      1445

   micro avg     0.8962    0.9094    0.9027     58137
   macro avg     0.8978    0.9132    0.9054     58137
weighted avg     0.8963    0.9094    0.9028     58137



In [34]:
from sklearn.metrics import classification_report

# Flattened token-level evaluation
true_labels_flat, pred_labels_flat = [], []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch_eval = {k: v for k, v in batch.items() if k != "labels"}
        preds = model(**batch_eval)  # list of sequences

        labels = batch["labels"]
        mask = labels != -100  # ignore padding tokens

        # Ensure preds is list of sequences
        if isinstance(preds, torch.Tensor):
            preds = preds.tolist()
        elif isinstance(preds, list) and all(isinstance(p, int) for p in preds):
            preds = [preds]

        batch_size = labels.size(0)
        for i in range(batch_size):
            pred_seq_ids = preds[i] if i < len(preds) else []

            for true_id, pred_id, m in zip(labels[i], pred_seq_ids, mask[i]):
                if m:  # only consider non-padding
                    true_labels_flat.append(id2label[true_id.item()])
                    pred_labels_flat.append(id2label[pred_id])

# Token-level classification report
print("Token-level classification report:")
print(classification_report(true_labels_flat, pred_labels_flat, digits=4, zero_division=0))


Token-level classification report:
              precision    recall  f1-score   support

      B-DATE     0.9506    0.9484    0.9495     10397
       B-LOC     0.9026    0.9159    0.9092     45861
      B-TIME     0.9060    0.9312    0.9184      1439
      I-DATE     0.9468    0.9570    0.9518      8437
       I-LOC     0.8330    0.8381    0.8356     23790
      I-TIME     0.9340    0.9615    0.9476      1870
           O     0.9933    0.9925    0.9929   1162839

    accuracy                         0.9860   1254633
   macro avg     0.9238    0.9349    0.9293   1254633
weighted avg     0.9861    0.9860    0.9860   1254633

