In [4]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [1]:
import os, json, numpy as np, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoConfig, AutoModel, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
from seqeval.metrics import classification_report
#from torchcrf import CRF 
from TorchCRF import CRF

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


  from .autonotebook import tqdm as notebook_tqdm


Device: cpu


In [8]:
def parse_conll(path):
    tokens, labels = [], []
    with open(path, encoding="utf-8") as f:
        s_tok, s_lab = [], []
        for line in f:
            line = line.strip()
            if not line:
                if s_tok:
                    tokens.append(s_tok); labels.append(s_lab)
                    s_tok, s_lab = [], []
                continue
            parts = line.split("\t")
            if len(parts) >= 3:
                tok, _, ner = parts[0], parts[1], parts[2]
                s_tok.append(tok); s_lab.append(ner)
        if s_tok:
            tokens.append(s_tok); labels.append(s_lab)
    return tokens, labels

dataset_path = "/kaggle/input/myner-mmdt/"
train_tokens, train_labels = parse_conll(dataset_path + "ner_train.conll")
val_tokens,   val_labels = parse_conll(dataset_path + "ner_val.conll")
test_tokens,  test_labels  = parse_conll(dataset_path + "ner_test.conll")


print(len(train_tokens), len(val_tokens), len(test_tokens))

45894 11474 14343


In [9]:
# === 2) Build label vocab ===
all_labels = train_labels + val_labels  # build from train+val (test unseen)
uniq = sorted({t for seq in all_labels for t in seq})
if "O" not in uniq: uniq = ["O"] + [l for l in uniq if l != "O"]

label2id = {l:i for i,l in enumerate(uniq)}
id2label = {i:l for l,i in label2id.items()}
num_labels = len(label2id)
print("Labels:", label2id)


Labels: {'B-DATE': 0, 'B-LOC': 1, 'B-TIME': 2, 'I-DATE': 3, 'I-LOC': 4, 'I-TIME': 5, 'O': 6}


In [2]:
label2id = {'B-DATE': 0, 'B-LOC': 1, 'B-TIME': 2, 'I-DATE': 3, 'I-LOC': 4, 'I-TIME': 5, 'O': 6}
id2label = {i:l for l,i in label2id.items()}
print(id2label)

{0: 'B-DATE', 1: 'B-LOC', 2: 'B-TIME', 3: 'I-DATE', 4: 'I-LOC', 5: 'I-TIME', 6: 'O'}


Tokenize pre-segmented text (already split into word-level tokens) and align the corresponding NER labels to the resulting subword tokens.
        
Input:

        tokens_list: List of sentences (each sentence = list of word-level tokens, e.g., from Burmese word segmenter)

        labels_list: List of sentences (each sentence = list of BIO/NER labels aligned to tokens)

Output:

        Dictionary containing tensors, each of shape [num_sentences, MAX_LEN]:

        "input_ids" → token IDs (subword tokenized, padded/truncated)

        "attention_mask" → 1 for real tokens, 0 for padding

        "labels" → aligned label IDs, with -100 for ignored tokens (CLS, SEP, padding, etc.)
    This function is used in sequence labeling tasks such as Named Entity Recognition (NER),
    where each word has a label (BIO/BIOES). Since BERT-based tokenizers (like DistilBERT)
    may split a word into multiple subword tokens, we must ensure that labels are properly
    aligned with the subword sequence.

    Steps performed:
    ----------------
    1. Use a Hugging Face tokenizer to encode each list of word tokens into
       input_ids, attention_mask, etc. (`is_split_into_words=True` ensures
       the tokenizer treats each element of tokens_list as a pre-tokenized word).
    
    2. For each word:
        - The first subword inherits the word’s original label.
        - Any additional subwords are assigned the corresponding I- label
          (if the word’s label was B-XXX) or the same label if it was already I-/O.
    
    3. Special tokens ([CLS], [SEP], padding) are assigned a label of -100,
       which tells the model to ignore them during loss calculation.
    
    4. Return a dictionary with:
        - "input_ids": tensor of token IDs padded/truncated to `MAX_LEN`
        - "attention_mask": tensor indicating valid tokens vs. padding
        - "labels": tensor of aligned label IDs with -100 for ignored tokens

    Parameters:
    -----------
    tokens_list : List[List[str]]
        List of sentences, where each sentence is a list of word-level tokens
        (already segmented, e.g., with a Myanmar word segmenter).
    
    labels_list : List[List[str]]
        List of sentences, where each sentence is a list of labels (BIO tags)
        aligned to the `tokens_list`.

    Returns:
    --------
    encodings : Dict[str, torch.Tensor]
        Dictionary containing batched tensors for:
        - "input_ids"      : shape [num_sentences, max_len]
        - "attention_mask" : shape [num_sentences, max_len]
        - "labels"         : shape [num_sentences, max_len]
        Ready to be wrapped into a PyTorch Dataset for training.
    
    Example:
    --------
    tokens_list = [["ရန်ကုန်", "မြို့"], ["မြန်မာနိုင်ငံ", "စာ"]]
    labels_list = [["B-LOC", "I-LOC"], ["B-LOC", "O"]]

    encoded = tokenize_and_align(tokens_list, labels_list)

    encoded["input_ids"].shape      # torch.Size([2, MAX_LEN])
    encoded["attention_mask"].shape # torch.Size([2, MAX_LEN])
    encoded["labels"].shape         # torch.Size([2, MAX_LEN])
    
        {
        "input_ids": tensor([[   101, 34567, 45678, 78910, 56789,   102,     0,     0, ...]]),
        "attention_mask": tensor([[1, 1, 1, 1, 1, 1, 0, 0, ...]]),
        "labels": tensor([[-100,  1,  2,  2,  0, -100, -100, -100, ...]])
        }
tokens_list + labels_list
        │
        ▼
tokenize_and_align()  → dict of tensors
        │
        ▼
NERDataset(enc)       → single-sentence dicts
        │
        ▼
DataLoader(dataset)   → batched dicts → feed into model

┌─────────────────────────────┐
│ Raw Sentences (word-segmented)
│ ["ရန်ကုန်", "မြို့"]             │
│ ["မြန်မာနိုင်ငံ", "စာ"]           │
└───────────────┬─────────────┘
                │
                ▼
┌─────────────────────────────┐
│ tokenize_and_align()        │
│ - Tokenizer converts words to subwords │
│ - Aligns NER labels to subwords      │
│ - Pads/truncates to MAX_LEN          │
│ - Special tokens ([CLS],[SEP],PAD) → -100 │
└───────────────┬─────────────┘
                │
                ▼
┌─────────────────────────────┐
│ NERDataset(enc)             │
│ - Wraps each sentence into  │
│   {"input_ids", "attention_mask", "labels"} │
│ - __getitem__ returns 1 sentence │
└───────────────┬─────────────┘
                │
                ▼
┌─────────────────────────────┐
│ DataLoader(dataset, batch_size, shuffle) │
│ - Stacks sentences into batch_size       │
│ - Output shape: [batch_size, MAX_LEN]    │
│ Example batch:                           │
│ input_ids:      [[101,9011,8402,102,0...]] │
│ attention_mask: [[1,1,1,1,0,...]]         │
│ labels:         [[-100,3,4,-100,...]]     │
└───────────────┬─────────────┘
                │
                ▼
┌─────────────────────────────┐
│ Feed batch into model       │
│ - input_ids → embeddings    │
│ - attention_mask → mask padding │
│ - labels → compute loss (-100 ignored) │
└─────────────────────────────┘


In [None]:
# === 3) Tokenizer (FAST) ===
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

MAX_LEN = 128

def tokenize_and_align(tokens_list, labels_list):
    enc = tokenizer(
        tokens_list,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_attention_mask=True
    )
    all_label_ids = []
    for i, labels in enumerate(labels_list):
        word_ids = enc.word_ids(batch_index=i) # map subword → word index
        prev = None
        label_ids = []
        for w in word_ids:
            if w is None:
                label_ids.append(-100)  # # ignore [CLS], [SEP], [PAD] during loss
            elif w != prev:
                # first subword of the word → keep original label
                label_ids.append(label2id[labels[w]])      # first piece -> true label
            else:
                # subword -> give I- tag if B-*, # continuation subwords → switch B- to I-
                curr = labels[w]
                if curr.startswith("B-"):
                    curr = "I-" + curr[2:]
                label_ids.append(label2id.get(curr, label2id["O"]))
            prev = w
        all_label_ids.append(label_ids)
    enc["labels"] = all_label_ids
    return {k: torch.tensor(v) for k, v in enc.items()}

train_enc = tokenize_and_align(train_tokens, train_labels)
val_enc   = tokenize_and_align(val_tokens,   val_labels)
test_enc  = tokenize_and_align(test_tokens,  test_labels)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

NERDataset = wrapper around tokenized encodings → gives one sentence at a time.
DataLoader = stacks sentences into batches automatically.

DataLoader
batch = {
  "input_ids": tensor([
    [ 101, 5001, 6205, 7123, 102,    0,    0,    0],
    [ 101, 9011, 8402, 102,   0,    0,    0,    0]
  ]),
  "attention_mask": tensor([
    [1, 1, 1, 1, 1, 0, 0, 0],
    [1, 1, 1, 1, 0, 0, 0, 0]
  ]),
  "labels": tensor([
    [-100, 1, 2, 2, -100, -100, -100, -100],
    [-100, 3, 4, -100, -100, -100, -100, -100]
  ])
  ....
}


Raw Sentences (word-segmented)
───────────────────────────────
["ရန်ကုန်", "မြို့"]
["မြန်မာနိုင်ငံ", "စာ"]
        │
        ▼
tokenize_and_align()
───────────────────────────────
- Tokenizer converts each word into subwords if needed
- Aligns NER labels to subwords
- Special tokens ([CLS], [SEP], PAD) get label = -100
- Pads/truncates sentences to MAX_LEN
Output:
{
  "input_ids":      tensor([num_sentences, MAX_LEN]),
  "attention_mask": tensor([num_sentences, MAX_LEN]),
  "labels":         tensor([num_sentences, MAX_LEN])
}
        │
        ▼
NERDataset(enc)
───────────────────────────────
- Wraps each sentence into a dictionary:
{
  "input_ids": tensor([MAX_LEN]),
  "attention_mask": tensor([MAX_LEN]),
  "labels": tensor([MAX_LEN])
}
- Allows indexing via __getitem__ (one sentence at a time)
        │
        ▼
DataLoader(dataset, batch_size, shuffle)
───────────────────────────────
- Automatically stacks multiple sentences into a batch
- Each batch has shape: [batch_size, MAX_LEN]
Batch example (batch_size=2, MAX_LEN=8):
{
  "input_ids": tensor([
      [101, 9011, 8402, 102, 0, 0, 0, 0],
      [101, 5001, 6205, 7123, 102, 0, 0, 0]
  ]),
  "attention_mask": tensor([
      [1, 1, 1, 1, 0, 0, 0, 0],
      [1, 1, 1, 1, 1, 0, 0, 0]
  ]),
  "labels": tensor([
      [-100, 3, 4, -100, -100, -100, -100, -100],
      [-100, 1, 2, 2, -100, -100, -100, -100]
  ])
}
        │
        ▼
Feed batch into model (DistilBERT / mBERT)
───────────────────────────────
- input_ids → embeddings
- attention_mask → tells model which tokens to attend
- labels → used for loss calculation (ignore -100)



In [11]:
# === 4) PyTorch Datasets ===
class NERDataset(Dataset):
    def __init__(self, enc):
        self.enc = enc
    def __len__(self): return self.enc["input_ids"].shape[0]
    def __getitem__(self, idx):
        return {
            "input_ids": self.enc["input_ids"][idx],
            "attention_mask": self.enc["attention_mask"][idx],
            "labels": self.enc["labels"][idx]
        }

train_ds = NERDataset(train_enc)
val_ds   = NERDataset(val_enc)
test_ds  = NERDataset(test_enc)

batch_size = 16
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import ConcatDataset, DataLoader
from torch.optim import AdamW
from transformers import AutoModel, get_linear_schedule_with_warmup
from torchcrf import CRF
import time

# -----------------------
# 1️⃣ Define DistilBERT + CRF model
# -----------------------
class DistilBertCRF(nn.Module):
    def __init__(self, model_name, num_labels, dropout=0.1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size # Number of hidden units from DistilBERT. Typically 768 for base models.
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)  # emission scores
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)  # (B, L, H)
        emissions = self.classifier(sequence_output)              # (B, L, num_labels)

        if labels is not None:
            # Create mask for valid tokens
            mask = labels.ne(-100)
            mask[:, 0] = True  # Ensure first timestep is always unmasked

            # Replace masked labels with a safe value 0
            safe_labels = labels.clone()
            safe_labels[~mask] = 0
            safe_labels[:, 0] = safe_labels[:, 0].clamp(0, self.crf.num_tags-1)

            log_likelihood = self.crf(emissions, safe_labels, mask=mask, reduction='mean') # CRF computes sequence-level log-likelihood over the batch.
            return -log_likelihood # loss to minimize for training
        else:
            mask = attention_mask.bool() # Only attend to real tokens (ignore padding)
            mask[:, 0] = True  # first timestep must be on
            return self.crf.decode(emissions, mask=mask)  # Best label sequences, decoding to find most likely label sequence per sentence
            # Output shape: [batch_size, seq_len] (list of predicted labels per token).


Scheduler: Linear WarmupPurpose: control learning rate over training steps.

Transformers usually train better if the learning rate starts small and gradually increases → warmup.

Parameters:
    num_training_steps
    Total number of optimization steps = epochs * batches_per_epoch.
    num_warmup_steps
    Initial steps where learning rate linearly increases from 0 → lr
    Here, 10% of total steps (0.1 * num_training_steps)

How it works:
    Warmup phase: LR grows linearly → prevents sudden large updates at the start.
    Decay phase: After warmup, LR decreases linearly → stabilizes training and improves convergence.

In [None]:
# -----------------------
# 2️⃣ Prepare DataLoaders
# -----------------------
full_train_dataset = ConcatDataset([train_ds, val_ds])
batch_size = 16
train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

# -----------------------
# 3️⃣ Initialize model, optimizer, scheduler
# -----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "distilbert-base-multilingual-cased"
num_labels = len(label2id)

model = DistilBertCRF(model_name, num_labels).to(device)
epochs = 5

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01) # penalizes large weights to reduce overfitting.
num_training_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)

scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))

# -----------------------
# 4️⃣ Training Loop (GPU-safe)
# -----------------------
for epoch in range(epochs):
    model.train()
    start_time = time.time()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):
            loss = model(input_ids, attention_mask=attention_mask, labels=labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    elapsed = time.time() - start_time
    print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.4f} | Time: {elapsed/60:.2f} min")

print(f"Training completed in {(time.time() - start_time)/60:.2f} minutes")

# -----------------------
# 5️⃣ Save model
# -----------------------
torch.save(model.state_dict(), "distilbert_crf_ner.pth")

2025-08-18 13:45:36.250332: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755524736.590415      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755524736.688254      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
  with torch.cuda.amp.autocast(enabled=(device.type == "cuda")):


Epoch 1/5 | Loss: 10.4489 | Time: 11.97 min
Epoch 2/5 | Loss: 3.5365 | Time: 11.93 min
Epoch 3/5 | Loss: 2.3943 | Time: 11.91 min
Epoch 4/5 | Loss: 1.6262 | Time: 11.89 min
Epoch 5/5 | Loss: 1.0462 | Time: 11.91 min
Training completed in 11.91 minutes


In [14]:
from seqeval.metrics import classification_report
import numpy as np

# Make sure model is in evaluation mode
model.eval()

true_labels_all = []
pred_labels_all = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Get predictions from CRF
        predictions = model(input_ids, attention_mask=attention_mask, labels=None)  # list of lists

        # Convert label ids to actual labels
        for i in range(len(labels)):
            true_seq = []
            pred_seq = []

            for j in range(labels[i].size(0)):
                if labels[i, j] == -100:
                    continue  # skip padding
                true_seq.append(id2label[labels[i, j].item()])
                pred_seq.append(id2label[predictions[i][j]])

            true_labels_all.append(true_seq)
            pred_labels_all.append(pred_seq)

# Print classification report
print(classification_report(true_labels_all, pred_labels_all))


              precision    recall  f1-score   support

        DATE       0.92      0.94      0.93      2614
         LOC       0.84      0.86      0.85     10638
        TIME       0.87      0.90      0.88       557

   micro avg       0.86      0.88      0.87     13809
   macro avg       0.87      0.90      0.89     13809
weighted avg       0.86      0.88      0.87     13809



In [None]:
from sklearn.metrics import classification_report

# # Flatten sequences (BIO format preserved)
# y_true = [label for seq in all_true for label in seq]
# y_pred = [label for seq in all_pred for label in seq]

y_true = [label for seq in true_labels_all for label in seq]
y_pred = [label for seq in pred_labels_all for label in seq]

# Print classification report with BIO tags
print(classification_report(y_true, y_pred, digits=4, zero_division=0))


              precision    recall  f1-score   support

      B-DATE     0.9431    0.9602    0.9516      2538
       B-LOC     0.8913    0.9089    0.9000     10280
      B-TIME     0.9293    0.9546    0.9418       551
      I-DATE     0.9562    0.9664    0.9613     16296
       I-LOC     0.8902    0.9098    0.8999     59371
      I-TIME     0.9528    0.9804    0.9664      2758
           O     0.9942    0.9927    0.9934   1162839

    accuracy                         0.9876   1254633
   macro avg     0.9367    0.9533    0.9449   1254633
weighted avg     0.9877    0.9876    0.9877   1254633

