In [1]:
# Restart session before running
!pip install transformers==4.30.2 peft==0.10.0 datasets seqeval torch
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.2)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5

In [2]:
!pip uninstall -y accelerate
!pip install -U accelerate==0.27.2

Found existing installation: accelerate 1.9.0
Uninstalling accelerate-1.9.0:
  Successfully uninstalled accelerate-1.9.0
Collecting accelerate==0.27.2
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.27.2


In [3]:
!pip uninstall -y numpy
# !pip install numpy==1.24.3
!pip install numpy==1.26.0

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.26.0
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.0 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9

In [1]:
import torch, numpy as np
print("NumPy now:", np.__version__)     # should be 1.24.3
# Test round-trip conversion
a = np.arange(5, dtype=np.float32)
t = torch.from_numpy(a)
print("Round-trip okay:", t.numpy())


NumPy now: 1.26.0
Round-trip okay: [0. 1. 2. 3. 4.]


In [2]:
# Restore missing aliases
np.dtypes = np.core.numerictypes
np.object = object
np.float  = float
# (add others as needed)

from transformers import BertTokenizerFast  # or whatever was failing


In [2]:
import os
import random

### Parsing data & splitting the tokens and tags based on whitespace

In [3]:
def load_bioes_pairs(path):
    sents, tags = [], []
    # 1) Read all non-blank lines
    with open(path, encoding='utf8') as f:
        lines = [line.rstrip() for line in f if line.strip()]

    # 2) Process in pairs
    for i in range(0, len(lines), 2):
        sent_line = lines[i]
        tag_line  = lines[i+1] if i+1 < len(lines) else ""
        # 3) Split into tokens and tags
        tokens = sent_line.split()
        tag_seq = tag_line.split()
        if len(tokens) != len(tag_seq):
            # Warn if lengths mismatch
            print(f"Line-pair starting at {2*i+1}: token/tag count mismatch "
                  f"({len(tokens)} vs {len(tag_seq)})")
        sents.append(tokens)
        tags.append(tag_seq)

    return sents, tags


In [5]:
sentences, labels = load_bioes_pairs("hard_ner.txt")

for idx, (tok_seq, tag_seq) in enumerate(zip(sentences, labels)):
    print(f"\nSentence {idx}:")
    print("Tokens:", tok_seq)
    print("Tags:  ", tag_seq)


Sentence 0:
Tokens: ['কাজী', 'নজরুল', 'ইসলামের', 'ডাক', 'নাম', 'ছিল', 'দুখু', 'মিয়া']
Tags:   ['B-PER', 'I-PER', 'E-PER', 'O', 'O', 'O', 'B-PER', 'E-PER']

Sentence 1:
Tokens: ['১৯১৭', 'সাল', 'পর্যন্ত', 'এখানেই', 'পড়াশোনা', 'করেন']
Tags:   ['B-DATE', 'E-DATE', 'O', 'O', 'O', 'O']

Sentence 2:
Tokens: ['এ', 'সময়', 'নজরুলের', 'বাহিনীর', 'ইরাক', 'যাবার', 'কথা', 'ছিল']
Tags:   ['O', 'O', 'S-PER', 'O', 'S-LOC', 'O', 'O', 'O']

Sentence 3:
Tokens: ['১৯২০', 'খ্রিস্টাব্দে', 'যুদ্ধ', 'শেষ', 'হলে', '৪৯', 'বেঙ্গল', 'রেজিমেন্ট', 'ভেঙে', 'দেয়া', 'হয়']
Tags:   ['B-DATE', 'E-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence 4:
Tokens: ['১৯২০', 'খ্রিস্টাব্দের', 'জুলাই', '১২', 'তারিখে', 'নবযুগ', 'নামক', 'একটি', 'সান্ধ্য', 'দৈনিক', 'পত্রিকা', 'প্রকাশিত', 'হওয়া', 'শুরু', 'করে']
Tags:   ['B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'E-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence 5:
Tokens: ['একইসাথে', 'মুজফ্\u200cফর', 'আহমদের', 'সাথে', 'বিভিন্ন', 'রাজনৈতিক', 'সভা', 'সমি

### Creating dataset artifact

In [6]:
import os
import random

# from your_module import load_bioes_pairs

def split_and_log_multiple_datasets(dataset_dict, project="bangla-ner", out_root="."):
    """
    Split multiple datasets into train/val/test and write them to disk.
    NOTE: 'project' is ignored (kept only for drop-in compatibility). No W&B used.

    Args:
        dataset_dict (dict): {
            "dataset_name": {
                "path": "<path/to/file>",
                "source": "<optional metadata, ignored>"
            },
            ...
        }
        project (str): Ignored. Kept for API compatibility.
        out_root (str): Root directory where split files will be saved.

    Returns:
        dict: {dataset_name: { "training": [...], "validation": [...], "test": [...] }}
              where each list item is a (tokens, tags) tuple.
    """
    all_splits = {}

    for dataset_name, dataset_info in dataset_dict.items():
        file_path = dataset_info["path"]

        # 1) Parse tokens & tags into Python lists
        sents, tags = load_bioes_pairs(file_path)

        # (optional) sanity check
        if len(sents) != len(tags):
            raise ValueError(
                f"[{dataset_name}] Number of sentences ({len(sents)}) "
                f"!= number of tag sequences ({len(tags)})"
            )

        # 2) Zip into examples and shuffle at the sentence level
        examples = list(zip(sents, tags))
        random.shuffle(examples)

        # 3) Compute split indices (80/10/10)
        n = len(examples)
        train_end = int(n * 0.8)
        val_end   = int(n * 0.9)
        splits = {
            "training":   examples[:train_end],
            "validation": examples[train_end:val_end],
            "test":       examples[val_end:]
        }

        all_splits[dataset_name] = splits

        # 4) Write each split back out in token␣tag format
        out_dir = os.path.join(out_root, dataset_name)
        os.makedirs(out_dir, exist_ok=True)

        for split_name, ex_list in splits.items():
            out_path = os.path.join(out_dir, f"{split_name}.txt")
            with open(out_path, "w", encoding="utf-8") as fout:
                for tok_seq, tag_seq in ex_list:
                    # ensure equal length per example
                    if len(tok_seq) != len(tag_seq):
                        raise ValueError(
                            f"[{dataset_name}:{split_name}] token/tag length mismatch "
                            f"({len(tok_seq)} vs {len(tag_seq)})"
                        )
                    for tok, tag in zip(tok_seq, tag_seq):
                        fout.write(f"{tok} {tag}\n")
                    fout.write("\n")  # blank line between sentences

    print("✅ Done. Wrote train/val/test files locally for all datasets (no W&B).")
    return all_splits


In [7]:
dataset_dict = {
    "NER Dump (Active)": {
        "path": "/content/hard_ner.txt",
        "source": (
            "GitHub: https://github.com/Foysal87/Bangla-NLP-Dataset\n"
            "Drive: https://drive.google.com/file/d/1AT4FkyqyioLIc6wy8mo7cv_Q2ZTnGS_1/view\n"
            "Author: Towhid Ahmed Foysal"
        )
    }
    # "bangla_papers": {
    #     "path": "bangla_papers.txt",
    #     "source": (
    #         "GitHub: https://github.com/Foysal87/Bangla-NLP-Dataset\n"
    #         "Research paper collection dataset from the same repo"
    #     )
    # }
}


In [8]:
# 1) Split & log your datasets
all_splits = split_and_log_multiple_datasets(dataset_dict)

✅ Done. Wrote train/val/test files locally for all datasets (no W&B).


### Logging a Table of sample sentences
To peek at a few examples in the W&B UI on training run

In [9]:
import csv
from typing import List, Tuple, Optional

def log_token_tag_table(
    split_examples: List[Tuple[List[str], List[str]]],
    project: str = "bangla-ner",          # ignored; kept for drop-in compatibility
    job_type: str = "inspect-samples",    # ignored
    key: str = "train_samples_table",     # ignored
    max_samples: int = 20,
    out_path: Optional[str] = None,       # e.g., "train_samples_table.csv"
    print_rows: bool = True
):
    """
    Given a list of (tokens, tags) tuples, prepare up to max_samples rows and
    optionally write them to CSV and/or print to console. No W&B used.

    Returns:
        List[dict]: [{"tokens": "...", "tags": "..."}, ...]
    """
    rows = []
    for i, (tokens, tags) in enumerate(split_examples[:max_samples], start=1):
        if len(tokens) != len(tags):
            raise ValueError(
                f"Example {i}: token/tag length mismatch ({len(tokens)} vs {len(tags)})"
            )
        rows.append({
            "tokens": " ".join(tokens),
            "tags":   " ".join(tags),
        })

    if out_path:
        with open(out_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["tokens", "tags"])
            writer.writeheader()
            writer.writerows(rows)

    if print_rows:
        for i, r in enumerate(rows, start=1):
            print(f"#{i}")
            print("TOKENS:", r["tokens"])
            print("TAGS:  ", r["tags"])
            print("-" * 40)

    return rows


In [10]:
train_split = all_splits["NER Dump (Active)"]["training"]

# Now this will work:
rows = log_token_tag_table(train_split, max_samples=25, out_path="train_samples_table.csv")

#1
TOKENS: এই শহরের জনসংখ্যার ১১ হল ৬ বছর বা তার কম বয়সী
TAGS:   O S-LOC O O O B-DATE E-DATE O O O O
----------------------------------------
#2
TOKENS: এটি শেষ হয় ১৩০৪ সালে
TAGS:   O O O B-DATE E-DATE
----------------------------------------
#3
TOKENS: সকাল ৮টা থেকে রাত ৮টা পর্যন্ত প্রতি আধ ঘণ্টা পর পর জানিয়ে দেয়া হয় ঢাকার ট্রাফিক জ্যাম বিষয়ক তথ্য ঢাকার চাকা
TAGS:   O O O O O O O O O O O O O O S-LOC O O O O S-LOC O
----------------------------------------
#4
TOKENS: মুহাম্মাদ সমগ্র বাহিনীর প্রধান হিসেবে দায়িত্ব নেন
TAGS:   S-PER O O O O O O
----------------------------------------
#5
TOKENS: প্রতিরোধ ২০১১ সাল পর্যন্ত হেপাটাইটিস সি এর কোন টিকা নাই
TAGS:   O B-DATE E-DATE O O O O O O O
----------------------------------------
#6
TOKENS: সারা ভারতের সাক্ষরতার হার ৫৯ ৫ তার চাইতে রাজুরা এর সাক্ষরতার হার বেশি
TAGS:   O S-LOC O O O O O O S-LOC O O O O
----------------------------------------
#7
TOKENS: পরবর্তীতে ১৯৬৫ সাল থেকে পৃথকভাবে সম্প্রচারিত হয়
TAGS:   O B-DATE E-DATE O O O O
--

In [14]:
# # 2) Log a table of training samples for quick inspection
# train_examples = all_splits["NER Dump (Active)"]["training"]
# log_token_tag_table(
#     train_examples,
#     key="NER_Dump_Active_train_samples"
# )

## Preprocessing



1.   Pull your split files back down from the W&B artifact
2.   Turn them into a HuggingFace Dataset
3.   Apply tokenize_align function to each split—without ever re-parsing the raw CoNLL file



### Using Fast tokenizer to preprocess the data

AutoTokenizer and BertTokenizerFast are part of the Hugging Face Transformers library:

AutoTokenizer is a generic tokenizer class that inspects a model identifier (e.g., "bert-base-cased" or "sagorsarker/bangla-bert-base") and automatically instantiates the correct tokenizer under the hood. It chooses a “fast” implementation (Rust-based) whenever available, falling back to a pure-Python tokenizer otherwise.

BertTokenizerFast is the specialized, high-performance tokenizer for BERT models. It inherits from PreTrainedTokenizerFast (the base fast-tokenizer class) and implements WordPiece subword tokenization specifically for BERT’s vocabulary format. Being a “fast” tokenizer, it relies on the Hugging Face Tokenizers Rust library for efficient, batched tokenization and alignment utilities.


**Problem faced: KeyboardInterrupt**

AutoTokenizer.from_pretrained(...) fetches model files when you call from_pretrained("sagorsarker/bangla-bert-base"), Hugging Face checks your local cache.

If it’s not already downloaded, it makes HTTPS requests to pull down several files (e.g. tokenizer.json, vocab.txt, config files).

Depending on network speed or server response, this can take several seconds—or even tens of seconds the first time.

Long network I/O can feel like a “hang”

During that download, your Colab cell shows no output until everything is fetched.

If you press ▶️ Stop (or your browser hits a timeout), Python raises KeyboardInterrupt to abort the process.


**How to avoid it**

Use the “Fast” tokenizer:

from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(
    "sagorsarker/bangla-bert-base", use_fast=True
)

The Rust-based fast tokenizers tend to load more quickly and reliably.

Cache locally: Run once without interruption so files are stored in ~/.cache/huggingface/transformers. Subsequent runs will load almost instantly.

Offline mode: After the first successful download, you can do

tokenizer = AutoTokenizer.from_pretrained(
    "sagorsarker/bangla-bert-base", local_files_only=True
)
which skips any network calls and fails fast if files aren’t present.

In [11]:
from transformers import BertTokenizerFast

# This will download and cache the tokenizer the first time,
# then load locally thereafter (fast).
tokenizer = BertTokenizerFast.from_pretrained(
    "sagorsarker/bangla-bert-base",
    use_fast=True,
    #local_files_only=False  # set True if you already have it cached
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

In [12]:
# Collect all unique tag strings
unique_labels = sorted({lab for seq in labels for lab in seq})

# Map each label string to an integer ID
label2id = {lab: idx for idx, lab in enumerate(unique_labels)}
id2label = {idx: lab for lab, idx in label2id.items()}

print(label2id)

{'B-DATE': 0, 'B-LOC': 1, 'B-ORG': 2, 'B-PER': 3, 'E-DATE': 4, 'E-LOC': 5, 'E-ORG': 6, 'E-PER': 7, 'I-DATE': 8, 'I-LOC': 9, 'I-ORG': 10, 'I-PER': 11, 'O': 12, 'S-DATE': 13, 'S-LOC': 14, 'S-OBJ': 15, 'S-ORG': 16, 'S-PER': 17}


#### 2. Convert each CoNLL file into a HF Dataset

In [13]:
def read_conll(path):
    tokens_list, tags_list = [], []
    with open(path, encoding="utf-8") as f:
        toks, tgs = [], []
        for line in f:
            line = line.strip()
            if not line:
                if toks:
                    tokens_list.append(toks)
                    tags_list.append(tgs)
                    toks, tgs = [], []
                continue
            tok, tag = line.split()
            toks.append(tok)
            tgs.append(tag)
    return {"tokens": tokens_list, "ner_tags": tags_list}

# Build three HF Datasets
# train_data = read_conll(train_path)
# val_data   = read_conll(val_path)
# test_data  = read_conll(test_path)

# train_ds = Dataset.from_dict(train_data)
# val_ds   = Dataset.from_dict(val_data)
# test_ds  = Dataset.from_dict(test_data)



In [14]:
def tokenize_align(batch):
    # 1. Tokenize the batch, keeping track of word boundaries
    tokenized = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128
    )

    aligned_labels = []
    # 2. Iterate over each example’s encoding + original tag sequence
    for i, (encoding, orig_tags) in enumerate(zip(tokenized.encodings, batch["ner_tags"])):
        word_ids = encoding.word_ids  # per-example list of word_idx or None
        label_ids = []
        prev_word_idx = None

        # 3. Align each subword token to its correct label
        for word_idx in word_ids:
            if word_idx is None:
                # special tokens or padding
                label_ids.append(-100)
            elif word_idx != prev_word_idx:
                # first subword of the word
                label_ids.append(label2id[orig_tags[word_idx]])
            else:
                # subsequent subword of the same word
                label_ids.append(label2id[orig_tags[word_idx]])
            prev_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized["labels"] = aligned_labels
    return tokenized


In [15]:
from datasets import Dataset

# # 1) Start a W&B run
# run = wandb.init(project="bangla-ner", job_type="preprocess")

# # 2) Retrieve your splits artifact
# raw_art = run.use_artifact("multi-dataset-split:latest", type="dataset")

# 3) Download each split file
# train_path = raw_art.get_path("NER Dump (Active)/training.txt").download()
# val_path   = raw_art.get_path("NER Dump (Active)/validation.txt").download()
# test_path  = raw_art.get_path("NER Dump (Active)/test.txt").download()
dataset_name = "NER Dump (Active)"
base_dir = os.path.join(".", dataset_name)
paths = {
    split: os.path.join(base_dir, f"{split}.txt")
    for split in ("training", "validation", "test")
}

In [16]:
# datasets = {}
# for split, path in paths.items():
#     data_dict = read_conll(path)
#     datasets[split] = Dataset.from_dict(data_dict)

datasets = {
    split: Dataset.from_dict(read_conll(path))
    for split, path in paths.items()
}

In [17]:
# make sure label2id is defined in scope
# label2id = {"O":0, "B-PER":1, …}

# # 4) Tokenize & align in batched mode
# train_tokenized = train_ds.map(
#     tokenize_align,
#     batched=True,
#     remove_columns=["tokens", "ner_tags"],
# )

# val_tokenized = val_ds.map(
#     tokenize_align,
#     batched=True,
#     remove_columns=["tokens", "ner_tags"],
# )

# test_tokenized = test_ds.map(
#     tokenize_align,
#     batched=True,
#     remove_columns=["tokens", "ner_tags"],
# )
from datasets import Dataset, DatasetDict

# 4) Tokenize each split
tokenized = DatasetDict({
    split: ds.map(tokenize_align, batched=True, remove_columns=["tokens","ner_tags"])
    for split, ds in datasets.items()
})

Map:   0%|          | 0/1676 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

### Tokenized/Preprocess artifact

In [18]:
import os
import json
import shutil  # only used if you enable zipping
# from datasets import Dataset  # only needed if you're constructing datasets here

# tokenized is assumed to be: {"training": Dataset, "validation": Dataset, "test": Dataset}
out_root = "tokenized"
os.makedirs(out_root, exist_ok=True)

manifest = {}

for split, ds in tokenized.items():
    path = os.path.join(out_root, split)
    ds.save_to_disk(path)
    manifest[split] = {"num_rows": ds.num_rows, "path": path}

# Save a simple manifest for reference
with open(os.path.join(out_root, "manifest.json"), "w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

print("✅ Saved splits to disk.")
print({k: v["num_rows"] for k, v in manifest.items()})

# # Optional: zip the whole directory (set to True if you want an archive)
# MAKE_ARCHIVE = False
# if MAKE_ARCHIVE:
#     archive_path = shutil.make_archive(out_root, "zip", root_dir=out_root)
#     print("📦 Zipped to:", archive_path)


Saving the dataset (0/1 shards):   0%|          | 0/1676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/209 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/210 [00:00<?, ? examples/s]

✅ Saved splits to disk.
{'training': 1676, 'validation': 209, 'test': 210}


In [19]:
from datasets import load_from_disk
train_ds = load_from_disk("tokenized/training")
val_ds   = load_from_disk("tokenized/validation")
test_ds  = load_from_disk("tokenized/test")


##### TODO: add subword_tokens_tags column:

In [20]:
# ---- VISUALIZATION STEP (no W&B) ----
import os
import csv

max_samples = 20

# Read a few examples from your local training split
examples = read_conll(paths["training"])
tokens_list, tags_list = examples["tokens"], examples["ner_tags"]

# Tokenize the same slice you want to inspect
batch = {
    "tokens":   tokens_list[:max_samples],
    "ner_tags": tags_list[:max_samples],
}
tok_out = tokenize_align(batch)

# Build rows for inspection
rows = []
for i, (toks, tags) in enumerate(zip(batch["tokens"], batch["ner_tags"]), start=1):
    sub_tokens = tokenizer.convert_ids_to_tokens(tok_out["input_ids"][i-1])
    label_ids  = tok_out["labels"][i-1]
    rows.append({
        "idx": i,
        "orig_tokens": " ".join(toks),
        "orig_tags": " ".join(tags),
        "subword_tokens": " ".join(sub_tokens),
        "label_ids": " ".join(str(x) for x in label_ids),
    })

# # Pretty-print to console
# for r in rows:
#     print(f"#{r['idx']}")
#     print("TOKENS:     ", r["orig_tokens"])
#     print("TAGS:       ", r["orig_tags"])
#     print("SUBTOKENS:  ", r["subword_tokens"])
#     print("LABEL_IDS:  ", r["label_ids"])
#     print("-" * 60)

# Save to CSV for later inspection
os.makedirs("viz", exist_ok=True)
csv_path = os.path.join("viz", "tokenization_spotcheck.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f, fieldnames=["idx", "orig_tokens", "orig_tags", "subword_tokens", "label_ids"]
    )
    writer.writeheader()
    writer.writerows(rows)
print("✅ Wrote:", csv_path)

# Optional: show as a DataFrame in notebooks
try:
    import pandas as pd
    from IPython.display import display
    df = pd.DataFrame(rows)
    display(df)
except Exception:
    pass


✅ Wrote: viz/tokenization_spotcheck.csv


Unnamed: 0,idx,orig_tokens,orig_tags,subword_tokens,label_ids
0,1,এই শহরের জনসংখ্যার ১১ হল ৬ বছর বা তার কম বয়সী,O S-LOC O O O B-DATE E-DATE O O O O,[CLS] এই শহরের জনস ##ংখ ##যার [UNK] হল ৬ বছর ব...,-100 12 14 12 12 12 12 12 0 4 12 12 12 12 12 1...
1,2,এটি শেষ হয় ১৩০৪ সালে,O O O B-DATE E-DATE,[CLS] এটি শেষ হয [UNK] সালে [SEP] [PAD] [PAD] ...,-100 12 12 12 0 4 -100 -100 -100 -100 -100 -10...
2,3,সকাল ৮টা থেকে রাত ৮টা পর্যন্ত প্রতি আধ ঘণ্টা প...,O O O O O O O O O O O O O O S-LOC O O O O S-LOC O,[CLS] সকাল ৮ ##টা থেকে রাত ৮ ##টা পর ##যন ##ত ...,-100 12 12 12 12 12 12 12 12 12 12 12 12 12 12...
3,4,মুহাম্মাদ সমগ্র বাহিনীর প্রধান হিসেবে দায়িত্ব...,S-PER O O O O O O,[CLS] মহা ##মম ##াদ সম ##গর বাহিনীর পর ##ধান হ...,-100 17 17 17 12 12 12 12 12 12 12 12 12 12 -1...
4,5,প্রতিরোধ ২০১১ সাল পর্যন্ত হেপাটাইটিস সি এর কোন...,O B-DATE E-DATE O O O O O O O,[CLS] পরত ##ির ##ে ##াধ [UNK] সাল পর ##যন ##ত ...,-100 12 12 12 12 0 4 12 12 12 12 12 12 12 12 1...
5,6,সারা ভারতের সাক্ষরতার হার ৫৯ ৫ তার চাইতে রাজুর...,O S-LOC O O O O O O S-LOC O O O O,[CLS] সারা ভারতের সাক ##ষর ##তার হার [UNK] ৫ ত...,-100 12 14 12 12 12 12 12 12 12 12 14 14 12 12...
6,7,পরবর্তীতে ১৯৬৫ সাল থেকে পৃথকভাবে সম্প্রচারিত হয়,O B-DATE E-DATE O O O O,[CLS] পরব ##রত ##ীতে [UNK] সাল থেকে পথ ##কভাবে...,-100 12 12 12 0 4 12 12 12 12 12 12 12 12 -100...
7,8,বাংলাদেশের ২০১২ সালের বন্যপ্রাণী আইনের তফসিল ১...,S-LOC B-DATE E-DATE O O O O O O O O,[CLS] বাংলাদেশের [UNK] সালের বন ##য ##পরা ##ণী...,-100 14 0 4 12 12 12 12 12 12 12 12 12 12 12 1...
8,9,দ্বিতীয় খন্ড প্রকাশিত হয় ১৮৮৭ সালের ২৪ জানুয...,O O O O B-DATE I-DATE I-DATE E-DATE,[CLS] দ ##বিত ##ী ##য খন ##ড পর ##কাশিত হয [UN...,-100 12 12 12 12 12 12 12 12 12 0 8 8 4 4 4 -1...
9,10,এই শহরের জনসংখ্যার ১১ হল ৬ বছর বা তার কম বয়সী,O S-LOC O O O B-DATE E-DATE O O O O,[CLS] এই শহরের জনস ##ংখ ##যার [UNK] হল ৬ বছর ব...,-100 12 14 12 12 12 12 12 0 4 12 12 12 12 12 1...


### Define Metrics Computation

In [21]:
from seqeval.metrics import f1_score, accuracy_score, classification_report

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)
    true = p.label_ids

    # Convert to label strings, filtering -100
    true_labels = [
        [id2label[l] for l in seq if l != -100]
        for seq in true
    ]
    pred_labels = [
        [id2label[p_] for (p_, l) in zip(seq_p, seq_t) if l != -100]
        for seq_p, seq_t in zip(preds, true)
    ]

    return {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "f1": f1_score(true_labels, pred_labels)
    }


### Log Model

In [22]:
from datasets import load_from_disk, DatasetDict
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    EarlyStoppingCallback
)
# from transformers.integrations import WandbCallback
from sklearn.model_selection import KFold


In [None]:
# Shell: enable automatic checkpoint artifact logging
!export WANDB_PROJECT="bangla-ner"
!export WANDB_LOG_MODEL="checkpoint"


In [24]:
import os
import numpy as np
from datasets import load_from_disk
from sklearn.model_selection import KFold
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

# 1) Load your tokenized splits from LOCAL disk (no W&B)
tokenized_dir = "./tokenized"  # change if needed
train_ds = load_from_disk(os.path.join(tokenized_dir, "training"))
val_ds   = load_from_disk(os.path.join(tokenized_dir, "validation"))
test_ds  = load_from_disk(os.path.join(tokenized_dir, "test"))

# 2) Configure 5-fold CV on the training set only
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_probs, all_labels, all_sents = [], [], []

# Use numeric indices explicitly to avoid type issues
indices = np.arange(len(train_ds))

for fold, (train_idx, val_idx) in enumerate(kf.split(indices), start=1):
    print(f"=== Fold {fold}/5 ===")

    # 3) Select per-fold train/validation subsets
    fold_train = train_ds.select(train_idx)
    fold_val   = train_ds.select(val_idx)
    print(f"  Train indices count: {len(fold_train)}")
    print(f"  Val   indices count: {len(fold_val)}\n")

    # 4) Fresh model for each fold
    model = AutoModelForTokenClassification.from_pretrained(
        "sagorsarker/bangla-bert-base",
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    # 5) Training args — NO W&B (report_to="none")
    args = TrainingArguments(
        output_dir=f"./cv_fold_{fold}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,

        report_to="none",             # <- key change
        logging_strategy="epoch",
        save_total_limit=2,

        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        seed=42 + fold,
    )

    # 6) Trainer (no WandbCallback)
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=fold_train,
        eval_dataset=fold_val,
        tokenizer=tokenizer,          # assumes you defined this earlier
        compute_metrics=compute_metrics,  # assumes you defined this earlier
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=1)
        ]
    )

    # 7) Train & evaluate this fold
    trainer.train()

    # 8) Save best checkpoint path locally (already saved by HF; this just echoes)
    best_ckpt = trainer.state.best_model_checkpoint or args.output_dir
    print(f"Best checkpoint for fold {fold}: {best_ckpt}")

    # 9) Collect predictions/labels
    preds = trainer.predict(fold_val)
    all_probs.append(preds.predictions)   # (n_val, seq_len, n_labels)
    all_labels.append(preds.label_ids)    # (n_val, seq_len)

    # 10) Keep the original tokens for later analysis
    fold_sents = fold_val["orig_tokens"]       # list[list[str]]
    all_sents.extend(fold_sents)

print("✅ 5-fold CV complete (no W&B).")


=== Fold 1/5 ===
  Train indices count: 1340
  Val   indices count: 336



Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and a

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6333,0.404682,0.898088,0.668524
2,0.2647,0.316023,0.919793,0.745863
3,0.134,0.297338,0.923099,0.764248
4,0.0735,0.305624,0.933017,0.794428
5,0.0466,0.292962,0.936898,0.810179


Best checkpoint for fold 1: ./cv_fold_1/checkpoint-420


ValueError: Column 'orig_tokens' doesn't exist.

In [None]:

# 1) Start a single W&B run for 5-fold CV
run = wandb.init(project="bangla-ner", job_type="5fold-cv")

# 2) Reuse your preprocessed tokenized artifact
tok_art = run.use_artifact("multi-dataset-tokenized:latest", type="dataset")
tokenized_dir = tok_art.download()  # fetches ./tokenized/ locally :contentReference[oaicite:4]{index=4}

# 3) Load splits from disk into HF Datasets
train_ds = load_from_disk(os.path.join(tokenized_dir, "training"))
val_ds   = load_from_disk(os.path.join(tokenized_dir, "validation"))
test_ds  = load_from_disk(os.path.join(tokenized_dir, "test"))

# 4) Configure 5-fold CV on the training set only
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_probs, all_labels, all_sents = [], [], []
for fold, (train_idx, val_idx) in enumerate(kf.split(train_ds), start=1):
    print(f"=== Fold {fold}/5 ===")
    # print(f"=== Fold {fold + 1}/{n_splits} ===")


    # 5) Select per-fold train/validation subsets
    fold_train = train_ds.select(train_idx)
    fold_val   = train_ds.select(val_idx)
    print(f"  Train indices count: {len(fold_train)}")
    print(f"  Val   indices count: {len(fold_val)}\n")

    # 6) Initialize a fresh BERT model for token classification
    model = AutoModelForTokenClassification.from_pretrained(
        "sagorsarker/bangla-bert-base",
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    # 7) Define TrainingArguments with W&B integration
    args = TrainingArguments(
        output_dir=f"./cv_fold_{fold}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,

        report_to="wandb",                  # Auto-log to W&B :contentReference[oaicite:5]{index=5}
        run_name=f"fold-{fold}",            # Name each fold run in UI
        # logging_dir=f"./logs/fold_{fold}",
        logging_strategy="epoch",   # log metrics each epoch
        # logging_steps=0,
        save_total_limit=2,

        # log_model=True,
        # log_model_name="bangla-bert-base",

        learning_rate=3e-5,                 # Hyperparameters
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        seed=42 + fold
    )


    # 8) Instantiate Trainer with WandbCallback & early stopping
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=fold_train,
        eval_dataset=fold_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[
            # WandbCallback(),              # handles train/val metric logging
            # ArtifactEpochCallback(),      # saves & logs artifact each epoch :contentReference[oaicite:3]{index=3}
            EarlyStoppingCallback(early_stopping_patience=1)
        ]
    )

    # 9) Train and evaluate on the fold
    trainer.train()


    # 10) Log best checkpoint as a Model Artifact
    best_ckpt = trainer.state.best_model_checkpoint
    trainer.save_model(best_ckpt)
    run.log_model(path=best_ckpt, name=f"bangla-ner-fold-{fold}")  # creates Model Artifact under “bangla-ner-fold-{fold}” :contentReference[oaicite:6]{index=6}

    preds  = trainer.predict(fold_val)
    all_probs.append(preds.predictions)      # shape (n_val, seq_len, n_labels)
    all_labels.append(preds.label_ids)        # shape (n_val, seq_len)

    # e) Extract the actual sentences (list of token lists) for this fold
    #    Assumes your dataset has a "tokens" column
    fold_sents = fold_val["tokens"]          # list of lists of strings
    all_sents.extend(fold_sents)

run.finish()  # ensures all data and artifacts are synced


In [None]:
print(trainer.state.best_model_checkpoint)


In [None]:
# Flatten arrays across folds

pred_probs = np.concatenate(all_probs, axis=0)
true_labels = np.concatenate(all_labels, axis=0)
pred_labels = np.argmax(pred_probs, axis=2)

# 11) After CV, you can optionally retrain on full training set with chosen hyperparams
#     final_trainer = Trainer(..., train_dataset=train_ds, eval_dataset=None, ...)
#     final_trainer.train()
#     run.log_model(path="./final_model", name="bangla-ner-final")

# 12) Evaluate final model on held-out validation & test splits
# val_metrics = trainer.predict(val_ds)
# test_metrics = trainer.predict(test_ds)
# run.log({"val_f1": val_metrics.metrics["eval_f1"], "test_f1": test_metrics.metrics["eval_f1"]})

# 13) Finish the W&B run

In [None]:
import wandb
run = wandb.init(project="bangla-ner", job_type="analysis")

table = wandb.Table(
    columns=["sentence", "token", "true_label", "pred_label", "pred_prob"]
)
for sent, probs, preds, trues in zip(sentences, pred_probs, pred_labels, true_labels):
    for token, p_row, p_label, t_label in zip(sent, probs, preds, trues):
        if t_label == -100:
            continue                  # skip ignores
        table.add_data(
            " ".join(sent),
            token,
            id2label[t_label],
            id2label[p_label],
            float(max(p_row)),
        )
run.log({"predictions_table": table})
run.finish()



In [None]:
!pip install cleanlab[fast]

In [None]:
### Code for hyperparameter tuning

from sklearn.model_selection import KFold

# 1) Load your splits
run.use_artifact("multi-dataset-tokenized:latest", type="dataset")
train_ds = load_from_disk("tokenized/training")
val_ds   = load_from_disk("tokenized/validation")
test_ds  = load_from_disk("tokenized/test")

# 2) Inner 5-fold CV on train_ds to pick hyperparams
best_hparams = None
best_score  = -float("inf")
for lr in [5e-5, 3e-5, 2e-5]:
  cv_scores = []
  kf = KFold(n_splits=5, shuffle=True, random_state=42)
  for train_idx, val_idx in kf.split(train_ds):
    ds_train = train_ds.select(train_idx)
    ds_val   = train_ds.select(val_idx)
    trainer  = build_trainer(hyperparams={"learning_rate": lr})
    trainer.train()
    metrics  = trainer.evaluate(ds_val)
    cv_scores.append(metrics["eval_f1"])
  avg_score = sum(cv_scores) / len(cv_scores)
  if avg_score > best_score:
    best_score  = avg_score
    best_hparams = {"learning_rate": lr}

# 3) Retrain on full training set with best_hparams
final_trainer = build_trainer(hyperparams=best_hparams)
final_trainer.train()



In [None]:

# 4) Evaluate on validation
val_metrics = final_trainer.predict(val_ds)
run.log(val_metrics.metrics)

# 5) (Optional) Final test
test_metrics = final_trainer.predict(test_ds)
run.log(test_metrics.metrics)

### In-Depth Evaluation & Error Analysis

In [None]:
preds  = trainer.predict(dataset.select(val_idx))
all_probs.append(preds.predictions)      # shape (n_val, seq_len, n_labels)
all_labels.append(preds.label_ids)       # shape (n_val, seq_len)
pred_probs = np.concatenate(all_probs, axis=0)
true_labels = np.concatenate(all_labels, axis=0)

In [None]:
# Run predictions on your validation set

# pred_output = trainer.predict(validation_dataset)
# logits       = pred_output.predictions       # shape (N, L, C)
# true_labels  = pred_output.label_ids         # shape (N, L)

In [None]:
# === 7. Filter out any unmatched instances before Cleanlab ===
valid_indices = [i for i, p in enumerate(pred_probs) if p is not None and len(p) == len(labels_int[i])]
filtered_labels = [labels_int[i] for i in valid_indices]
filtered_probs  = [pred_probs[i] for i in valid_indices]

# filtered_labels is List[List[int]], filtered_probs is List[np.ndarray]
labels_flat = np.concatenate(filtered_labels, axis=0)              # shape (sum Lᵢ,)
probs_flat  = np.concatenate(filtered_probs,  axis=0)             # shape (sum Lᵢ, K)


In [None]:

# === 8. Run Cleanlab on filtered training data ===
issues = find_label_issues_main(
    labels=labels_flat,
    pred_probs=probs_flat,
    return_indices_ranked_by="self_confidence"
)

offsets = np.cumsum([0] + [len(seq) for seq in filtered_labels[:-1]])
# Map back to original sentence/token indices
for flat_j in issues[:10]:
    sent_i = np.searchsorted(offsets, flat_j, side="right") - 1
    tok_i  = flat_j - offsets[sent_i]
    print(
        f"Sentence {sent_i}, Token '{sentences[sent_i][tok_i]}'",
        f"Gold={labels[sent_i][tok_i]}",
        f"Pred={id2label[probs_flat[flat_j].argmax()]}"
    )


In [None]:
# === 9. NER Error Analysis Using seqeval spans ===
from seqeval.metrics.sequence_labeling import get_entities

# Predict on validation set
texts = [" ".join(sentences[i]) for i in range(len(val_ds))]
true_labels = val_ds["ner_tags"]
pred_labels = []
for seq in val_ds["tokens"]:
    enc = tokenizer(
        seq,
        is_split_into_words=True,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )
    logits = model(**enc).logits.squeeze(0).detach().cpu().numpy()
    # take tokens length
    length = len(seq)
    preds = [id2label[idx] for idx in logits.argmax(-1)[1:length+1]]
    pred_labels.append(preds)

# Compute span errors per sentence
print("text	GOLD	PRED	SPAN_ERROR_TYPE")
for text, gold_seq, pred_seq in zip(texts, true_labels, pred_labels):
    gold_spans = get_entities(gold_seq)
    pred_spans = get_entities(pred_seq)
    # identify errors
    # false negatives
    for span in gold_spans:
        if span not in pred_spans:
            err_type = 'FN'  # missed span
            print(f"{text}	{span}	None	{err_type}")
    # false positives
    for span in pred_spans:
        if span not in gold_spans:
            err_type = 'FP'  # spurious span
            print(f"{text}	None	{span}	{err_type}")
    # type errors: spans with same span but different label
    for g in gold_spans:
        for p in pred_spans:
            if g[0]==p[0] and g[1]==p[1] and g[2]!=p[2]:
                err_type = 'TYPE'  # wrong entity type
                print(f"{text}	{g}	{p}	{err_type}")

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# 1. After your final Trainer.predict (or on your held-out test set):
preds_output = trainer.predict(test_ds)
preds = np.argmax(preds_output.predictions, axis=2)
labels = preds_output.label_ids

# 2. Convert IDs → label strings, filtering out the special –100 tags:
true_labels = [
    [id2label[label_id] for label_id in label_seq if label_id != -100]
    for label_seq in labels
]
pred_labels = [
    [id2label[pred_id]   for (pred_id,label_id) in zip(pred_seq, label_seq) if label_id != -100]
    for pred_seq, label_seq in zip(preds, labels)
]

# 3. Compute scores:
precision = precision_score(true_labels, pred_labels)
recall    = recall_score   (true_labels, pred_labels)
f1        = f1_score       (true_labels, pred_labels)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1‐Score:  {f1:.4f}")

# 4. (Optional) Detailed per‐class breakdown:
print(classification_report(true_labels, pred_labels))
