In [17]:
# ============================================
# 1. INSTALL DEPENDENCIES
# ============================================

print("Installing dependencies...")
!pip install -q -U pip
!pip install -q -U "transformers>=4.31.0" "huggingface_hub>=0.18.0"
!pip install -q git+https://github.com/csebuetnlp/normalizer
!pip install -q evaluate seqeval datasets accelerate


Installing dependencies...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Preparing metadata (setup.py) ... [?25l[?25hdone


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
# ============================================
# 2. IMPORTS
# ============================================

print("\nImporting libraries...")
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import evaluate
import numpy as np



Importing libraries...


In [19]:
# ============================================
# 3. GPU CHECK
# ============================================

print("\n" + "="*60)
print("GPU DIAGNOSTIC CHECK")
print("="*60)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'N/A'}")

if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    device = torch.device("cuda")
    print("‚úì GPU is available and will be used!")
else:
    device = torch.device("cpu")
    print("‚ö†Ô∏è WARNING: No GPU detected! Training will be SLOW on CPU")
    print("In Colab: Runtime > Change runtime type > Hardware accelerator > GPU")



GPU DIAGNOSTIC CHECK
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU count: 2
GPU name: Tesla T4
GPU memory: 15.83 GB
‚úì GPU is available and will be used!


In [20]:
# ============================================
# 4. LOAD DATASET
# ============================================

print("\n" + "="*60)
print("LOADING WIKIANN BANGLA DATASET")
print("="*60)
dataset = load_dataset("wikiann", "bn")
print(dataset)
print(f"\nTrain size: {len(dataset['train'])}")
print(f"Validation size: {len(dataset['validation'])}")
print(f"Test size: {len(dataset['test'])}")

print("\nSample from training set:")
print(dataset["train"][0])

label_list = dataset["train"].features["ner_tags"].feature.names
print(f"\nNER Labels: {label_list}")

print("\n" + "-"*60)
print("SAMPLE SENTENCES WITH TAGS")
print("-"*60)
for i in range(3):
    tokens = dataset["train"][i]["tokens"]
    tags = [label_list[t] for t in dataset["train"][i]["ner_tags"]]
    print(f"\nSentence {i+1}:")
    print("Tokens:", " ".join(tokens))
    print("Tags:  ", " ".join(tags))



LOADING WIKIANN BANGLA DATASET
DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
})

Train size: 10000
Validation size: 1000
Test size: 1000

Sample from training set:
{'tokens': ['‡¶°‡ßç‡¶Ø‡¶æ‡¶®‡¶≠‡¶ø‡¶≤', ',', '‡¶á‡¶≤‡¶ø‡¶®‡¶Ø‡¶º'], 'ner_tags': [5, 6, 6], 'langs': ['bn', 'bn', 'bn'], 'spans': ['LOC: ‡¶°‡ßç‡¶Ø‡¶æ‡¶®‡¶≠‡¶ø‡¶≤ , ‡¶á‡¶≤‡¶ø‡¶®‡¶Ø‡¶º']}

NER Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

------------------------------------------------------------
SAMPLE SENTENCES WITH TAGS
------------------------------------------------------------

Sentence 1:
Tokens: ‡¶°‡ßç‡¶Ø‡¶æ‡¶®‡¶≠‡¶ø‡¶≤ , ‡¶á‡¶≤‡¶ø‡¶®‡¶Ø‡¶º
Tags:   B-LOC I-LOC I-LOC

Sentence 2:
Tokens: ‡¶∂‡¶ø‡

In [21]:
# ============================================
# 5. LOAD TOKENIZER
# ============================================

print("\n" + "="*60)
print("LOADING TOKENIZER")
print("="*60)
MODEL_NAME = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {MODEL_NAME}")



LOADING TOKENIZER
Tokenizer loaded: csebuetnlp/banglabert


In [22]:
# ============================================
# 6. TOKENIZATION FUNCTION
# ============================================

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False,
        max_length=128
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


In [23]:
# ============================================
# 7. TOKENIZE DATASET
# ============================================

print("\n" + "="*60)
print("TOKENIZING DATASET")
print("="*60)
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    desc="Tokenizing"
)
tokenized_datasets = tokenized_datasets.remove_columns(["tokens", "ner_tags", "langs"])
print("Tokenized dataset:")
print(tokenized_datasets)



TOKENIZING DATASET


Tokenizing:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenized dataset:
DatasetDict({
    validation: Dataset({
        features: ['spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    train: Dataset({
        features: ['spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
})


In [24]:
# ============================================
# 8. LOAD MODEL
# ============================================

print("\n" + "="*60)
print("LOADING MODEL")
print("="*60)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)}
)
model = model.to(device)
print(f"Model loaded: {MODEL_NAME}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Number of parameters: {model.num_parameters():,}")



LOADING MODEL


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: csebuetnlp/banglabert
Model device: cuda:0
Number of parameters: 110,032,135


In [25]:
# ============================================
# 9. SETUP METRICS
# ============================================

print("\n" + "="*60)
print("LOADING METRICS")
print("="*60)
metric = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



LOADING METRICS


In [26]:
# ============================================
# 10. DATA COLLATOR
# ============================================

print("\n" + "="*60)
print("SETTING UP DATA COLLATOR")
print("="*60)
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100
)
print("Data collator created successfully!")



SETTING UP DATA COLLATOR
Data collator created successfully!


In [27]:
# ============================================
# 11. TRAINING ARGUMENTS
# ============================================

print("\n" + "="*60)
print("SETTING UP TRAINING ARGUMENTS")
print("="*60)

training_args = TrainingArguments(
    output_dir="./results-banglabert-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    save_total_limit=2,
    report_to="none",
)

print(f"Training device: {training_args.device}")
print(f"FP16 training: {training_args.fp16}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Number of epochs: {training_args.num_train_epochs}")



SETTING UP TRAINING ARGUMENTS
Training device: cuda:0
FP16 training: True
Batch size: 16
Number of epochs: 5


In [28]:
# ============================================
# 12. CREATE TRAINER
# ============================================

print("\n" + "="*60)
print("CREATING TRAINER")
print("="*60)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer created successfully!")



CREATING TRAINER
Trainer created successfully!


In [29]:
# ============================================
# 13. START TRAINING
# ============================================

print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)
print(f"This will take approximately 15‚Äì25 minutes on a T4 GPU")
print("="*60 + "\n")

trainer.train()

print("\n" + "="*60)
print("TRAINING COMPLETED!")
print("="*60)



STARTING TRAINING
This will take approximately 15‚Äì25 minutes on a T4 GPU





Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2085,0.153454,0.922598,0.936766,0.929628,0.962853
2,0.1047,0.105964,0.951526,0.957543,0.954525,0.975006
3,0.0619,0.089324,0.957105,0.96748,0.962264,0.979821
4,0.0395,0.095077,0.968468,0.971093,0.969779,0.981197
5,0.0221,0.093371,0.97385,0.97561,0.974729,0.983031





TRAINING COMPLETED!


In [30]:
# ============================================
# 14. FINAL EVALUATION
# ============================================

print("\n" + "="*60)
print("FINAL EVALUATION ON VALIDATION SET")
print("="*60)
eval_results = trainer.evaluate()
print("\nValidation Results:")
for key, value in eval_results.items():
    print(f"  {key}: {value:.4f}")

print("\n" + "="*60)
print("EVALUATION ON TEST SET")
print("="*60)
test_results = trainer.evaluate(tokenized_datasets["test"])
print("\nTest Results:")
for key, value in test_results.items():
    print(f"  {key}: {value:.4f}")



FINAL EVALUATION ON VALIDATION SET





Validation Results:
  eval_loss: 0.0934
  eval_precision: 0.9739
  eval_recall: 0.9756
  eval_f1: 0.9747
  eval_accuracy: 0.9830
  eval_runtime: 2.0639
  eval_samples_per_second: 484.5280
  eval_steps_per_second: 15.5050
  epoch: 5.0000

EVALUATION ON TEST SET

Test Results:
  eval_loss: 0.0935
  eval_precision: 0.9707
  eval_recall: 0.9725
  eval_f1: 0.9716
  eval_accuracy: 0.9820
  eval_runtime: 2.0461
  eval_samples_per_second: 488.7290
  eval_steps_per_second: 15.6390
  epoch: 5.0000


In [31]:
# ============================================
# 15. SAVE MODEL
# ============================================

print("\n" + "="*60)
print("SAVING FINAL MODEL")
print("="*60)
trainer.save_model("./banglabert-ner-final")
tokenizer.save_pretrained("./banglabert-ner-final")
print("Model saved to: ./banglabert-ner-final")



SAVING FINAL MODEL
Model saved to: ./banglabert-ner-final


In [32]:
# ============================================
# 16. TEST INFERENCE
# ============================================

print("\n" + "="*60)
print("TESTING INFERENCE")
print("="*60)

test_sentence = dataset["test"][0]
test_tokens = test_sentence["tokens"]
print(f"\nTest sentence: {' '.join(test_tokens)}")

inputs = tokenizer(
    test_tokens,
    is_split_into_words=True,
    return_tensors="pt",
    truncation=True,
    padding=True
).to(device)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

predicted_labels = [label_list[p.item()] for p in predictions[0]]
word_ids = inputs.word_ids()

final_predictions = []
previous_word_idx = None
for word_idx, pred_label in zip(word_ids, predicted_labels):
    if word_idx is not None and word_idx != previous_word_idx:
        final_predictions.append(pred_label)
        previous_word_idx = word_idx

print("\nPredicted NER tags:")
for token, pred_tag in zip(test_tokens, final_predictions):
    print(f"  {token:20s} -> {pred_tag}")

print("\n" + "="*60)
print("ALL DONE! üéâ")
print("="*60)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



TESTING INFERENCE

Test sentence: ‡¶â‡¶∞‡ßÅ‡¶ó‡ßÅ‡¶Ø‡¶º‡ßá ‡¶ú‡¶æ‡¶§‡ßÄ‡¶Ø‡¶º ‡¶´‡ßÅ‡¶ü‡¶¨‡¶≤ ‡¶¶‡¶≤

Predicted NER tags:
  ‡¶â‡¶∞‡ßÅ‡¶ó‡ßÅ‡¶Ø‡¶º‡ßá             -> B-ORG
  ‡¶ú‡¶æ‡¶§‡ßÄ‡¶Ø‡¶º               -> I-ORG
  ‡¶´‡ßÅ‡¶ü‡¶¨‡¶≤                -> I-ORG
  ‡¶¶‡¶≤                   -> I-ORG

ALL DONE! üéâ
