In [1]:
# ============================================
# 1. INSTALL DEPENDENCIES
# ============================================

print("Installing dependencies...")
!pip install -q -U pip
!pip install -q -U "transformers>=4.31.0" "huggingface_hub>=0.18.0"
!pip install -q git+https://github.com/csebuetnlp/normalizer
!pip install -q evaluate seqeval datasets accelerate


Installing dependencies...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 4.1.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
gradio 5.38.1 requires pydantic<2.12,>=2.0, but you have pydantic 2.12.0a1 which is incompatible.[0m[31m
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'normalizer' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possib

In [2]:
# ============================================
# 2. IMPORTS
# ============================================

print("\nImporting libraries...")
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)



Importing libraries...


2025-10-23 13:49:04.299863: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761227344.752730      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761227344.890641      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# ============================================
# 3. GPU CHECK
# ============================================
# Pick the device used by later cells and report the PyTorch/CUDA setup.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("\n" + "=" * 60, "GPU DIAGNOSTIC CHECK", "=" * 60, sep="\n")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {device.type == 'cuda'}")
print(f"CUDA version: {torch.version.cuda if device.type == 'cuda' else 'N/A'}")

if device.type == "cuda":
    # Details are reported for GPU index 0 only.
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print("✓ GPU is available and will be used!")
else:
    print(
        "⚠️ WARNING: No GPU detected! Training will be SLOW on CPU",
        "In Colab: Runtime > Change runtime type > Hardware accelerator > GPU",
        sep="\n",
    )



GPU DIAGNOSTIC CHECK
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU count: 2
GPU name: Tesla T4
GPU memory: 15.83 GB
✓ GPU is available and will be used!


In [4]:
# ============================================
# 4. LOAD DATASETS
# ============================================
# Tamil (wikiann/ta) supplies the train and validation splits; Bangla (wikiann/bn)
# supplies the held-out test split used for cross-lingual evaluation.


def show_tagged_samples(split, label_names, count):
    """Print the first `count` sentences of `split` with their NER tag names.

    Args:
        split: indexable collection of examples with "tokens" and "ner_tags" keys.
        label_names: list mapping a tag id to its string name.
        count: number of leading examples to print.
    """
    for i in range(count):
        example = split[i]
        tag_names = [label_names[t] for t in example["ner_tags"]]
        print(f"\nSentence {i+1}:")
        print("Tokens:", " ".join(example["tokens"]))
        print("Tags:  ", " ".join(tag_names))


print("\n" + "="*60)
print("LOADING WIKIANN TAMIL DATASET (Training)")
print("="*60)
train_dataset = load_dataset("wikiann", "ta")
print("Tamil dataset for training:")
print(train_dataset)
print(f"\nTrain size: {len(train_dataset['train'])}")
print(f"Validation size: {len(train_dataset['validation'])}")

print("\n" + "="*60)
print("LOADING WIKIANN BANGLA DATASET (Testing - Cross-lingual Transfer)")
print("="*60)
test_dataset = load_dataset("wikiann", "bn")
print("Bangla dataset for testing:")
print(test_dataset)
print(f"Test size: {len(test_dataset['test'])}")

# Use Tamil for training/validation, Bangla for test
dataset = {
    "train": train_dataset["train"],
    "validation": train_dataset["validation"],
    "test": test_dataset["test"]
}

print("\nSample from Tamil training set:")
print(dataset["train"][0])

label_list = dataset["train"].features["ner_tags"].feature.names
print(f"\nNER Labels: {label_list}")

# The Bangla tag ids are decoded with the Tamil label list everywhere below, so
# the two label inventories must be identical for the evaluation to be meaningful.
assert dataset["test"].features["ner_tags"].feature.names == label_list, (
    "Tamil and Bangla splits use different NER label inventories"
)

print("\n" + "-"*60)
print("SAMPLE TAMIL SENTENCES WITH TAGS (Training Data)")
print("-"*60)
show_tagged_samples(dataset["train"], label_list, 3)

print("\n" + "-"*60)
print("SAMPLE BANGLA SENTENCES (Test Data - Cross-lingual Evaluation)")
print("-"*60)
show_tagged_samples(dataset["test"], label_list, 2)


LOADING WIKIANN TAMIL DATASET (Training)


README.md: 0.00B [00:00, ?B/s]

ta/validation-00000-of-00001.parquet:   0%|          | 0.00/92.4k [00:00<?, ?B/s]

ta/test-00000-of-00001.parquet:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

ta/train-00000-of-00001.parquet:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/15000 [00:00<?, ? examples/s]

Tamil dataset for training:
DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 15000
    })
})

Train size: 15000
Validation size: 1000

LOADING WIKIANN BANGLA DATASET (Testing - Cross-lingual Transfer)


bn/validation-00000-of-00001.parquet:   0%|          | 0.00/56.0k [00:00<?, ?B/s]

bn/test-00000-of-00001.parquet:   0%|          | 0.00/57.6k [00:00<?, ?B/s]

bn/train-00000-of-00001.parquet:   0%|          | 0.00/554k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Bangla dataset for testing:
DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
})
Test size: 1000

Sample from Tamil training set:
{'tokens': ['இங்கிருக்கும்', 'அரச', 'மரம்', 'தேவனம்பியதீச', 'மன்னன்', 'காலத்திலேயே', 'அனுராதபுரத்திலிருக்கும்', 'சிறீ', 'மகாபோதியிலிருந்து', 'முதலாவதாகப்', 'பிரித்தெடுத்து', 'நடப்பட்டதாகும்', '.'], 'ner_tags': [0, 5, 6, 1, 0, 0, 0, 3, 4, 0, 0, 0, 0], 'langs': ['ta', 'ta', 'ta', 'ta', 'ta', 'ta', 'ta', 'ta', 'ta', 'ta', 'ta', 'ta', 'ta'], 'spans': ['LOC: அரச மரம்', 'PER: தேவனம்பியதீச', 'ORG: சிறீ மகாபோதியிலிருந்து']}

NER Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

------------------------------------------------------------
SAMPLE TAMIL 

In [5]:
# ============================================
# 5. LOAD TOKENIZER — INDICBERT
# ============================================
# MODEL_NAME is reused later when the model weights are loaded.

MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"

print("\n" + "=" * 60, "LOADING TOKENIZER (INDICBERT)", "=" * 60, sep="\n")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded: " + MODEL_NAME)



LOADING TOKENIZER (INDICBERT)


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Tokenizer loaded: ai4bharat/IndicBERTv2-MLM-only


In [6]:
# ============================================
# 6. TOKENIZATION FUNCTION
# ============================================

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level NER tags to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of per-word tag-id lists).

    Returns:
        The tokenizer output with an added "labels" entry. For each sequence,
        the first sub-token of every word carries that word's tag id; positions
        with no word id and continuation sub-tokens are set to -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding=False,
    )

    def _align(word_ids, word_labels):
        # Walk the sub-token -> word mapping, labelling only word-initial positions.
        aligned = []
        last_seen = None
        for current in word_ids:
            starts_new_word = current is not None and current != last_seen
            aligned.append(word_labels[current] if starts_new_word else -100)
            last_seen = current
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded


In [7]:
# ============================================
# 7. TOKENIZE DATASET
# ============================================
# Every split goes through the same `map` call. All raw columns (including
# `spans`) are dropped inside the call so only the tokenizer outputs and the
# aligned `labels` remain.

print("\n" + "="*60)
print("TOKENIZING DATASETS")
print("="*60)

# Progress-bar description for each split
split_descriptions = {
    "train": "Tokenizing Tamil train",
    "validation": "Tokenizing Tamil validation",
    "test": "Tokenizing Bangla test",
}

tokenized_datasets = {
    split_name: dataset[split_name].map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=dataset[split_name].column_names,
        desc=description,
    )
    for split_name, description in split_descriptions.items()
}

# Per-split aliases kept for any cell that refers to them directly
tokenized_train = tokenized_datasets["train"]
tokenized_val = tokenized_datasets["validation"]
tokenized_test = tokenized_datasets["test"]

print("Tokenized datasets:")
print(f"  Train: {len(tokenized_datasets['train'])} samples")
print(f"  Validation: {len(tokenized_datasets['validation'])} samples")
print(f"  Test (Bangla): {len(tokenized_datasets['test'])} samples")


TOKENIZING DATASETS


Tokenizing Tamil train:   0%|          | 0/15000 [00:00<?, ? examples/s]

Tokenizing Tamil validation:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing Bangla test:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenized datasets:
  Train: 15000 samples
  Validation: 1000 samples
  Test (Bangla): 1000 samples


In [8]:
# ============================================
# 8. LOAD MODEL — INDICBERT
# ============================================
# The token-classification head is freshly (randomly) initialised when the
# checkpoint is loaded, so the RNGs are seeded first to make runs repeatable.

print("\n" + "="*60)
print("LOADING MODEL (INDICBERT)")
print("="*60)

SEED = 42
set_seed(SEED)

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)}
)
model = model.to(device)
print(f"Model loaded: {MODEL_NAME}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Number of parameters: {model.num_parameters():,}")



LOADING MODEL (INDICBERT)


config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: ai4bharat/IndicBERTv2-MLM-only
Model device: cuda:0
Number of parameters: 277,456,135


In [9]:
# ============================================
# 9. SETUP METRICS
# ============================================
# `metric` is the scorer that `compute_metrics` calls during evaluation.

print("\n" + "=" * 60, "LOADING METRICS", "=" * 60, sep="\n")
metric = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    """Turn raw model outputs into overall precision/recall/F1/accuracy.

    Args:
        eval_pred: pair `(predictions, labels)`; predictions are arg-maxed over
            axis 2 to obtain one label id per position.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        `overall_*` entries returned by `metric.compute`.
    """
    logits, gold_ids = eval_pred
    pred_ids = np.argmax(logits, axis=2)

    pred_tags, gold_tags = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Positions labelled -100 are excluded from scoring.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        pred_tags.append([label_list[p] for p, _ in kept])
        gold_tags.append([label_list[g] for _, g in kept])

    scores = metric.compute(predictions=pred_tags, references=gold_tags)
    return {
        name: scores[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }



LOADING METRICS


Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
# ============================================
# 10. DATA COLLATOR
# ============================================
# Label padding uses -100, the same value given to unlabelled positions
# in `tokenize_and_align_labels`.

print("\n" + "=" * 60, "SETTING UP DATA COLLATOR", "=" * 60, sep="\n")
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, padding=True, label_pad_token_id=-100
)
print("Data collator created successfully!")



SETTING UP DATA COLLATOR
Data collator created successfully!


In [11]:
# ============================================
# 11. TRAINING ARGUMENTS
# ============================================
# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the best "f1" is reloaded when training finishes.

print("\n" + "=" * 60, "SETTING UP TRAINING ARGUMENTS", "=" * 60, sep="\n")

training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./results-indicbert-tamil-ner",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    push_to_hub=False,
    # Optimisation
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=torch.cuda.is_available(),
    # Evaluation and checkpoint selection
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

print(
    f"Training device: {training_args.device}",
    f"FP16 training: {training_args.fp16}",
    f"Batch size: {training_args.per_device_train_batch_size}",
    f"Number of epochs: {training_args.num_train_epochs}",
    sep="\n",
)
print("\n⚠️ NOTE: Training on TAMIL data, Testing on BANGLA data (Cross-lingual Transfer)")


SETTING UP TRAINING ARGUMENTS
Training device: cuda:0
FP16 training: True
Batch size: 16
Number of epochs: 5

⚠️ NOTE: Training on TAMIL data, Testing on BANGLA data (Cross-lingual Transfer)


In [12]:
# ============================================
# 12. CREATE TRAINER
# ============================================
# Wire the model, arguments, Tamil train/validation splits, collator and
# metric function into a single Trainer.

print("\n" + "=" * 60, "CREATING TRAINER", "=" * 60, sep="\n")

trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

print("Trainer created successfully!")



CREATING TRAINER
Trainer created successfully!


In [13]:
# ============================================
# 13. START TRAINING
# ============================================
# Runs the fine-tuning loop configured by `training_args`.

print(
    "\n" + "=" * 60,
    "STARTING TRAINING",
    "=" * 60,
    "This will take approximately 15–25 minutes on a T4 GPU",
    "=" * 60 + "\n",
    sep="\n",
)

trainer.train()

print("\n" + "=" * 60, "TRAINING COMPLETED!", "=" * 60, sep="\n")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 3}.



STARTING TRAINING
This will take approximately 15–25 minutes on a T4 GPU





Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1887,0.181388,0.801581,0.831148,0.816097,0.941725
2,0.137,0.14949,0.855519,0.863934,0.859706,0.954751
3,0.0992,0.14166,0.839314,0.881967,0.860112,0.955985
4,0.0695,0.148929,0.860539,0.890164,0.875101,0.958865
5,0.0616,0.153928,0.861905,0.890164,0.875806,0.95955





TRAINING COMPLETED!


In [14]:
# ============================================
# 14. FINAL EVALUATION
# ============================================
# Scores the trained model on the Tamil validation split (source language) and
# on the Bangla test split (target language), then reports the F1 difference.


def _print_metrics(title, metrics):
    """Print `title`, then every metric in `metrics` formatted to four decimals."""
    print(f"\n{title}")
    for key, value in metrics.items():
        print(f"  {key}: {value:.4f}")


print("\n" + "="*60)
print("FINAL EVALUATION ON TAMIL VALIDATION SET")
print("="*60)
eval_results = trainer.evaluate()
_print_metrics("Tamil Validation Results:", eval_results)

print("\n" + "="*60)
print("CROSS-LINGUAL EVALUATION ON BANGLA TEST SET")
print("="*60)
print("⚠️ Testing cross-lingual transfer: Tamil-trained model on Bangla data")
# The default metric prefix is kept, so the Bangla scores are also keyed "eval_*".
test_results = trainer.evaluate(tokenized_datasets["test"])
_print_metrics("Bangla Test Results (Cross-lingual Transfer):", test_results)

print("\n" + "="*60)
print("TRANSFER LEARNING ANALYSIS")
print("="*60)
print(f"Tamil → Tamil F1: {eval_results['eval_f1']:.4f}")
print(f"Tamil → Bangla F1: {test_results['eval_f1']:.4f}")
transfer_gap = eval_results['eval_f1'] - test_results['eval_f1']
print(f"Transfer Gap: {transfer_gap:.4f}")
if transfer_gap > 0:
    print("✓ Model performs better on source language (expected)")
elif transfer_gap < 0:
    print("⚠️ Model performs better on target language (unexpected)")
else:
    # A zero gap is neither "better on source" nor "better on target".
    print("Model performs equally on source and target language")


FINAL EVALUATION ON TAMIL VALIDATION SET





Tamil Validation Results:
  eval_loss: 0.1539
  eval_precision: 0.8619
  eval_recall: 0.8902
  eval_f1: 0.8758
  eval_accuracy: 0.9596
  eval_runtime: 4.0541
  eval_samples_per_second: 246.6660
  eval_steps_per_second: 7.8930
  epoch: 5.0000

CROSS-LINGUAL EVALUATION ON BANGLA TEST SET
⚠️ Testing cross-lingual transfer: Tamil-trained model on Bangla data

Bangla Test Results (Cross-lingual Transfer):
  eval_loss: 0.5748
  eval_precision: 0.7756
  eval_recall: 0.8127
  eval_f1: 0.7937
  eval_accuracy: 0.8668
  eval_runtime: 3.8238
  eval_samples_per_second: 261.5220
  eval_steps_per_second: 8.3690
  epoch: 5.0000

TRANSFER LEARNING ANALYSIS
Tamil → Tamil F1: 0.8758
Tamil → Bangla F1: 0.7937
Transfer Gap: 0.0821
✓ Model performs better on source language (expected)


In [15]:
# ============================================
# 15. SAVE MODEL
# ============================================
# The model and the tokenizer are written to the same directory.

print("\n" + "=" * 60, "SAVING FINAL MODEL", "=" * 60, sep="\n")
trainer.save_model("./indicbert-tamil-ner-final")
tokenizer.save_pretrained("./indicbert-tamil-ner-final")
print(
    "Model saved to: ./indicbert-tamil-ner-final",
    "This model was trained on Tamil and can be used for cross-lingual transfer to Bangla",
    sep="\n",
)


SAVING FINAL MODEL
Model saved to: ./indicbert-tamil-ner-final
This model was trained on Tamil and can be used for cross-lingual transfer to Bangla


In [16]:
import os
import shutil
import tempfile

# Define the output zip file
output_filename = "kaggle_working_dir.zip"

# Current working directory
current_dir = os.getcwd()
final_zip_path = os.path.join(current_dir, output_filename)

# Drop any archive left by a previous run so it is not packed into the new one.
if os.path.exists(final_zip_path):
    os.remove(final_zip_path)

# Build the archive outside the directory being archived. Writing it straight
# into `current_dir` would make the walk pick up the half-written zip itself.
archive_stem = os.path.splitext(output_filename)[0]
with tempfile.TemporaryDirectory() as staging_dir:
    staged_zip = shutil.make_archive(
        os.path.join(staging_dir, archive_stem), 'zip', current_dir
    )
    shutil.move(staged_zip, final_zip_path)

print(f"✅ ZIP file created: {output_filename}")


✅ ZIP file created: kaggle_working_dir.zip


In [17]:
# ============================================
# 16. TEST INFERENCE
# ============================================
# Tags the first Bangla test sentence with the fine-tuned model and prints one
# predicted label per original word.

print("\n" + "="*60)
print("TESTING INFERENCE")
print("="*60)

test_sentence = dataset["test"][0]
test_tokens = test_sentence["tokens"]
print(f"\nTest sentence: {' '.join(test_tokens)}")

# `max_length` matches the value used in `tokenize_and_align_labels`; without it
# `truncation=True` has no limit to apply and the tokenizer emits a warning.
inputs = tokenizer(
    test_tokens,
    is_split_into_words=True,
    return_tensors="pt",
    truncation=True,
    max_length=128,
    padding=True
).to(device)

# Make inference mode explicit rather than relying on whatever ran before this cell.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

predicted_labels = [label_list[p.item()] for p in predictions[0]]
word_ids = inputs.word_ids()

# Keep the prediction of the first sub-token of each word; skip positions
# with no word id and continuation sub-tokens.
final_predictions = []
previous_word_idx = None
for word_idx, pred_label in zip(word_ids, predicted_labels):
    if word_idx is not None and word_idx != previous_word_idx:
        final_predictions.append(pred_label)
        previous_word_idx = word_idx

print("\nPredicted NER tags:")
for token, pred_tag in zip(test_tokens, final_predictions):
    print(f"  {token:20s} -> {pred_tag}")

print("\n" + "="*60)
print("ALL DONE! 🎉")
print("="*60)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



TESTING INFERENCE

Test sentence: উরুগুয়ে জাতীয় ফুটবল দল

Predicted NER tags:
  উরুগুয়ে             -> B-ORG
  জাতীয়               -> I-ORG
  ফুটবল                -> I-ORG
  দল                   -> I-ORG

ALL DONE! 🎉
