In [74]:
# ============================================
# 1. INSTALL DEPENDENCIES
# ============================================

print("Installing dependencies...")
!pip install -q -U pip
!pip install -q -U "transformers>=4.31.0" "huggingface_hub>=0.18.0"
!pip install -q git+https://github.com/csebuetnlp/normalizer
!pip install -q evaluate seqeval datasets accelerate


Installing dependencies...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Preparing metadata (setup.py) ... [?25l[?25hdone


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [75]:
# ============================================
# 2. IMPORTS
# ============================================

print("\nImporting libraries...")
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import evaluate
import numpy as np



Importing libraries...


In [76]:
# ============================================
# 3. GPU CHECK
# ============================================

print("\n" + "="*60)
print("GPU DIAGNOSTIC CHECK")
print("="*60)
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda if torch.cuda.is_available() else 'N/A'}")

if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    device = torch.device("cuda")
    print("✓ GPU is available and will be used!")
else:
    device = torch.device("cpu")
    print("⚠️ WARNING: No GPU detected! Training will be SLOW on CPU")
    print("In Colab: Runtime > Change runtime type > Hardware accelerator > GPU")



GPU DIAGNOSTIC CHECK
PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU count: 2
GPU name: Tesla T4
GPU memory: 15.83 GB
✓ GPU is available and will be used!


In [77]:
# ============================================
# 4. LOAD DATASETS
# ============================================

print("\n" + "="*60)
print("LOADING WIKIANN MARATHI DATASET (Training)")
print("="*60)
train_dataset = load_dataset("wikiann", "mr")
print("Marathi dataset for training:")
print(train_dataset)
print(f"\nTrain size: {len(train_dataset['train'])}")
print(f"Validation size: {len(train_dataset['validation'])}")

print("\n" + "="*60)
print("LOADING WIKIANN BANGLA DATASET (Testing - Cross-lingual Transfer)")
print("="*60)
test_dataset = load_dataset("wikiann", "bn")
print("Bangla dataset for testing:")
print(test_dataset)
print(f"Test size: {len(test_dataset['test'])}")

# Use Marathi for training/validation, Bangla for test
dataset = {
    "train": train_dataset["train"],
    "validation": train_dataset["validation"],
    "test": test_dataset["test"]
}

print("\nSample from Marathi training set:")
print(dataset["train"][0])

label_list = dataset["train"].features["ner_tags"].feature.names
print(f"\nNER Labels: {label_list}")

print("\n" + "-"*60)
print("SAMPLE MARATHI SENTENCES WITH TAGS (Training Data)")
print("-"*60)
for i in range(3):
    tokens = dataset["train"][i]["tokens"]
    tags = [label_list[t] for t in dataset["train"][i]["ner_tags"]]
    print(f"\nSentence {i+1}:")
    print("Tokens:", " ".join(tokens))
    print("Tags:  ", " ".join(tags))

print("\n" + "-"*60)
print("SAMPLE BANGLA SENTENCES (Test Data - Cross-lingual Evaluation)")
print("-"*60)
for i in range(2):
    tokens = dataset["test"][i]["tokens"]
    tags = [label_list[t] for t in dataset["test"][i]["ner_tags"]]
    print(f"\nSentence {i+1}:")
    print("Tokens:", " ".join(tokens))
    print("Tags:  ", " ".join(tags))


LOADING WIKIANN MARATHI DATASET (Training)


mr/validation-00000-of-00001.parquet:   0%|          | 0.00/75.0k [00:00<?, ?B/s]

mr/test-00000-of-00001.parquet:   0%|          | 0.00/75.9k [00:00<?, ?B/s]

mr/train-00000-of-00001.parquet:   0%|          | 0.00/373k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Marathi dataset for training:
DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 5000
    })
})

Train size: 5000
Validation size: 1000

LOADING WIKIANN BANGLA DATASET (Testing - Cross-lingual Transfer)
Bangla dataset for testing:
DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 1000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
})
Test size: 1000

Sample from Marathi training set:
{'tokens': ['आल्बुकर्की', '-', '५', ',', '४५', ',', '८५२'], 'ner_tags': [5,

In [78]:
# ============================================
# 5. LOAD TOKENIZER — INDICBERT
# ============================================

print("\n" + "="*60)
print("LOADING TOKENIZER (INDICBERT)")
print("="*60)
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {MODEL_NAME}")



LOADING TOKENIZER (INDICBERT)
Tokenizer loaded: ai4bharat/IndicBERTv2-MLM-only


In [79]:
# ============================================
# 6. TOKENIZATION FUNCTION
# ============================================

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False,
        max_length=128
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


In [80]:
# ============================================
# 7. TOKENIZE DATASET
# ============================================

print("\n" + "="*60)
print("TOKENIZING DATASETS")
print("="*60)

# Tokenize each split separately
tokenized_train = dataset["train"].map(
    tokenize_and_align_labels,
    batched=True,
    desc="Tokenizing Marathi train"
)
tokenized_val = dataset["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    desc="Tokenizing Marathi validation"
)
tokenized_test = dataset["test"].map(
    tokenize_and_align_labels,
    batched=True,
    desc="Tokenizing Bangla test"
)

# Remove unnecessary columns
tokenized_train = tokenized_train.remove_columns(["tokens", "ner_tags", "langs"])
tokenized_val = tokenized_val.remove_columns(["tokens", "ner_tags", "langs"])
tokenized_test = tokenized_test.remove_columns(["tokens", "ner_tags", "langs"])

# Create the tokenized_datasets dictionary
tokenized_datasets = {
    "train": tokenized_train,
    "validation": tokenized_val,
    "test": tokenized_test
}

print("Tokenized datasets:")
print(f"  Train: {len(tokenized_datasets['train'])} samples")
print(f"  Validation: {len(tokenized_datasets['validation'])} samples")
print(f"  Test (Bangla): {len(tokenized_datasets['test'])} samples")


TOKENIZING DATASETS


Tokenizing Marathi train:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing Marathi validation:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenized datasets:
  Train: 5000 samples
  Validation: 1000 samples
  Test (Bangla): 1000 samples


In [81]:
# ============================================
# 8. LOAD MODEL — INDICBERT
# ============================================

print("\n" + "="*60)
print("LOADING MODEL (INDICBERT)")
print("="*60)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id={label: i for i, label in enumerate(label_list)}
)
model = model.to(device)
print(f"Model loaded: {MODEL_NAME}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Number of parameters: {model.num_parameters():,}")



LOADING MODEL (INDICBERT)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: ai4bharat/IndicBERTv2-MLM-only
Model device: cuda:0
Number of parameters: 277,456,135


In [82]:
# ============================================
# 9. SETUP METRICS
# ============================================

print("\n" + "="*60)
print("LOADING METRICS")
print("="*60)
metric = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



LOADING METRICS


In [83]:
# ============================================
# 10. DATA COLLATOR
# ============================================

print("\n" + "="*60)
print("SETTING UP DATA COLLATOR")
print("="*60)
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100
)
print("Data collator created successfully!")



SETTING UP DATA COLLATOR
Data collator created successfully!


In [84]:
# ============================================
# 11. TRAINING ARGUMENTS
# ============================================

print("\n" + "="*60)
print("SETTING UP TRAINING ARGUMENTS")
print("="*60)

training_args = TrainingArguments(
    output_dir="./results-indicbert-marathi-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    save_total_limit=2,
    report_to="none",
)

print(f"Training device: {training_args.device}")
print(f"FP16 training: {training_args.fp16}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print("\n⚠️ NOTE: Training on MARATHI data, Testing on BANGLA data (Cross-lingual Transfer)")


SETTING UP TRAINING ARGUMENTS
Training device: cuda:0
FP16 training: True
Batch size: 16
Number of epochs: 5

⚠️ NOTE: Training on MARATHI data, Testing on BANGLA data (Cross-lingual Transfer)


In [85]:
# ============================================
# 12. CREATE TRAINER
# ============================================

print("\n" + "="*60)
print("CREATING TRAINER")
print("="*60)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer created successfully!")



CREATING TRAINER
Trainer created successfully!


In [86]:
# ============================================
# 13. START TRAINING
# ============================================

print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)
print(f"This will take approximately 15–25 minutes on a T4 GPU")
print("="*60 + "\n")

trainer.train()

print("\n" + "="*60)
print("TRAINING COMPLETED!")
print("="*60)


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 3}.



STARTING TRAINING
This will take approximately 15–25 minutes on a T4 GPU





Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2691,0.214569,0.789308,0.810985,0.8,0.933428
2,0.1742,0.169986,0.849802,0.868336,0.858969,0.949717
3,0.1176,0.145203,0.866457,0.890953,0.878534,0.957224
4,0.0799,0.148466,0.870008,0.897415,0.883499,0.959065
5,0.0755,0.148765,0.883886,0.903877,0.89377,0.962181





TRAINING COMPLETED!


In [87]:
# ============================================
# 14. FINAL EVALUATION
# ============================================

print("\n" + "="*60)
print("FINAL EVALUATION ON MARATHI VALIDATION SET")
print("="*60)
eval_results = trainer.evaluate()
print("\nMarathi Validation Results:")
for key, value in eval_results.items():
    print(f"  {key}: {value:.4f}")

print("\n" + "="*60)
print("CROSS-LINGUAL EVALUATION ON BANGLA TEST SET")
print("="*60)
print("⚠️ Testing cross-lingual transfer: Marathi-trained model on Bangla data")
test_results = trainer.evaluate(tokenized_datasets["test"])
print("\nBangla Test Results (Cross-lingual Transfer):")
for key, value in test_results.items():
    print(f"  {key}: {value:.4f}")

print("\n" + "="*60)
print("TRANSFER LEARNING ANALYSIS")
print("="*60)
print(f"Marathi → Marathi F1: {eval_results['eval_f1']:.4f}")
print(f"Marathi → Bangla F1: {test_results['eval_f1']:.4f}")
transfer_gap = eval_results['eval_f1'] - test_results['eval_f1']
print(f"Transfer Gap: {transfer_gap:.4f}")
if transfer_gap > 0:
    print("✓ Model performs better on source language (expected)")
else:
    print("⚠️ Model performs better on target language (unexpected)")


FINAL EVALUATION ON MARATHI VALIDATION SET





Marathi Validation Results:
  eval_loss: 0.1488
  eval_precision: 0.8839
  eval_recall: 0.9039
  eval_f1: 0.8938
  eval_accuracy: 0.9622
  eval_runtime: 3.9829
  eval_samples_per_second: 251.0730
  eval_steps_per_second: 8.0340
  epoch: 5.0000

CROSS-LINGUAL EVALUATION ON BANGLA TEST SET
⚠️ Testing cross-lingual transfer: Marathi-trained model on Bangla data

Bangla Test Results (Cross-lingual Transfer):
  eval_loss: 0.5795
  eval_precision: 0.6860
  eval_recall: 0.7585
  eval_f1: 0.7205
  eval_accuracy: 0.8380
  eval_runtime: 3.8550
  eval_samples_per_second: 259.4050
  eval_steps_per_second: 8.3010
  epoch: 5.0000

TRANSFER LEARNING ANALYSIS
Marathi → Marathi F1: 0.8938
Marathi → Bangla F1: 0.7205
Transfer Gap: 0.1733
✓ Model performs better on source language (expected)


In [88]:
# ============================================
# 15. SAVE MODEL
# ============================================

print("\n" + "="*60)
print("SAVING FINAL MODEL")
print("="*60)
trainer.save_model("./indicbert-marathi-ner-final")
tokenizer.save_pretrained("./indicbert-marathi-ner-final")
print("Model saved to: ./indicbert-marathi-ner-final")
print("This model was trained on Marathi and can be used for cross-lingual transfer to Bangla")


SAVING FINAL MODEL
Model saved to: ./indicbert-marathi-ner-final
This model was trained on Marathi and can be used for cross-lingual transfer to Bangla


In [89]:
import shutil
import os

# Define the output zip file
output_filename = "kaggle_working_dir.zip"

# Current working directory
current_dir = os.getcwd()

# Create a zip of the current directory
shutil.make_archive("kaggle_working_dir", 'zip', current_dir)

print(f"✅ ZIP file created: {output_filename}")


✅ ZIP file created: kaggle_working_dir.zip


In [90]:
# ============================================
# 16. TEST INFERENCE
# ============================================

print("\n" + "="*60)
print("TESTING INFERENCE")
print("="*60)

test_sentence = dataset["test"][0]
test_tokens = test_sentence["tokens"]
print(f"\nTest sentence: {' '.join(test_tokens)}")

inputs = tokenizer(
    test_tokens,
    is_split_into_words=True,
    return_tensors="pt",
    truncation=True,
    padding=True
).to(device)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

predicted_labels = [label_list[p.item()] for p in predictions[0]]
word_ids = inputs.word_ids()

final_predictions = []
previous_word_idx = None
for word_idx, pred_label in zip(word_ids, predicted_labels):
    if word_idx is not None and word_idx != previous_word_idx:
        final_predictions.append(pred_label)
        previous_word_idx = word_idx

print("\nPredicted NER tags:")
for token, pred_tag in zip(test_tokens, final_predictions):
    print(f"  {token:20s} -> {pred_tag}")

print("\n" + "="*60)
print("ALL DONE! 🎉")
print("="*60)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



TESTING INFERENCE

Test sentence: উরুগুয়ে জাতীয় ফুটবল দল

Predicted NER tags:
  উরুগুয়ে             -> B-ORG
  জাতীয়               -> I-ORG
  ফুটবল                -> I-ORG
  দল                   -> I-ORG

ALL DONE! 🎉
