In [10]:
#pip install transformers datasets seqeval
#!pip install evaluate

In [4]:
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from seqeval.metrics import classification_report, accuracy_score
import evaluate
import torch
import time
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback


# -------------------
# 1. Parse tab-separated CoNLL file safely
# -------------------
def parse_conll(file_path):
    """
    Load a tab-separated .conll file.
    Returns:
        sentences: list of {"tokens": [...], "ner_tags": [...]}
        label_list: sorted list of unique labels
    """
    sentences = []
    tokens, labels = [], []
    label_set = set()

    with open(file_path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": labels})
                    tokens, labels = [], []
                continue
            parts = line.split("\t")
            if len(parts) < 3:
                print(f"Line {i} skipped (not enough columns): {line}")
                continue
            token, _, ner = parts[:3]  # ignore POS column
            tokens.append(token)
            labels.append(ner)
            label_set.add(ner)

    if tokens:
        sentences.append({"tokens": tokens, "ner_tags": labels})

    return sentences, sorted(label_set)


# -------------------
# 2. Load train, validation, test
# -------------------
train_data, label_list = parse_conll("ner_train.conll")
val_data, _ = parse_conll("ner_val.conll")
test_data, _ = parse_conll("ner_test.conll")

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)


# -------------------
# 3. Tokenization
# -------------------
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, label_seq in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        prev_word = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != prev_word:
                label_ids.append(label2id[label_seq[word_idx]])
            else:
                label_ids.append(label2id[label_seq[word_idx]])
            prev_word = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)


# -------------------
# 4. Model & Training Arguments
# -------------------

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./results",              # where to save checkpoints
    eval_strategy="epoch",         # run eval each epoch
    save_strategy="epoch",               # save checkpoint each epoch
    learning_rate=5e-5,                  # standard for BERT/MBERT fine-tuning
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,                   # small L2 regularization
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,                  # keep only last 2 checkpoints
    load_best_model_at_end=True,         # load best model (based on eval metric)
    metric_for_best_model="f1",          # depends on your compute_metrics
    greater_is_better=True,              # set False if you monitor loss
    seed=42,
)

# -------------------
# 5. Metrics
# -------------------
seqeval_metric = evaluate.load("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [id2label[label_id] for label_id in label_seq if label_id != -100]
        for label_seq in labels
    ]
    pred_labels = [
        [id2label[pred_id] for pred_id, label_id in zip(pred_seq, label_seq) if label_id != -100]
        for pred_seq, label_seq in zip(predictions, labels)
    ]
    results = seqeval_metric.compute(predictions=pred_labels, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# -------------------
# 6. Trainer with Early Stopping
# -------------------
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,   # 🚀 dynamic padding
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)


# -------------------
# 7. Training with timing
# -------------------
start_time = time.time()
trainer.train()
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")


# -------------------
# 8. Evaluation on Test Set
# -------------------
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

true_labels = [
    [id2label[label_id] for label_id in label_seq if label_id != -100]
    for label_seq in labels
]
pred_labels = [
    [id2label[pred_id] for pred_id, label_id in zip(pred_seq, label_seq) if label_id != -100]
    for pred_seq, label_seq in zip(predictions, labels)
]

print(classification_report(true_labels, pred_labels))
print("Accuracy:", accuracy_score(true_labels, pred_labels))


# -------------------
# 9. Save model & tokenizer
# -------------------
trainer.save_model("./mbert_ner_best")
tokenizer.save_pretrained("./mbert_ner_best")


Map: 100%|██████████| 45894/45894 [00:23<00:00, 1946.00 examples/s]
Map: 100%|██████████| 11474/11474 [00:04<00:00, 2472.06 examples/s]
Map: 100%|██████████| 14343/14343 [00:05<00:00, 2452.61 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [5]:
import torch
print(torch.cuda.is_available())   # True if GPU is available
print(torch.cuda.device_count())   # Number of GPUs
print(torch.cuda.get_device_name(0))  # Name of first GPU

False
0


AssertionError: Torch not compiled with CUDA enabled