# Model Comparison for Amharic NER

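This notebook fine-tunes and compares several transformer-based models for Named Entity Recognition (NER) on Amharic e-commerce data. Models compared: XLM-RoBERTa (`xlm-roberta-base`), multilingual BERT (mBERT, `bert-base-multilingual-cased`), and multilingual DistilBERT (`distilbert-base-multilingual-cased`).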


## 1. Install and Import Dependencies


In [None]:
%pip install transformers datasets seqeval --quiet
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
import os


## 2. Load and Prepare CoNLL Data


In [None]:
def read_conll(filepath):
    """Parse a CoNLL-style file into parallel token and tag sequences.

    The file is read as UTF-8. Each non-blank line is split on whitespace;
    the first column is taken as the token and the last column as its tag.
    Lines with fewer than two columns are ignored. A blank (or
    whitespace-only) line closes the current sentence.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists, where
        ``sentences[i]`` is the list of tokens of sentence ``i`` and
        ``labels[i]`` the list of tag strings for those tokens.
    """
    sentences, labels = [], []
    current_tokens, current_tags = [], []

    with open(filepath, encoding='utf-8') as handle:
        for raw_line in handle:
            # split() with no argument discards surrounding whitespace, so a
            # blank line yields an empty list.
            columns = raw_line.split()

            if not columns:
                # Sentence boundary: emit the sentence collected so far, if any.
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_tags)
                    current_tokens, current_tags = [], []
                continue

            if len(columns) < 2:
                # A token without a tag column cannot be used; skip it.
                continue

            current_tokens.append(columns[0])
            current_tags.append(columns[-1])

    # The file may end without a trailing blank line.
    if current_tokens:
        sentences.append(current_tokens)
        labels.append(current_tags)

    return sentences, labels

# Labelled CoNLL sample, given relative to the notebook's working directory.
conll_path = '../data/processed/ner_sample_conll.txt'  # Adjust path if needed
sentences, ner_tags = read_conll(conll_path)

# Tag vocabulary. Sorting makes the tag <-> id mapping independent of the
# order in which tags happen to appear in the file.
unique_tags = sorted({tag for doc in ner_tags for tag in doc})
id2tag = dict(enumerate(unique_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

# Replace tag strings with their integer ids, keeping sentence structure.
encoded_tags = [[tag2id[tag] for tag in tags] for tags in ner_tags]
data = {'tokens': sentences, 'ner_tags': encoded_tags}

# Hold out 20% of the sentences for evaluation; the fixed seed keeps the
# split identical across runs.
splits = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({'train': splits['train'], 'test': splits['test']})
print(dataset)


## 3. Define Model List and Helper Functions


In [None]:
# Models in the comparison: display name -> Hugging Face checkpoint identifier.
# The key labels a run (it is used in the output/log directory names and as the
# key of the results dictionary); the value is what gets passed to
# `from_pretrained` for both the tokenizer and the model.
# NOTE(review): confirm that each checkpoint's tokenizer vocabulary covers the
# Ge'ez script used by Amharic -- TODO verify; a tokenizer without coverage
# would presumably map most words to its unknown token.
model_names = {
    'XLM-Roberta': 'xlm-roberta-base',
    'mBERT': 'bert-base-multilingual-cased',
    'DistilBERT': 'distilbert-base-multilingual-cased'
}

def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=False):
    """Tokenize pre-split sentences and align word-level tag ids to sub-words.

    A word may be split into several sub-word tokens. Only the first sub-word
    of each word keeps the word's tag id by default; special tokens and the
    remaining sub-words are labelled -100. -100 is the default ignore index of
    PyTorch's cross-entropy loss and the value `compute_metrics` filters out,
    so training and evaluation are done once per word. Repeating the tag on
    every sub-word would duplicate `B-*` tags inside a single word, which
    entity-level scorers count as several separate entities.

    Args:
        examples: Batch mapping with 'tokens' (one list of words per sentence)
            and 'ner_tags' (one list of integer tag ids per sentence, parallel
            to 'tokens').
        tokenizer: Callable invoked as
            ``tokenizer(words, truncation=True, is_split_into_words=True)``
            whose result supports ``word_ids(batch_index=i)`` and item
            assignment (presumably a Hugging Face *fast* tokenizer).
        label_all_tokens: When True, every sub-word of a word receives the
            word's tag id (the previous behaviour of this function) instead
            of -100.

    Returns:
        The tokenizer output with an added 'labels' entry: one list of label
        ids per sentence, aligned position-by-position with the sub-word
        tokens.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for batch_index, word_labels in enumerate(examples['ner_tags']):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_idx is None:
                # Special token (no source word): never scored.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-word of a word (or every sub-word when requested).
                label_ids.append(word_labels[word_idx])
            else:
                # Continuation sub-word: masked so each word counts once.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with an argmax over axis 2 (presumably
            shaped (batch, sequence, num_labels)); ``labels`` holds the
            reference label ids, with -100 marking positions to ignore.

    Returns:
        A dict with the seqeval 'f1', 'precision' and 'recall' scores and the
        textual classification 'report'.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Reference tag names, skipping ignored (-100) positions.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2tag[tag_id] for tag_id in gold_row if tag_id != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            id2tag[predicted_id]
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append(kept)

    metrics = {}
    metrics['f1'] = f1_score(true_labels, true_predictions)
    metrics['precision'] = precision_score(true_labels, true_predictions)
    metrics['recall'] = recall_score(true_labels, true_predictions)
    metrics['report'] = classification_report(true_labels, true_predictions)
    return metrics


## 4. Fine-Tune and Evaluate Each Model


In [None]:
import inspect

# Trainer.evaluate() returns every metric under this prefix ('f1' comes back
# as 'eval_f1'), so the raw compute_metrics keys cannot be read directly.
EVAL_PREFIX = 'eval_'

# The install above is unpinned, and transformers renamed two keyword
# arguments across releases (`evaluation_strategy` -> `eval_strategy`,
# Trainer's `tokenizer` -> `processing_class`). Use whichever name the
# installed version accepts.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    'eval_strategy' if 'eval_strategy' in _training_args_params else 'evaluation_strategy'
)
_trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'

# Fine-tune every checkpoint on the same split and collect its evaluation
# metrics, keyed by display name and with the 'eval_' prefix removed.
results = {}
for model_label, model_checkpoint in model_names.items():
    print(f'\n--- Fine-tuning {model_label} ---')
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Each model has its own tokenizer, so the dataset is re-tokenized per model.
    tokenized_datasets = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(['tokens', 'ner_tags'])
    tokenized_datasets.set_format('torch')

    # Storing the tag names in the config makes saved checkpoints emit real
    # tag names instead of generic LABEL_<n> placeholders.
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(tag2id),
        id2label=id2tag,
        label2id=tag2id,
    )
    data_collator = DataCollatorForTokenClassification(tokenizer)
    args = TrainingArguments(
        output_dir=f'./results/{model_label}',
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f'./logs/{model_label}',
        logging_steps=10,
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        push_to_hub=False,
        # fp16=True  # Uncomment if using GPU with mixed precision
        **{eval_strategy_kwarg: 'epoch'}
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer}
    )
    trainer.train()
    eval_results = trainer.evaluate()

    # Strip the 'eval_' prefix so the metrics can be read under the names
    # compute_metrics returned ('f1', 'precision', 'recall', 'report').
    metrics = {
        (key[len(EVAL_PREFIX):] if key.startswith(EVAL_PREFIX) else key): value
        for key, value in eval_results.items()
    }
    print(metrics['report'])
    results[model_label] = metrics


## 5. Compare Results and Select Best Model


In [None]:
# Build one row per model with its headline metrics and rank the models by F1.
# Metrics returned by Trainer.evaluate() are keyed with an 'eval_' prefix
# (e.g. 'eval_f1'), so each metric is looked up under the prefixed name first
# and under the bare name otherwise; a metric missing under both names still
# raises KeyError rather than being silently dropped.
summary_columns = (('F1', 'f1'), ('Precision', 'precision'), ('Recall', 'recall'))

summary = []
for model_label, res in results.items():
    row = {'Model': model_label}
    for column, metric in summary_columns:
        prefixed = f'eval_{metric}'
        row[column] = res[prefixed] if prefixed in res else res[metric]
    summary.append(row)

# Declaring the columns keeps the sort valid even when no model was evaluated.
df = pd.DataFrame(summary, columns=['Model', 'F1', 'Precision', 'Recall'])
df = df.sort_values('F1', ascending=False)
display(df)


### Conclusion
Summarize which model performed best and why it is recommended for EthioMart's NER task.
