# Fine-tune NER Model for Amharic E-commerce Data

This notebook demonstrates how to fine-tune a Named Entity Recognition (NER) model using Hugging Face Transformers on Amharic E-commerce data in CoNLL format.


## 1. Install and Import Dependencies

In [1]:
%pip install transformers datasets seqeval --quiet
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
import os


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Print the installed transformers version so the run can be reproduced later.
import transformers
print(transformers.__version__)

In [None]:
  # Show which Python interpreter this kernel is running, to confirm the
  # packages installed above went into the same environment.
  import sys
  print(sys.executable)

## 2. Load and Prepare CoNLL Data
The labeled data is expected at `../data/processed/ner_sample_conll.txt`, relative to this notebook's directory (update `conll_path` below if your file lives elsewhere).


In [None]:
def read_conll(filepath):
    """Parse a whitespace-separated CoNLL file into parallel token/tag lists.

    Blank (or whitespace-only) lines separate sentences. On every other line
    the first field is taken as the token and the last field as its tag;
    lines with fewer than two fields are ignored.

    Args:
        filepath: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A ``(sentences, labels)`` tuple where ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the matching tag list.
    """
    sentences, labels = [], []
    pending = []  # (token, tag) pairs of the sentence currently being read

    def flush():
        # Emit the buffered sentence, if any, and start a fresh one.
        if pending:
            sentences.append([token for token, _ in pending])
            labels.append([tag for _, tag in pending])
            pending.clear()

    with open(filepath, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                flush()
            elif len(fields) >= 2:
                pending.append((fields[0], fields[-1]))

    # The file may end without a trailing blank line.
    flush()
    return sentences, labels

# Location of the labeled CoNLL file, relative to the notebook's working directory.
conll_path = '../data/processed/ner_sample_conll.txt'  # Update if needed
sentences, ner_tags = read_conll(conll_path)

# Quick sanity check: corpus size, then the first sentence as (token, tag) pairs.
print(f'Total sentences: {len(sentences)}')
first_sentence_pairs = list(zip(sentences[0], ner_tags[0]))
print('Example:', first_sentence_pairs)


## 3. Prepare Dataset for Hugging Face Transformers
We need to map labels to IDs and create a Hugging Face Dataset.


In [None]:
# Label vocabulary: sorting keeps the tag <-> id mapping deterministic across runs.
unique_tags = sorted({tag for doc in ner_tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# Column-oriented payload: per sentence, one list of tokens and one list of label ids.
data = {
    'tokens': sentences,
    'ner_tags': [[tag2id[tag] for tag in doc] for doc in ner_tags],
}

# 80/20 split with a fixed seed so the train/test partition is reproducible.
split_datasets = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({name: split_datasets[name] for name in ('train', 'test')})
print(dataset)


## 4. Tokenization and Alignment


In [None]:
# Multilingual BERT (bert-base-multilingual-cased) was not pretrained on Amharic
# and its WordPiece vocabulary has essentially no Ethiopic-script coverage, so
# Amharic words mostly collapse to [UNK] and the model cannot tell them apart.
# XLM-RoBERTa's pretraining corpus (CC-100) includes Amharic and its
# SentencePiece vocabulary can segment Ethiopic text, so use it as the base.
model_checkpoint = 'xlm-roberta-base'
# A fast tokenizer is required further down: label alignment relies on word_ids().
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level NER labels to sub-words.

    Only the first sub-word of each word keeps the word's label. Special
    tokens and continuation sub-words are labeled -100 so they are ignored by
    the loss and filtered out at evaluation time. Repeating the word label on
    every piece (the previous behavior) duplicates ``B-`` tags inside a single
    word, which makes entity-level scoring see several entity starts.

    Args:
        examples: A batch with ``'tokens'`` (one list of words per sentence)
            and ``'ner_tags'`` (one list of label ids per sentence, parallel
            to ``'tokens'``).

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one
        label-id list per sentence, aligned with that sentence's sub-words.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_labels in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                # First sub-word of a new word carries the word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Tokenize both splits in batches, then drop the raw word-level columns that
# were only needed as input to the alignment function.
tokenized_datasets = (
    dataset
    .map(tokenize_and_align_labels, batched=True)
    .remove_columns(['tokens', 'ner_tags'])
)
# Return PyTorch tensors when rows are accessed.
tokenized_datasets.set_format('torch')
print(tokenized_datasets)


## 5. Model and Training Setup


In [None]:
# Attach the tag names to the model config so the saved model reports real
# entity labels instead of generic LABEL_0, LABEL_1, ... identifiers.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
)
# Pads input ids and labels to the longest sequence of each batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed. Use whichever name the
# installed version declares so this cell runs on both old and new installs.
eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

args = TrainingArguments(
    'ner-amharic',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluation and checkpointing must share a schedule for
    # load_best_model_at_end to be able to pick the best checkpoint.
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    push_to_hub=False,
    **{eval_strategy_kwarg: 'epoch'},
)


## 6. Metrics and Trainer Setup


In [None]:
from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    """Score predictions at the entity level with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` the gold
            label ids, where -100 marks positions to ignore.

    Returns:
        A dict with the seqeval F1 score under ``'f1'`` and the text
        classification report under ``'report'``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, true_predictions = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions labeled -100 are ignored on both sides.
        kept = [pos for pos, gold in enumerate(gold_row) if gold != -100]
        true_labels.append([id2tag[gold_row[pos]] for pos in kept])
        true_predictions.append([id2tag[predicted_row[pos]] for pos in kept])

    return {
        'f1': f1_score(true_labels, true_predictions),
        'report': classification_report(true_labels, true_predictions),
    }


In [None]:
# Wire the model, hyper-parameters, data splits, collator and metric function
# into a single Trainer, then run fine-tuning.
# NOTE(review): newer transformers releases reportedly replace the `tokenizer`
# argument with `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()


## 7. Evaluate and Save Model


In [None]:
results = trainer.evaluate()
# Trainer.evaluate() prefixes every metric returned by compute_metrics with
# "eval_", so the report is stored under 'eval_report'; results['report']
# would raise KeyError. Fall back to the full dict if the key is absent.
print(results.get('eval_report', results))

# Save the fine-tuned weights and the tokenizer side by side so the directory
# can be reloaded with from_pretrained('amharic-ner-model').
trainer.save_model('amharic-ner-model')
tokenizer.save_pretrained('amharic-ner-model')
