In [2]:
!git clone https://github.com/Sari-Amin/ethio-ner-pipeline.git

Cloning into 'ethio-ner-pipeline'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 35 (delta 7), reused 25 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 25.11 KiB | 8.37 MiB/s, done.
Resolving deltas: 100% (7/7), done.


In [3]:
!pip install -U transformers datasets

Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m118.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, transformers, datasets
  Attempting uninstall: fsspec
    Found existing installation:

In [4]:
!pip install transformers datasets accelerate peft

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from typing import List, Tuple
import numpy as np
import os


class NERTrainer:
    def __init__(self, model_name: str, label_list: List[str]):
        self.model_name = model_name
        self.label_list = label_list
        self.label_to_id = {l: i for i, l in enumerate(label_list)}
        self.id_to_label = {i: l for i, l in enumerate(label_list)}

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(label_list),
            id2label=self.id_to_label,
            label2id=self.label_to_id,
            ignore_mismatched_sizes=True # Add this line to ignore size mismatches
        )


    def load_conll_data(self, filepath: str, split: float = 0.2, seed: int = 42) -> DatasetDict:
        """
        Parse CoNLL format and return a train/validation split.
        """
        data = []
        tokens = []
        labels = []

        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    if tokens:
                        data.append({"tokens": tokens, "ner_tags": [self.label_to_id[t] for t in labels]})
                        tokens, labels = [], []
                else:
                    splits = line.split()
                    if len(splits) == 2:
                        token, label = splits
                        tokens.append(token)
                        labels.append(label)

        dataset = Dataset.from_list(data)
        dataset_split = dataset.train_test_split(test_size=split, seed=seed)
        return dataset_split



    def tokenize_and_align_labels(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True
        )

        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            aligned_labels = []
            previous_word_idx = None
            for word_idx in word_ids:
                if word_idx is None:
                    aligned_labels.append(-100)
                elif word_idx != previous_word_idx:
                    aligned_labels.append(label[word_idx])
                else:
                    aligned_labels.append(label[word_idx] if self.label_list[label[word_idx]].startswith("I-") else -100)
                previous_word_idx = word_idx
            labels.append(aligned_labels)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs


    def train(self, train_dataset: Dataset, val_dataset: Dataset, output_dir="ner_model"):
        tokenized_train = train_dataset.map(self.tokenize_and_align_labels, batched=True)
        tokenized_val = val_dataset.map(self.tokenize_and_align_labels, batched=True)

        args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            save_strategy="epoch",
            logging_dir=f"{output_dir}/logs",
            run_name="Amharic_NER_XLMR"
        )

        data_collator = DataCollatorForTokenClassification(self.tokenizer)
        trainer = Trainer(
            model=self.model,
            args=args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=self.tokenizer,
            data_collator=data_collator
        )

        trainer.train()
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

In [6]:
#xlm-roberta-base
# import sys

# sys.path.append("../")

# from src.ner_trainer.ner_trainer import NERTrainer


labels = [
            "O",
            "B-PRODUCT", "I-PRODUCT",
            "B-PRICE", "I-PRICE",
            "B-LOC", "I-LOC"
        ]
xlm_trainer = NERTrainer("xlm-roberta-base", labels)

dataset = xlm_trainer.load_conll_data("/content/ethio-ner-pipeline/Data/labeled_telegram_product_price_location.txt")

train_dataset = dataset["train"]
val_dataset = dataset["test"]

xlm_trainer.train(train_dataset, val_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.023391
2,0.104400,0.012189
3,0.104400,0.008482


In [6]:
!pip install shap



In [7]:
import shap
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from typing import List, Tuple

class NERInterpreter:
    def __init__(self, model_path: str):
        """
        Initialize with a fine-tuned Hugging Face token classification model.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path).to(self.device)
        self.model.eval()
        self.id2label = self.model.config.id2label

    def predict(self, sentence: str) -> List[Tuple[str, str]]:
        """
        Predict token-level labels.
        """
        inputs = self.tokenizer(sentence, return_tensors="pt", truncation=True).to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits
            preds = torch.argmax(logits, dim=-1).squeeze()
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
        labels = [self.id2label[i.item()] for i in preds]
        return list(zip(tokens, labels))

    def explain(self, sentence: str):
        """
        Explain prediction using SHAP DeepExplainer.
        """
        inputs = self.tokenizer(sentence, return_tensors="pt", truncation=True, return_offsets_mapping=True)
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)

        def model_forward(x):
            x = torch.tensor(x, dtype=torch.long).to(self.device)
            with torch.no_grad():
                outputs = self.model(input_ids=x, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)
            top_logits = torch.stack([outputs.logits[i, :, preds[i]] for i in range(outputs.logits.shape[0])])
            return top_logits.detach().cpu().numpy()

        # Use a small dummy input as background for SHAP
        background = input_ids.detach().cpu().numpy()

        explainer = shap.DeepExplainer(self.model, background)
        shap_values = explainer.shap_values(input_ids.detach().cpu().numpy())

        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
        predictions = self.predict(sentence)
        print("Token\tLabel\tSHAP")
        for token, (_, label), value in zip(tokens, predictions, shap_values[0][0]):
            print(f"{token}\t{label}\t{value:.4f}")

In [12]:
#rasyosef/bert-tiny-amharic
# import sys

# sys.path.append("../")

# from src.ner_trainer.ner_trainer import NERTrainer


labels = [
            "O",
            "B-PRODUCT", "I-PRODUCT",
            "B-PRICE", "I-PRICE",
            "B-LOC", "I-LOC"
        ]
bert_trainer = NERTrainer("rasyosef/bert-tiny-amharic", labels)

dataset = bert_trainer.load_conll_data("/content/ethio-ner-pipeline/Data/labeled_telegram_product_price_location.txt")

train_dataset = dataset["train"]
val_dataset = dataset["test"]

bert_trainer.train(train_dataset, val_dataset)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.486408
2,0.729000,0.297654
3,0.729000,0.256547


In [8]:
#masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0
# import sys

# sys.path.append("../")

# from src.ner_trainer.ner_trainer import NERTrainer


labels = [
            "O",
            "B-PRODUCT", "I-PRODUCT",
            "B-PRICE", "I-PRICE",
            "B-LOC", "I-LOC"
        ]

afro_trainer = NERTrainer("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0", labels)

dataset = afro_trainer.load_conll_data("/content/ethio-ner-pipeline/Data/labeled_telegram_product_price_location.txt")

train_dataset = dataset["train"]
val_dataset = dataset["test"]

afro_trainer.train(train_dataset, val_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/404 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([7, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.011689
2,0.076000,0.008061


Epoch,Training Loss,Validation Loss
1,No log,0.011689
2,0.076000,0.008061
3,0.076000,0.004394


In [9]:
#explain xlm_roberta
# Load your trained model
xlm = NERInterpreter("/content/ner_model")

# Sentence to interpret
text = "አዲስ የልጆች ጫማ ዋጋ 2500 ብር በቦሌ"

# Predict
print(xlm.predict(text))

# SHAP Visualization
xlm.explain(text)


[('<s>', 'O'), ('▁አዲስ', 'B-PRODUCT'), ('▁የ', 'I-PRODUCT'), ('ል', 'I-PRODUCT'), ('ጆች', 'I-PRODUCT'), ('▁', 'I-PRODUCT'), ('ጫ', 'I-PRODUCT'), ('ማ', 'I-PRODUCT'), ('▁ዋጋ', 'I-PRICE'), ('▁2500', 'I-PRICE'), ('▁ብር', 'I-PRICE'), ('▁በ', 'O'), ('ቦ', 'I-LOC'), ('ሌ', 'O'), ('</s>', 'O')]


TypeError: 'int' object is not callable

In [8]:
!zip -r /content/file.zip /content/ner_model

  adding: content/ner_model/ (stored 0%)
  adding: content/ner_model/model.safetensors

 (deflated 29%)
  adding: content/ner_model/logs/ (stored 0%)
  adding: content/ner_model/logs/events.out.tfevents.1750950892.6935ff899284.493.0 (deflated 59%)
  adding: content/ner_model/special_tokens_map.json (deflated 52%)
  adding: content/ner_model/sentencepiece.bpe.model (deflated 49%)
  adding: content/ner_model/checkpoint-317/ (stored 0%)
  adding: content/ner_model/checkpoint-317/optimizer.pt (deflated 70%)
  adding: content/ner_model/checkpoint-317/training_args.bin (deflated 51%)
  adding: content/ner_model/checkpoint-317/model.safetensors (deflated 29%)
  adding: content/ner_model/checkpoint-317/special_tokens_map.json (deflated 52%)
  adding: content/ner_model/checkpoint-317/sentencepiece.bpe.model (deflated 49%)
  adding: content/ner_model/checkpoint-317/config.json (deflated 53%)
  adding: content/ner_model/checkpoint-317/tokenizer.json (deflated 76%)
  adding: content/ner_model/chec