In [4]:
from transformers import TrainingArguments

# Presumably a smoke test that this transformers install accepts the
# `eval_strategy` keyword -- TODO confirm the intent with the author.
args = TrainingArguments(output_dir="test-output", eval_strategy="epoch")
print("Success!")

Success!


In [5]:
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification
)
from datasets import Dataset
import numpy as np
import pandas as pd
import re

In [6]:
import re

input_path = '../data/processed/ner_labels_template.conll'
output_path = '../data/processed/ner_labels_template_cleaned.conll'

# Known label typos as (regex, replacement) pairs; applied to every line in
# this exact order.
LABEL_TYPO_FIXES = (
    (r'B-CONTACT_INFO\\\\?', 'B-CONTACT_INFO'),
    (r'I-PRICErocessor', 'I-PRICE'),
    (r'B-PRICEroduct', 'B-Product'),
    (r'I-PRICEroduct', 'I-Product'),
)

with open(input_path, 'r', encoding='utf-8') as infile, \
        open(output_path, 'w', encoding='utf-8') as outfile:
    for raw_line in infile:
        fixed_line = raw_line
        for typo_pattern, corrected in LABEL_TYPO_FIXES:
            fixed_line = re.sub(typo_pattern, corrected, fixed_line)
        # Drop the line when the word 'Channel' occurs anywhere on it.
        # NOTE(review): this also matches a *token* spelled 'Channel', not only
        # a 'Channel' label -- confirm that is intended.
        if not re.search(r'\bChannel\b', fixed_line):
            outfile.write(fixed_line)
print("Cleaned file written to", output_path)

Cleaned file written to ../data/processed/ner_labels_template_cleaned.conll


In [7]:
def parse_conll(filepath):
    """Read a CoNLL-style file into parallel token and tag sequences.

    Blank (or whitespace-only) lines mark sentence boundaries. On every other
    line the first whitespace-separated field is taken as the token and the
    second as its tag; lines with fewer than two fields are skipped.

    Args:
        filepath: Path of a UTF-8 encoded text file.

    Returns:
        A tuple ``(sentences, labels)``. Both are lists with one inner list
        per sentence, holding the tokens and the tags respectively.
    """
    sentences, labels = [], []
    current_tokens, current_tags = [], []

    with open(filepath, encoding='utf-8') as handle:
        for raw in handle:
            fields = raw.strip().split()
            if not fields:
                # Sentence boundary: flush only when something was collected,
                # so consecutive blank lines do not create empty sentences.
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_tags)
                    current_tokens, current_tags = [], []
                continue
            if len(fields) < 2:
                continue
            current_tokens.append(fields[0])
            current_tags.append(fields[1])

    # The file may end without a trailing blank line.
    if current_tokens:
        sentences.append(current_tokens)
        labels.append(current_tags)
    return sentences, labels

# Re-declare the cleaned-file path so this cell can run without the cleaning
# cell, then parse it into per-message token and tag lists.
output_path = '../data/processed/ner_labels_template_cleaned.conll'  
sentences, tags = parse_conll(output_path)
print(f"Loaded {len(sentences)} messages")

Loaded 104 messages


In [8]:
# Wrap the parsed messages in a Dataset and hold out 20% for evaluation
# (fixed seed so the split is repeatable).
data = {'tokens': sentences, 'ner_tags': tags}
dataset = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=42)

# Label vocabulary: every distinct tag in the corpus, sorted so ids are stable.
label_list = sorted({tag for tag_seq in tags for tag in tag_seq})
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = dict(enumerate(label_list))
print(label2id)

{'B-CONTACT_INFO': 0, 'B-LOC': 1, 'B-PRICE': 2, 'B-Product': 3, 'B-QUANTITY': 4, 'B-SPECIFICATION': 5, 'I-CONTACT_INFO': 6, 'I-LOC': 7, 'I-PRICE': 8, 'I-Product': 9, 'O': 10}


In [9]:
# Checkpoint to fine-tune; the trailing comment lists alternative checkpoints.
# The tokenizer is loaded from the same checkpoint name.
model_name = "xlm-roberta-base"  # or "Davlan/bert-tiny-amharic" or "Davlan/afroxlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level tags to sub-tokens.

    Reads the module-level ``tokenizer`` and ``label2id``.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            example) and ``"ner_tags"`` (one list of tag strings per example).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Only the first
        sub-token of each word carries the word's label id; special tokens and
        continuation sub-tokens get ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            starts_new_word = word is not None and word != last_word
            row_labels.append(label2id[word_tags[word]] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)
    encoded["labels"] = aligned
    return encoded

# Apply the alignment function batch-wise to both splits of `dataset`.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [10]:
from seqeval.metrics import f1_score, accuracy_score, precision_score, recall_score

def compute_metrics(p):
    """Compute entity-level seqeval scores for one evaluation pass.

    Reads the module-level ``id2label`` mapping.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2); ``labels`` holds the
            gold label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with ``"f1"``, ``"precision"``, ``"recall"`` and ``"accuracy"``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Keep only positions that carry a real label (-100 = ignored sub-token).
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Early epochs can predict no entities at all, which makes precision
    # undefined; report 0 explicitly instead of emitting a warning each time.
    return {
        "f1": f1_score(true_labels, true_predictions, zero_division=0),
        "precision": precision_score(true_labels, true_predictions, zero_division=0),
        "recall": recall_score(true_labels, true_predictions, zero_division=0),
        "accuracy": accuracy_score(true_labels, true_predictions)
    }

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification


# Load the pretrained checkpoint with a token-classification head sized to the
# label vocabulary built earlier in the notebook.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and log once per epoch; no intermediate checkpoints are requested.
args = TrainingArguments(
    output_dir="ner-finetuned-amharic",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=40,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy="epoch",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model="eval_loss",
    report_to="none",
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer.__init__ (it triggers the warning
    # seen in this cell's output); `processing_class` is the replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,2.3749,1.582383,0.0,0.0,0.0,0.553571
2,1.3785,1.28643,0.236967,0.714286,0.142045,0.584034
3,1.189,1.006951,0.279365,0.316547,0.25,0.709034
4,0.8812,0.738674,0.44898,0.461078,0.4375,0.793067
5,0.6537,0.625921,0.415205,0.427711,0.403409,0.779412
6,0.5755,0.506772,0.411111,0.402174,0.420455,0.807773
7,0.4914,0.437322,0.527778,0.516304,0.539773,0.852941
8,0.4327,0.39374,0.594444,0.581522,0.607955,0.867647
9,0.3651,0.373573,0.606232,0.60452,0.607955,0.883403
10,0.3032,0.335183,0.653521,0.648045,0.659091,0.887605


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=440, training_loss=0.2875511886043982, metrics={'train_runtime': 459.7339, 'train_samples_per_second': 7.222, 'train_steps_per_second': 0.957, 'total_flos': 550516933306758.0, 'train_loss': 0.2875511886043982, 'epoch': 40.0})

In [12]:
# Save the fine-tuned model held by `trainer` to a dedicated directory
trainer.save_model("ner-finetuned-amharic-final")

# Save the tokenizer into the same directory as the model
tokenizer.save_pretrained("ner-finetuned-amharic-final")

print("Model and tokenizer saved to 'ner-finetuned-amharic-final'")

Model and tokenizer saved to 'ner-finetuned-amharic-final'


In [13]:
def predict_ner(text, model, tokenizer, id2label):
    """Predict one NER label per word of ``text``.

    The text is tokenized once. With a fast tokenizer, sub-tokens are grouped
    into words through the encoding's word alignment; each word is returned as
    its original text together with the label predicted for its first
    sub-token. Slow tokenizers expose no word alignment, so every token
    (special tokens included) is returned as its own item.

    Args:
        text: Raw input string.
        model: Token-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        id2label: Mapping from predicted class id to label string.

    Returns:
        List of ``(text_piece, label)`` tuples in input order.
    """
    import torch  # local import: the function no longer depends on cell order

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()  # inference only: make sure dropout is disabled

    # truncation=True keeps over-long inputs within the model's maximum length.
    encoding = tokenizer(
        text, return_tensors="pt", is_split_into_words=False, truncation=True
    )

    # Word alignment lives on the encoding returned by a *fast* tokenizer, not
    # on the tokenizer object. Read it before the encoding becomes a plain dict.
    word_ids = None
    if getattr(tokenizer, "is_fast", False):
        word_ids = encoding.word_ids(batch_index=0)

    inputs = {k: v.to(device) for k, v in encoding.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

    if word_ids is None:
        # Fallback: treat each token as a word.
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
        return [(token, id2label[pred]) for token, pred in zip(tokens, predictions)]

    results = []
    previous_word_idx = None
    for idx, word_id in enumerate(word_ids):
        # Skip special tokens and continuation sub-tokens of the current word.
        if word_id is None or word_id == previous_word_idx:
            continue
        span = encoding.word_to_chars(0, word_id)
        # strip() guards against offsets that include an adjacent space.
        word = text[span.start:span.end].strip()
        results.append((word, id2label[predictions[idx]]))
        previous_word_idx = word_id
    return results

In [18]:
import torch
def pretty_print_ner(text, model, tokenizer, id2label):
    """Print the non-``O`` predictions for ``text``, one ``piece: label`` per line.

    Calls ``predict_ner`` and skips sentence-boundary markers, the bare ``▁``
    piece, and anything labelled ``O``. The ``▁`` marker is removed from the
    printed text for readability.
    """
    skipped_pieces = ("<s>", "</s>", "▁")
    for piece, tag in predict_ner(text, model, tokenizer, id2label):
        if tag != "O" and piece not in skipped_pieces:
            print(f"{piece.replace('▁', '')}: {tag}")

# Try the model on one mixed Amharic/English sentence. This uses whichever
# `model`, `tokenizer` and `id2label` are currently bound in the kernel.
sample_text = "ይህ አዲስ Long-lasting battery በ 5000 ብር ይሸጣል መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ላይ ያገኙት።"
pretty_print_ner(sample_text, model, tokenizer, id2label)

Long: B-Product
-: I-Product
lasting: I-Product
battery: I-Product
5000: B-PRICE
ብር: I-PRICE
መ: B-LOC
ገና: B-LOC
ኛ: B-LOC
መሰ: I-LOC
ረት: I-LOC
_: I-LOC
ደ: I-LOC
ፋር: I-LOC
_: I-LOC
ሞ: I-LOC
ል: I-LOC
_: I-LOC
ሁለተኛ: I-LOC
_: I-LOC
ፎ: I-LOC
ቅ: I-LOC
ላይ: I-LOC


In [19]:
# Evaluate the current `trainer` on the evaluation split it was constructed
# with and print the resulting metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.343707412481308, 'eval_f1': 0.8221574344023324, 'eval_precision': 0.844311377245509, 'eval_recall': 0.8011363636363636, 'eval_accuracy': 0.9023109243697479, 'eval_runtime': 0.4651, 'eval_samples_per_second': 45.152, 'eval_steps_per_second': 6.45, 'epoch': 40.0}


## New model: fine-tuning `bert-base-multilingual-cased`


In [21]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch

# Second experiment: rebind `tokenizer` and `model` to multilingual BERT.
# The label vocabulary (`label_list`, `id2label`, `label2id`) built earlier in
# the notebook is reused unchanged.
model_name_or_path = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and give each word's first sub-token its label.

    Uses the module-level ``tokenizer`` and ``label2id`` as bound when the
    function is called.

    Args:
        examples: Batch mapping with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of tag strings, parallel to the words).

    Returns:
        The tokenizer output with an added ``"labels"`` entry, where special
        tokens and continuation sub-tokens are marked ``-100``.
    """
    batch = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def first_piece_labels(word_ids, word_tags):
        # One label id per sub-token position; -100 for everything except the
        # first sub-token of a word.
        ids = []
        for pos, word in enumerate(word_ids):
            is_special = word is None
            continues_word = pos > 0 and word == word_ids[pos - 1]
            if is_special or continues_word:
                ids.append(-100)
            else:
                ids.append(label2id[word_tags[word]])
        return ids

    batch["labels"] = [
        first_piece_labels(batch.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return batch


In [24]:

# Re-tokenize both splits; this overwrites the earlier `tokenized_datasets`
# using whichever `tokenizer` is currently bound.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/83 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [26]:
# Training configuration for the multilingual-BERT run: per-epoch evaluation
# and logging, no checkpoint saving, 40 epochs at batch size 8.
training_args = TrainingArguments(
    output_dir="./ner-finetuned-bert-multilingual",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=40,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model="eval_loss",
    report_to="none"
)

# Collator built from the currently bound `tokenizer`.
data_collator = DataCollatorForTokenClassification(tokenizer)


In [27]:
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score

def compute_metrics(p):
    """Compute entity-level seqeval scores for one evaluation pass.

    Reads the module-level ``id2label`` mapping.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over dimension 2); ``labels`` holds
            the gold label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with ``"f1"``, ``"precision"``, ``"recall"`` and ``"accuracy"``.
    """
    predictions, labels = p
    predictions = torch.argmax(torch.tensor(predictions), dim=2).tolist()

    # Keep only positions that carry a real label (-100 = ignored sub-token).
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Precision is undefined when no entity is predicted; report 0 explicitly
    # rather than emitting an undefined-metric warning.
    return {
        "f1": f1_score(true_labels, true_predictions, zero_division=0),
        "precision": precision_score(true_labels, true_predictions, zero_division=0),
        "recall": recall_score(true_labels, true_predictions, zero_division=0),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }


In [28]:
# Train multilingual BERT on the re-tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `tokenizer=` is deprecated on Trainer.__init__ (it triggers the warning
    # seen in this cell's output); `processing_class` is the replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,1.6881,1.359956,0.09009,0.217391,0.056818,0.619748
2,1.1172,1.011481,0.299595,0.521127,0.210227,0.67542
3,0.8933,0.773139,0.374101,0.509804,0.295455,0.746849
4,0.6854,0.622548,0.575163,0.676923,0.5,0.805672
5,0.5607,0.477925,0.60423,0.645161,0.568182,0.837185
6,0.4723,0.430517,0.631579,0.650602,0.613636,0.856092
7,0.4024,0.385667,0.679365,0.769784,0.607955,0.857143
8,0.3745,0.319767,0.674286,0.678161,0.670455,0.894958
9,0.3268,0.315125,0.687861,0.7,0.676136,0.893908
10,0.3103,0.260179,0.752239,0.792453,0.715909,0.928571


TrainOutput(global_step=440, training_loss=0.2650248642672192, metrics={'train_runtime': 301.0764, 'train_samples_per_second': 11.027, 'train_steps_per_second': 1.461, 'total_flos': 538477930553406.0, 'train_loss': 0.2650248642672192, 'epoch': 40.0})

In [30]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
trainer.save_model("./ner-finetuned-bert-multilingual")
tokenizer.save_pretrained("./ner-finetuned-bert-multilingual")


('./ner-finetuned-bert-multilingual\\tokenizer_config.json',
 './ner-finetuned-bert-multilingual\\special_tokens_map.json',
 './ner-finetuned-bert-multilingual\\vocab.txt',
 './ner-finetuned-bert-multilingual\\added_tokens.json',
 './ner-finetuned-bert-multilingual\\tokenizer.json')