In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
label_list = ['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

In [3]:
conll_path = r"C:\Users\bless\OneDrive\Desktop\week_4\EthioMart-NER-Project-Final\data\labeled\conll_labeled_data.txt"

In [4]:
def load_conll_data(filepath):
    sentences, labels = [], []
    with open(filepath, encoding='utf-8') as f:
        words, tags = [], []
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
            else:
                try:
                    parts = line.split()
                    if len(parts) != 2:
                        print(f"Skipping line {i}: '{line}' (expected 2 parts, got {len(parts)})")
                        continue
                    token, label = parts
                    if label not in label_list:
                        print(f"Skipping line {i}: Invalid label '{label}'")
                        continue
                    words.append(token)
                    tags.append(label)
                except ValueError as e:
                    print(f"Error on line {i}: '{line}' - {e}")
                    continue
        if words:
            sentences.append(words)
            labels.append(tags)
    return pd.DataFrame({'tokens': sentences, 'ner_tags': labels})

In [5]:
df = load_conll_data(conll_path)
dataset = Dataset.from_pandas(df)

Skipping line 63: 'O' (expected 2 parts, got 1)
Skipping line 346: 'O' (expected 2 parts, got 1)
Skipping line 467: 'O' (expected 2 parts, got 1)
Skipping line 469: 'O' (expected 2 parts, got 1)
Skipping line 616: 'O' (expected 2 parts, got 1)
Skipping line 618: 'O' (expected 2 parts, got 1)
Skipping line 709: 'O' (expected 2 parts, got 1)
Skipping line 776: 'O' (expected 2 parts, got 1)
Skipping line 955: 'O' (expected 2 parts, got 1)
Skipping line 957: 'O' (expected 2 parts, got 1)
Skipping line 959: 'O' (expected 2 parts, got 1)
Skipping line 993: 'O' (expected 2 parts, got 1)
Skipping line 995: 'O' (expected 2 parts, got 1)
Skipping line 1085: 'O' (expected 2 parts, got 1)
Skipping line 1147: 'O' (expected 2 parts, got 1)
Skipping line 1149: 'O' (expected 2 parts, got 1)
Skipping line 1203: 'O' (expected 2 parts, got 1)
Skipping line 1205: 'O' (expected 2 parts, got 1)
Skipping line 1207: 'O' (expected 2 parts, got 1)
Skipping line 1209: 'O' (expected 2 parts, got 1)
Skipping line 

In [6]:
model_checkpoint = "rasyosef/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [16]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding='max_length',  
        max_length=128,
        is_split_into_words=True
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(label_to_id[label[word_idx]])
            previous_word_idx = word_idx
        
        
        input_len = len(tokenized_inputs["input_ids"][i])
        label_ids += [-100] * (input_len - len(label_ids))
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [17]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_to_id), id2label=id_to_label, label2id=label_to_id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [20]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args = TrainingArguments(
    output_dir="./models/ner_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [22]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id_to_label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]
    return load("seqeval").compute(predictions=pred_labels, references=true_labels)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [24]:
trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)

Epoch,Training Loss,Validation Loss,Loc,Price,Product,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,2.054748,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}",0.0,0.0,0.0,0.029781
2,No log,2.023542,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}",0.0,0.0,0.0,0.043887
3,No log,2.011136,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}",0.0,0.0,0.0,0.048589


Downloading builder script: 6.34kB [00:00, 1.81MB/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluation results: {'eval_loss': 2.011136054992676, 'eval_LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'eval_PRICE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'eval_Product': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.048589341692789965, 'eval_runtime': 2.1224, 'eval_samples_per_second': 2.827, 'eval_steps_per_second': 0.471, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
trainer.save_model("./models/ner_model_final")
tokenizer.save_pretrained("./models/ner_model_final")

('./models/ner_model_final\\tokenizer_config.json',
 './models/ner_model_final\\special_tokens_map.json',
 './models/ner_model_final\\vocab.txt',
 './models/ner_model_final\\added_tokens.json',
 './models/ner_model_final\\tokenizer.json')