In [14]:
# Step 0: Install Dependencies
!pip install transformers datasets seqeval evaluate



In [3]:
#Step 1: Imports
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_metric
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import os
from google.colab import files

In [4]:
#Step 2: Upload

uploaded = files.upload()
conll_path = "labeled.txt"

Saving labeled.txt to labeled (1).txt


In [5]:
#ste3 : Parse CONLL File

def parse_conll(filepath):
    tokens = []
    labels = []
    sentence_id = 0
    all_data = []

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == "":
                if tokens:
                    for i in range(len(tokens)):
                        all_data.append({
                            "sentence_id": sentence_id,
                            "tokens": tokens[i],
                            "ner_tags": labels[i]
                        })
                    sentence_id += 1
                    tokens, labels = [], []
                continue

            if line.startswith("#"):  # ignore comments
                continue

            splits = line.split()
            if len(splits) >= 2:
                token, label = splits[0], splits[-1]
                tokens.append(token)
                labels.append(label)

    return pd.DataFrame(all_data)

df = parse_conll(conll_path)
df.head()

Unnamed: 0,sentence_id,tokens,ner_tags
0,0,3pcs,B-Product
1,0,Bottle,I-Product
2,0,Stopper,I-Product
3,0,በማንኛውም,O
4,0,ጠርሙስ,O


In [6]:
# step4 Group by sentence ID
grouped = df.groupby("sentence_id").agg({"tokens": list, "ner_tags": list}).reset_index()

# Flatten tokens (if nested)
def flatten_tokens(token_list):
    flat = []
    for t in token_list:
        if isinstance(t, list):
            flat.extend(t)
        else:
            flat.append(t)
    return flat

grouped["tokens"] = grouped["tokens"].apply(flatten_tokens)
grouped.head()

Unnamed: 0,sentence_id,tokens,ner_tags
0,0,"[3pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠ...","[B-Product, I-Product, I-Product, O, O, O, O, ..."
1,1,"[1, pairs, Sneaker, Crease, Protector, ዋጋ, 400...","[B-Product, I-Product, I-Product, I-Product, I..."
2,2,"[Imitation, Volcano, Humidifier, with, LED, Li...","[B-Product, I-Product, I-Product, I-Product, I..."
3,3,"[Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ...","[B-Product, I-Product, O, O, O, O, O, O, B-PRI..."
4,4,"[Smart, Usb, Ultrasonic, Car, And, Home, Air, ...","[B-Product, I-Product, I-Product, I-Product, I..."


In [7]:
# Flatten tokens so they are like ['item1', 'item2', ...] not [['item1'], ['item2']]
def flatten_tokens(token_list):
    return [t[0] if isinstance(t, list) else t for t in token_list]

grouped["tokens"] = grouped["tokens"].apply(flatten_tokens)


In [17]:
#Step 4: Define Label Mapping
label_list = ["O", "B-Product", "I-Product", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]
label_to_id = {label: i for i, label in enumerate(label_list)}

def encode_tags(tags):
    return [label_to_id.get(tag, label_to_id["O"]) for tag in tags]

grouped["ner_tags"] = grouped["ner_tags"].apply(encode_tags)


In [19]:
#Step 5: Train-Validation Split
train_grouped, val_grouped = train_test_split(grouped, test_size=0.2, random_state=42)

In [20]:
#Step 6: Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_grouped)
val_dataset = Dataset.from_pandas(val_grouped)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})

In [21]:
#Step 7: Tokenization Setup
model_checkpoint = "Davlan/xlm-roberta-base-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
label_all_tokens = True
# Step: Define tokenize_and_align_labels after tokenizer is defined
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=128,
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(labels[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


In [22]:
# Step 7: Tokenization (add this AFTER dataset creation)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [23]:
# Step: Remove unnecessary columns before training to avoid warnings/errors
columns_to_remove = ['tokens', 'ner_tags', 'sentence_id', '__index_level_0__']
columns_to_remove = [col for col in columns_to_remove if col in tokenized_dataset["train"].column_names]

tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)



In [24]:
#Step 9: Load Model
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    ignore_mismatched_sizes=True
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-ner-hrl and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
#Step 10: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./ner_model",
    eval_strategy="epoch",
    learning_rate=2e-5,  # Slightly higher LR
    per_device_train_batch_size=8,  # smaller batch fits memory better
    per_device_eval_batch_size=8,
    num_train_epochs=14,  # more epochs
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    remove_unused_columns=False,
    metric_for_best_model="f1",
    load_best_model_at_end=True
)


In [26]:
#Step 11: Define metrics
import evaluate
metric = evaluate.load("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for label, pred in zip(labels, predictions):
        # Filter out ignored labels (-100)
        active_labels = [label_list[l] for l in label if l != -100]
        active_preds = [label_list[p] for (p, l) in zip(pred, label) if l != -100]

        # Only add if there are true entities (not all 'O')
        if any(tag != 'O' for tag in active_labels):
            true_labels.append(active_labels)
            true_predictions.append(active_preds)

    if not true_labels:  # All batches are 'O'
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "accuracy": 0.0}

    results = metric.compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=0  # Explicitly handle division-by-zero
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],  # Fixed typo (previously "recall")
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [27]:
#Step 12: Initialize Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [28]:
#Step 13: Train the Model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mwaleligntagesse[0m ([33mwaleligntagesse-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.671758,0.0,0.0,0.0,0.463855
2,No log,1.425919,0.0,0.0,0.0,0.620482
3,No log,1.274694,0.0,0.0,0.0,0.638554
4,No log,1.171934,0.0,0.0,0.0,0.638554
5,No log,1.098571,0.0,0.0,0.0,0.638554
6,No log,1.041808,0.0,0.0,0.0,0.638554
7,No log,0.992915,0.0,0.0,0.0,0.638554
8,No log,0.946902,0.0,0.0,0.0,0.638554
9,No log,0.902557,0.0,0.0,0.0,0.644578
10,1.290800,0.8612,0.0,0.0,0.0,0.644578


TrainOutput(global_step=14, training_loss=1.154725228037153, metrics={'train_runtime': 2068.6764, 'train_samples_per_second': 0.047, 'train_steps_per_second': 0.007, 'total_flos': 6402059930112.0, 'train_loss': 1.154725228037153, 'epoch': 14.0})

In [29]:
from collections import Counter
all_tags = [tag for tags in grouped["ner_tags"] for tag in tags]
print(Counter(all_tags))

Counter({0: 239, 2: 40, 6: 36, 4: 19, 1: 9, 3: 9, 5: 9})


In [34]:
import torch

# 1. Define your label list (must match training)
bio_label_list = ["O", "B-Product", "I-Product", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]

# 2. Load original sentences from labeled.txt
with open("labeled.txt", "r") as f:
    original_lines = [line.strip() for line in f if line.strip() and not line.startswith("#")]

# 3. Get tokenized sample and predictions
sample_idx = 0  # Change this to inspect different samples
tokenized_sample = tokenized_dataset["validation"][sample_idx]

with torch.no_grad():
    inputs = {k: torch.tensor([v]) for k,v in tokenized_sample.items() if k != "labels"}
    predictions = model(**inputs)

# 4. Process predictions
predicted_tags = [bio_label_list[i] for i in predictions.logits.argmax(-1)[0].numpy()]
true_tags = [bio_label_list[i] for i in tokenized_sample["labels"] if i != -100]

# 5. Find corresponding original sentence
original_tokens = original_lines[sample_idx].split()  # Simple space splitting - adjust if your file uses different delimiters

# 6. Print aligned results
print("ORIGINAL SENTENCE:", " ".join(original_tokens))
print("\nTOKEN\t\tPRED\tTRUE")
print("-----------------------")
for i, token in enumerate(original_tokens):
    if i >= len(predicted_tags) or i >= len(true_tags):
        break
    print(f"{token[:10]}\t{predicted_tags[i]}\t{true_tags[i]}")

ORIGINAL SENTENCE: 3pcs B-Product

TOKEN		PRED	TRUE
-----------------------
3pcs	I-Product	B-Product
B-Product	B-LOC	B-Product


In [35]:
!git
!git remote add origin https://github.com/WaleTg/ethio_mart_amharic_ner.git

usage: git [--version] [--help] [-C <path>] [-c <name>=<value>]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p | --paginate | -P | --no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
           [--super-prefix=<path>] [--config-env=<name>=<envvar>]
           <command> [<args>]

These are common Git commands used in various situations:

start a working area (see also: git help tutorial)
   clone     Clone a repository into a new directory
   init      Create an empty Git repository or reinitialize an existing one

work on the current change (see also: git help everyday)
   add       Add file contents to the index
   mv        Move or rename a file, a directory, or a symlink
   restore   Restore working tree files
   rm        Remove files from the working tree and from the index

examine the history and state (see also: git help revisions)
   bisect    Use binary search to find th

In [36]:
!git config --global user.name "WaleTg"
!git config --global user.email "waleligntagesse@gmail.com"


In [39]:
!git init


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [40]:
!git add Un1.ipynb
!git commit -m "Add Un1 notebook"


fatal: pathspec 'Un1.ipynb' did not match any files
On branch master

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.config/[m
	[31mlabeled (1).txt[m
	[31mlabeled.txt[m
	[31mlogs/[m
	[31mner_model/[m
	[31msample_data/[m
	[31mwandb/[m

nothing added to commit but untracked files present (use "git add" to track)
