In [16]:
import json
import re

import pandas as pd

# Input: JSON dump of the fetched messages. Output: token-per-line CoNLL file.
dataset_file = 'fetched_messages.json'
output_file = 'labeled_dataset.conll'

# Parse the whole dump up front; the code below iterates over it and treats
# every element as a dict-like message record.
with open(dataset_file, 'r', encoding='utf-8') as f:
    messages = json.load(f)

# Keep only the messages that carry text. `.get` makes a record with no
# 'text' key behave like one whose text is None (skipped) instead of
# aborting the whole run with a KeyError.
texts = [msg['text'] for msg in messages if msg.get('text') is not None]

# Entity patterns, compiled once at import time rather than on every call.

# A product token is a run of ASCII letters/digits and/or characters from the
# Ethiopic block (U+1200-U+137F). Tokens never contain whitespace, so no
# whitespace handling is needed here.
_PRODUCT_TOKEN_RE = re.compile(r'[A-Za-z0-9\u1200-\u137F]+')

# Price and location expressions may span several whitespace-separated
# tokens, so they are searched for in the full text. The (?<!\S) / (?!\S)
# guards force every match to start and end on a token boundary, which keeps
# the "whole token(s) only" strictness of a per-token fullmatch.
_PRICE_RE = re.compile(r'(?<!\S)(?:ዋጋ\s+\d+\s*ብር|\d+\s*ብር|በ\s+\d+\s*ብር)(?!\S)')
_LOC_RE = re.compile(r'(?<!\S)(?:Addis\s+Ababa|Bole|አዲስ\s+አበባ|ቦሌ)(?!\S)')

# Order matters: a token claimed by an earlier pattern is not relabeled.
_SPAN_PATTERNS = ((_PRICE_RE, 'PRICE'), (_LOC_RE, 'LOC'))

_PRODUCT_LABELS = ('B-Product', 'I-Product')


def _token_offsets(text, tokens):
    """Return the start offset in *text* of each token of ``text.split()``."""
    offsets = []
    cursor = 0
    for token in tokens:
        # Only whitespace lies between the cursor and the next token, so the
        # first occurrence at or after the cursor is the token itself.
        start = text.index(token, cursor)
        offsets.append(start)
        cursor = start + len(token)
    return offsets


def _span_labels(text, tokens):
    """Map token index -> BIO label for tokens inside a price/location match."""
    starts = _token_offsets(text, tokens)
    labels = {}
    for pattern, tag in _SPAN_PATTERNS:
        for match in pattern.finditer(text):
            for index, start in enumerate(starts):
                if match.start() <= start < match.end():
                    prefix = 'B' if start == match.start() else 'I'
                    labels.setdefault(index, f'{prefix}-{tag}')
    return labels


def label_entities(text):
    """Label the whitespace-separated tokens of *text* with BIO entity tags.

    Price and location expressions are recognised first (they may cover
    several tokens, e.g. a price keyword, an amount and a currency word);
    the first token of a match gets ``B-PRICE`` / ``B-LOC`` and the rest get
    the matching ``I-`` tag. Any remaining token made up only of ASCII
    alphanumerics or Ethiopic characters is tagged as product text:
    ``B-Product`` when it opens a run, ``I-Product`` while the run continues.
    Everything else is ``O``.

    Args:
        text: The message text to label.

    Returns:
        A list of ``(token, label)`` tuples, one per token of
        ``text.split()``, in order. Empty or whitespace-only text yields
        an empty list.
    """
    tokens = text.split()
    span_labels = _span_labels(text, tokens)

    labeled_tokens = []
    previous_label = 'O'
    for index, token in enumerate(tokens):
        label = span_labels.get(index)
        if label is None:
            if _PRODUCT_TOKEN_RE.fullmatch(token):
                # Continue the run after either B- or I-Product so a long
                # run reads B, I, I, ... rather than alternating B, I, B, I.
                if previous_label in _PRODUCT_LABELS:
                    label = 'I-Product'
                else:
                    label = 'B-Product'
            else:
                label = 'O'
        labeled_tokens.append((token, label))
        previous_label = label

    return labeled_tokens

# Export the first 50 messages as CoNLL sentences: one "token<TAB>label"
# line per token, then an empty line to close the sentence.
with open(output_file, 'w', encoding='utf-8') as conll_file:
    for message_text in texts[:50]:
        rows = [f"{tok}\t{tag}\n" for tok, tag in label_entities(message_text)]
        conll_file.writelines(rows)
        conll_file.write("\n")

print(f"Labeled dataset saved to {output_file}")


Labeled dataset saved to labeled_dataset.conll
