In [None]:
# 📘 data_labeling.ipynb or data_labeling.py

import ast
import os
import random

import pandas as pd


# 📥 Step 1: Load cleaned messages
input_path = "../data/processed/telegram_messages_20250621_052911_cleaned.csv"

df = pd.read_csv(input_path)

# 🧪 Step 2: Sample 50 tokenized messages
# Shuffling with a fixed seed makes the selection reproducible across runs.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
df_sample = df_shuffled.head(50)

# The "tokens" column is read back from the CSV as text and parsed here as a
# Python list literal. ast.literal_eval accepts literals only, so (unlike
# eval) a crafted cell in the CSV cannot execute arbitrary code.
unlabeled_data = [
    {"tokens": ast.literal_eval(raw_tokens)} for raw_tokens in df_sample["tokens"]
]

# 👀 Step 3: Show 5 for manual labeling
print("\n🔍 Preview of messages for labeling:\n")
# Only the first five samples are shown; headers are numbered from 1.
for position, entry in enumerate(unlabeled_data[:5], start=1):
    print(f"🟦 Sample {position}:")
    print("Tokens:", entry["tokens"])
    print()

# ✍️ Step 4: Add your labels manually here
# For each labeled example, ensure:
#   - token list length == label list length
#   - BIO labels use the format B-TAG, I-TAG, O

# Each entry pairs a token list with its BIO label list (same length, same order).
_manual_annotations = [
    (
        ["ለልጆች", "ጫማ", "በ", "350", "ብር"],
        ["B-PRODUCT", "I-PRODUCT", "O", "B-PRICE", "I-PRICE"],
    ),
    (
        ["በአዲስ", "አበባ", "የሚገኝ", "መኪና"],
        ["B-LOC", "I-LOC", "O", "B-PRODUCT"],
    ),
    # 🔁 Copy more token sets from `unlabeled_data` and annotate them
]

# Expand the pairs into the {"tokens": ..., "labels": ...} records that are exported.
labeled_data = [
    {"tokens": sample_tokens, "labels": sample_labels}
    for sample_tokens, sample_labels in _manual_annotations
]

# 💾 Step 5: Save to CoNLL format
def save_conll(data, filepath):
    """Write labeled samples to ``filepath`` in CoNLL format.

    Every token is written on its own line as ``token<TAB>label`` and each
    sample is terminated by a blank line. The parent directory of
    ``filepath`` is created when it does not exist yet.

    Args:
        data: Sequence of dicts, each with a "tokens" list and a "labels"
            list of the same length.
        filepath: Destination path of the CoNLL file.

    Raises:
        ValueError: If a sample's token and label lists differ in length.
    """
    # Validate before opening the file: zip() would otherwise silently drop
    # the unmatched tail and write a corrupted annotation.
    for index, item in enumerate(data):
        token_count = len(item["tokens"])
        label_count = len(item["labels"])
        if token_count != label_count:
            raise ValueError(
                f"Sample {index} has {token_count} tokens but {label_count} labels"
            )

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as f:
        for item in data:
            f.writelines(
                f"{token}\t{label}\n"
                for token, label in zip(item["tokens"], item["labels"])
            )
            f.write("\n")
    print(f"✅ Saved {len(data)} labeled samples to: {filepath}")

# 💽 Step 6: Export annotated data
# NOTE(review): the input CSV is read from "../data/..." while this writes under
# "data/..." relative to the working directory — confirm the intended location.
save_conll(data=labeled_data, filepath="data/labeled/amharic_ner.conll")




# 📦 Imports
import pandas as pd
from IPython.display import display

import os
# Make sure the folder the pickle is read from exists.
# NOTE(review): this only creates the directory; nothing visible in this file
# writes unlabeled_data.pkl, so the read below fails unless it was produced elsewhere.
os.makedirs("data/unlabeled", exist_ok=True)


# 📥 Load tokenized unlabeled data
# SECURITY: unpickling can execute arbitrary code — only load pickle files
# that you created yourself.
df = pd.read_pickle("data/unlabeled/unlabeled_data.pkl")  # Or replace with your variable

# 🔢 Number of samples to label
# How many rows to annotate in one session; raise it to label more.
NUM_TO_LABEL = 5

# 📝 Accepted tags: a B-/I- pair per entity type, plus "O" for everything else
label_options = [
    "B-PRODUCT", "I-PRODUCT",
    "B-LOC", "I-LOC",
    "B-PRICE", "I-PRICE",
    "O",
]

# 🧠 Annotated samples are accumulated here
labeled_data = []

# 🧰 Labeling function
def label_sample(tokens):
    """Interactively collect one tag per token from the console.

    The whole token sequence is printed once, then each token is shown with
    the list of accepted tags. The answer is stripped and upper-cased, and
    the prompt repeats until it is one of ``label_options``.

    Args:
        tokens: List of token strings to annotate.

    Returns:
        A dict with the original "tokens" and the chosen "labels", in order.
    """
    print(f"\n🟨 Tokens to label: {' '.join(tokens)}\n")

    chosen = []
    for current in tokens:
        print(f"Token: {current}")
        print("Available Tags: ", label_options)

        # First attempt uses the normal prompt; retries use the error prompt.
        prompt = "Enter label: "
        while True:
            answer = input(prompt).strip().upper()
            if answer in label_options:
                break
            prompt = "Invalid label. Try again: "
        chosen.append(answer)

    return {"tokens": tokens, "labels": chosen}

# ▶️ Start labeling
# Never index past the end of the frame: df.iloc[i] raises IndexError when
# fewer than NUM_TO_LABEL rows are available.
sample_count = min(NUM_TO_LABEL, len(df))
if sample_count < NUM_TO_LABEL:
    print(f"⚠️ Only {sample_count} of {NUM_TO_LABEL} requested samples are available")

for i in range(sample_count):
    print(f"\n🔷 Sample {i + 1}/{sample_count}")
    tokens = df.iloc[i]["tokens"]
    labeled_data.append(label_sample(tokens))

# 💾 Save to CoNLL format
def save_conll(data, filepath):
    """Write labeled samples to ``filepath`` in CoNLL format.

    Each token goes on its own line as ``token<TAB>label``; a blank line
    separates samples. The parent directory of ``filepath`` is created when
    it is missing.

    Args:
        data: Sequence of dicts, each with a "tokens" list and a "labels"
            list of the same length.
        filepath: Destination path of the CoNLL file.

    Raises:
        ValueError: If a sample's token and label lists differ in length.
    """
    # Check every sample up front so a bad one cannot leave a half-written
    # file; zip() alone would silently truncate to the shorter list.
    for index, item in enumerate(data):
        if len(item["tokens"]) != len(item["labels"]):
            raise ValueError(
                f"Sample {index} has {len(item['tokens'])} tokens "
                f"but {len(item['labels'])} labels"
            )

    # The target folder may not exist yet; skip creation for a bare file
    # name, where dirname is "" and os.makedirs would raise.
    target_dir = os.path.dirname(filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as f:
        for item in data:
            for token, label in zip(item["tokens"], item["labels"]):
                f.write(f"{token}\t{label}\n")
            f.write("\n")

# Write this session's annotations, then confirm the destination to the user.
save_conll(data=labeled_data, filepath="data/labeled/amharic_ner.conll")
print("✅ Done labeling. Saved to: data/labeled/amharic_ner.conll")




🔍 Preview of messages for labeling:

🟦 Sample 1:
Tokens: ['One', 'Step', 'Hair', 'Dryer', 'Styler', 'ከርል', 'ለመስራት', 'ለማለስለስ', 'እንዲሁም', 'ለማድረቅ', 'የሚያገለግል', 'ለኢትዮጵያውያን', 'ፀጉር', 'ተስማሚ', 'የሙቀት', 'መቆጣጠሪያ', 'ስላለው', 'ለአጠቃቀም', 'ምቹ', 'ዋጋ፦', '1600', 'ብር', 'አድራሻ', 'መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ', 'ቢሮ', 'ቁ.', 'S05S06', '0902660722', '0928460606', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', 'zemencallcenter', 'zemenexpressadmin', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን']

🟦 Sample 2:
Tokens: ['ማስፈንጠሪያውን', 'ተጭነው', 'አሁኑኑ', 'ይመዝገቡ', '፤', '10', 'ቅናሽ', 'ያግኙ', '!', '!', 'የተንጋደደውን', 'ፊደል', 'የሚያቀና', 'መላው', 'ይኸውና', '!', '!']

🟦 Sample 3:
Tokens: ['2in1', 'Portable', 'Dumpling', 'Making', 'Machine', 'ዋጋ፦', '650', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'ቁ.መገናኛ', 'መሰረት', 'ደፋር', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ.', 'S05S06', 'ቁ.ፒያሳ', 'ጊዮርጊስ', 'አደባባይ', 'ራመት_ታቦር_ኦዳ_ህንፃ', '1ኛ', 'ፎቅ', 'ሱቅ', 'ቁ.', 'G1', '107', '0902660722', '0928460606', 'ፒያሳ', 'ቅርንጫፍ', '0960460606', 'mardashope', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', 'zemencallcenter', 'zemenexpressadm

In [1]:
# Example tokenized samples (replace this with your actual data)
samples = [
    [
        "One", "Step", "Hair", "Dryer", "Styler",
        "ከርል", "ለመስራት", "ለማለስለስ", "እንዲሁም", "ለማድረቅ", "የሚያገለግል",
    ],
    [
        "ማስፈንጠሪያውን", "ተጭነው", "አሁኑኑ", "ይመዝገቡ", "፤",
        "10", "ቅናሽ", "ያግኙ", "!", "!",
    ],
    # Add more samples as needed
]

# Show each sample under a 1-based header so it can be copied for annotation
for number, sample_tokens in enumerate(samples, start=1):
    print(f"🟦 Sample {number}:")
    print("Tokens:", sample_tokens)
    print()  # Blank separator between samples


🟦 Sample 1:
Tokens: ['One', 'Step', 'Hair', 'Dryer', 'Styler', 'ከርል', 'ለመስራት', 'ለማለስለስ', 'እንዲሁም', 'ለማድረቅ', 'የሚያገለግል']

🟦 Sample 2:
Tokens: ['ማስፈንጠሪያውን', 'ተጭነው', 'አሁኑኑ', 'ይመዝገቡ', '፤', '10', 'ቅናሽ', 'ያግኙ', '!', '!']



In [2]:
sample = """🎯 Three-layer Baby Milk Powder Container 
💯 High Quality 

👍 Three Layer No-Spill Baby Feeding Milk Powder Food Dispenser. A perfect storage for travel or home use.

👍እናት ልጇን ይዛ የተለያየ ቦታ ስትንቀሳቀስ
የዱቄት ወተት የመሳሰሉትን አስፈላጊ የልጆች ምግብ ይዞ ለመንቀሳቀስ የሚረዳ 3 ፓርቲሽን ያለው አሪፍ ኮንቴነር

ዋጋ፦ 500ብር"""

# Plain whitespace tokenization: anything not separated by a space or newline
# stays glued together (e.g. an emoji directly followed by a word).
whitespace_tokens = sample.split()
print(whitespace_tokens)


['🎯', 'Three-layer', 'Baby', 'Milk', 'Powder', 'Container', '💯', 'High', 'Quality', '👍', 'Three', 'Layer', 'No-Spill', 'Baby', 'Feeding', 'Milk', 'Powder', 'Food', 'Dispenser.', 'A', 'perfect', 'storage', 'for', 'travel', 'or', 'home', 'use.', '👍እናት', 'ልጇን', 'ይዛ', 'የተለያየ', 'ቦታ', 'ስትንቀሳቀስ', 'የዱቄት', 'ወተት', 'የመሳሰሉትን', 'አስፈላጊ', 'የልጆች', 'ምግብ', 'ይዞ', 'ለመንቀሳቀስ', 'የሚረዳ', '3', 'ፓርቲሽን', 'ያለው', 'አሪፍ', 'ኮንቴነር', 'ዋጋ፦', '500ብር']


In [3]:
import re

ads_text = """
📌 Only baby 3in1 double bottle milk warmer,sterilizer,food steamer

⚡️ለሕፃን ወተት ማሞቂያ
⚠️ በተጨማሪ ምግብ ለመቀቀል የሚሆን 

ዋጋ፦  💲🏷 3000  ብር

♦️ውስን ፍሬ ነው ያለው🔥🔥🔥

🏢 አድራሻ👉

📍♦️#መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ. S05/S06

     💧💧💧💧

    📲 0902660722
    📲 0928460606 

🔖
💬በTelegram ለማዘዝ ⤵️ ይጠቀሙ🔽

@zemencallcenter 
@zemenexpressadmin

ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵️
https://telegram.me/zemenexpress

💥💥...................................💥💥

📌 Mini Pocket UV Umbrella

👍 የቀለም አማራጭ አላቸው

☔በጣም ቀላል፣ ለመያዝ ምቹ ጥላ ☔️
☔በትንሽ የእጅ ቦርሳ ወይም በኪስ መያዝ የሚችል☔

#Specifications: 
👍Compact & Light-Weight
👍Unique Design
👍UV protection

 ዋጋ፦  💵🏷  1000ብር

♦️ውስን ፍሬ ነው ያለው 🔥🔥🔥

🏢 አድራሻ👉

📍♦️#መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ. S05/S06

     💧💧💧💧

    📲 0902660722
    📲 0928460606 

🔖
💬በTelegram ለማዘዝ ⤵️ ይጠቀሙ🔽

@zemencallcenter 
@zemenexpressadmin

ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵️
https://telegram.me/zemenexpress
"""

# Collect the text from each 📌 marker to the end of its line (likely product titles).
# `.` does not match a newline, so a greedy `.*` stops at the line end by itself.
# Requiring a following "\n" would miss a title on a final line that has no
# trailing newline.
product_clues = re.findall(r"📌.*", ads_text)

print("Number of product samples:", len(product_clues))
print("Samples found:")
for clue in product_clues:
    print("-", clue)


Number of product samples: 2
Samples found:
- 📌 Only baby 3in1 double bottle milk warmer,sterilizer,food steamer
- 📌 Mini Pocket UV Umbrella
