In [None]:
# scripts/fetch_telegram_data.py

%pip install telethon python-dotenv

from telethon import TelegramClient
from dotenv import load_dotenv
import os

# Load API_ID / API_HASH / PHONE from a local .env file into the process
# environment (no credentials are hardcoded in the notebook).
load_dotenv()

# Fail fast with a readable message. Without this check a missing API_ID
# surfaces as "int() argument must be a string ... not 'NoneType'", and a
# missing API_HASH / PHONE only fails later, when the client is built/started.
_missing_env = [name for name in ("API_ID", "API_HASH", "PHONE") if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in a .env file next to this notebook."
    )

api_id = int(os.getenv("API_ID"))
api_hash = os.getenv("API_HASH")
phone = os.getenv("PHONE")

# "amharic_telegram" is the session name handed to Telethon.
client = TelegramClient("amharic_telegram", api_id, api_hash)

async def fetch_messages(channel, limit=1000):
    """Collect up to ``limit`` messages from ``channel`` as plain dictionaries.

    Starts the module-level ``client`` with ``phone``, resolves ``channel``
    through ``client.get_entity`` and walks ``client.iter_messages``.

    Args:
        channel: Value passed straight to ``client.get_entity``.
        limit: Forwarded to ``client.iter_messages`` as ``limit``.

    Returns:
        A list with one dict per message, in iteration order, holding
        ``"id"``, ``"text"`` (``msg.message``), ``"date"``
        (``msg.date.isoformat()``) and ``"media"`` (``bool(msg.media)``).
    """
    await client.start(phone)
    target = await client.get_entity(channel)

    def _as_record(message):
        # Keep only JSON-serialisable fields of the message object.
        return {
            "id": message.id,
            "text": message.message,
            "date": message.date.isoformat(),
            "media": bool(message.media),
        }

    return [
        _as_record(message)
        async for message in client.iter_messages(target, limit=limit)
    ]


In [None]:
# Telegram channel usernames; each one is handed to fetch_messages() and,
# with the "@" stripped, reused as the raw dump's file name.
channels = [
    "@ZemenExpress",
    "@nevacomputer",
    "@meneshayeofficial",
    "@ethio_brand_collection",
    "@Leyueqa",
]

In [None]:
import asyncio

async def main():
    # Only start the client if not already authorized
    if not await client.is_user_authorized():
        await client.start(phone)

    async def fetch_and_save(channel):
        msgs = await fetch_messages(channel)
        filename = f"data/raw/{channel.strip('@')}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(msgs, f, ensure_ascii=False, indent=2)
        print(f"✔️ Done: {channel}")

    await asyncio.gather(*(fetch_and_save(ch) for ch in channels))

await main()

In [None]:
import re
import json
import os
from tqdm import tqdm

def normalize_amharic(text):
    """Trim ``text`` and squeeze every whitespace run into a single space.

    Args:
        text: Message text; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if text:
        trimmed = text.strip()
        return re.sub(r'\s+', ' ', trimmed)
    return ""

def preprocess_channel(filename):
    """Normalize one raw channel dump and write it under ``data/processed``.

    Reads the JSON list stored at ``filename``, runs each item's ``"text"``
    through ``normalize_amharic`` and keeps only items whose normalized text is
    non-empty. The result, a list of ``{"text", "date"}`` dicts, is written
    to ``data/processed/<basename of filename>``.

    Args:
        filename: Path to a raw JSON file holding a list of message dicts.

    Returns:
        None. The output is the file written to disk.

    Raises:
        FileNotFoundError: ``filename`` does not exist.
        json.JSONDecodeError: ``filename`` is not valid JSON.
        KeyError: a kept item has no ``"date"`` key.
    """
    with open(filename, encoding='utf-8') as f:
        data = json.load(f)

    preprocessed = []
    for item in data:
        norm = normalize_amharic(item.get("text"))
        # Filter on the normalized value so whitespace-only messages are
        # dropped too, not stored as records with empty text.
        if norm:
            preprocessed.append({
                "text": norm,
                "date": item["date"]
            })

    # Create the output folder on first use; open(..., "w") will not.
    out_dir = "data/processed"
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, os.path.basename(filename))
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(preprocessed, f, ensure_ascii=False, indent=2)

# Process every raw channel dump. Only *.json entries are handed over, so
# stray items in data/raw (.gitkeep, .DS_Store, an .ipynb_checkpoints folder)
# are not fed to json.load; sorted() makes the processing order deterministic.
for fname in sorted(os.listdir("data/raw")):
    if fname.endswith(".json"):
        preprocess_channel("data/raw/" + fname)

In [None]:
from datasets import load_dataset

# Read the labelled examples from labelled_data.json as the "train" split.
dataset = load_dataset(
    "json",
    data_files="labelled_data.json",
    split="train",
)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

model_name = "Davlan/bert-base-multilingual-cased-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): `label_list` is not assigned in any cell visible here; on a
# fresh kernel this cell raises NameError unless it is defined beforehand.
# Assumes label_list is an ordered sequence of tag strings whose positions are
# the integer label ids — TODO confirm.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    # Store this project's tag names in the model config rather than keeping
    # the checkpoint's own names or generic LABEL_<n> placeholders.
    id2label=id2label,
    label2id=label2id,
    # The checkpoint name suggests it already carries a fine-tuned NER head.
    # If that head's size differs from len(label_list), loading fails with a
    # size-mismatch error unless the classifier is allowed to be re-initialised.
    ignore_mismatched_sizes=True,
)

def tokenize_and_align_labels(examples, label_column=None):
    """Tokenize pre-split words and align word-level labels to sub-tokens.

    Intended for ``dataset.map(..., batched=True)``. The first sub-token of
    every word receives that word's label. Special tokens and continuation
    sub-tokens receive ``-100``, the conventional value for positions the loss
    should ignore.

    Args:
        examples: Batch mapping with a ``"tokens"`` column (list of word
            lists) and, optionally, a word-level label column.
        label_column: Name of the label column. When ``None`` the first of
            ``"ner_tags"``, ``"labels"``, ``"tags"`` present in ``examples``
            is used. NOTE(review): the schema of labelled_data.json is not
            visible here — confirm the column name.

    Returns:
        The tokenizer output. When a label column was found it also carries a
        ``"labels"`` entry with one id list per example, the same length as
        that example's token sequence. Without a label column the
        tokenization is returned unchanged.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    if label_column is None:
        label_column = next(
            (name for name in ("ner_tags", "labels", "tags") if name in examples),
            None,
        )
    if label_column is None:
        # Nothing to align; behave like a plain tokenization step.
        return tokenized_inputs

    aligned_labels = []
    for batch_index, word_labels in enumerate(examples[label_column]):
        previous_word = None
        label_ids = []
        for word_index in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_index is None or word_index == previous_word:
                # Special token, or a later piece of the word just labelled.
                label_ids.append(-100)
            else:
                label = word_labels[word_index]
                if isinstance(label, str):
                    # Tag stored as text: map it to its position in the
                    # notebook-level label_list (assumed ordering — confirm).
                    label = label_list.index(label)
                label_ids.append(label)
            previous_word = word_index
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

# evaluation_strategy="epoch" makes the Trainer evaluate after every epoch,
# which requires an eval_dataset. Hold out 10% of the labelled examples for
# that purpose; the fixed seed keeps the split reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./models",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

In [None]:
from seqeval.metrics import classification_report

# Both arguments are expected to be BIO-tagged sequences.
# NOTE(review): y_true and y_pred are not assigned in any cell shown here, so
# on a fresh kernel this cell raises NameError until they are computed.
report = classification_report(y_true, y_pred)
print(report)

In [None]:
from pymongo import MongoClient

# Named `mongo_client` so it does not rebind the Telethon `client` that
# fetch_messages() and main() read as a global; reusing the name would break
# those cells if they were re-run after this one.
mongo_client = MongoClient()
db = mongo_client.amharic_ner

# NOTE: re-running this cell inserts the same documents again.
for file in sorted(os.listdir("data/processed")):
    if not file.endswith(".json"):
        # Skip stray non-JSON entries instead of failing in json.load.
        continue
    # Explicit UTF-8: the files were written as UTF-8 with ensure_ascii=False,
    # and the platform default encoding may not be able to decode them.
    with open("data/processed/" + file, encoding="utf-8") as f:
        data = json.load(f)
    # insert_many rejects an empty document list, so skip empty files.
    if data:
        db.messages.insert_many(data)