In [24]:
# 📦 Imports
import re
import pandas as pd
import emoji
import os
import json
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# 🧼 Cleaning function
def clean_amharic_text(text):
    """Normalize one raw message into plain, single-spaced text.

    Steps, applied in this order:
      1. remove URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace);
      2. remove emoji via ``emoji.replace_emoji``;
      3. turn runs of three or more dots into a single space;
      4. drop every character that is not a word character, whitespace,
         in the Ethiopic block U+1200-U+137F, or one of ``. , ! ?``;
      5. collapse all whitespace runs to single spaces.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    without_emoji = emoji.replace_emoji(without_urls, replace='')
    # Dot runs become a space (not nothing) so the words on either side of a
    # decorative separator do not get glued together.
    without_dot_runs = re.sub(r'\.{3,}', ' ', without_emoji)
    whitelisted = re.sub(r'[^\w\s\u1200-\u137F.,!?]', '', without_dot_runs)

    # split() with no argument discards leading/trailing whitespace too, so
    # the joined result needs no further stripping.
    return ' '.join(whitelisted.split())

# 📂 Load data from .json or .csv
def load_data(input_path):
    """Read a ``.json`` or ``.csv`` file into a DataFrame with a ``text`` column.

    The format is chosen from the file extension, compared case-insensitively.
    JSON files are read as UTF-8 and the parsed content is handed to
    ``pd.DataFrame``; CSV files go through ``pd.read_csv`` with its defaults.

    Args:
        input_path: Path of the file to load.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is neither ``.json`` nor ``.csv``.
        KeyError: If the loaded data has no ``text`` column.
    """
    _, suffix = os.path.splitext(input_path)
    suffix = suffix.lower()

    if suffix == ".csv":
        frame = pd.read_csv(input_path)
    elif suffix == ".json":
        with open(input_path, "r", encoding="utf-8") as handle:
            frame = pd.DataFrame(json.load(handle))
    else:
        raise ValueError(f"Unsupported file format: {suffix}")

    # Everything downstream keys off the "text" column, so fail early here.
    if "text" in frame.columns:
        return frame
    raise KeyError("'text' column not found in input file.")

# 🧹 Clean + tokenize
def preprocess_dataframe(df):
    """Return a cleaned, tokenized copy of *df* without empty messages.

    Adds two columns to the result:
      * ``cleaned_text`` - ``clean_amharic_text`` applied to ``text``;
      * ``tokens`` - ``word_tokenize`` applied to ``cleaned_text``.

    Rows whose cleaned text is empty are dropped. The surviving rows keep
    their original index labels.

    The input frame is left untouched: the work happens on a copy, so the
    caller's raw data does not silently gain columns.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        A new, independent DataFrame (not a view of *df*), safe to modify
        further without pandas chained-assignment warnings.

    Raises:
        KeyError: If *df* has no ``text`` column.
    """
    result = df.copy()
    result["cleaned_text"] = result["text"].apply(clean_amharic_text)

    # clean_amharic_text always returns an already-stripped str, so a plain
    # comparison is enough. Filtering before tokenizing avoids running the
    # tokenizer on rows that are about to be discarded.
    result = result[result["cleaned_text"] != ""].copy()

    # NOTE(review): word_tokenize is called with its default language
    # settings - confirm that is the intended behaviour for Amharic text.
    result["tokens"] = result["cleaned_text"].apply(word_tokenize)
    return result

# 💾 Save the processed dataframe
def save_processed(df, input_path):
    """Write *df* as a UTF-8 CSV in the ``processed`` twin of the input folder.

    The output directory is the directory of *input_path* with its last
    path component named exactly ``raw`` replaced by ``processed``. Only a
    whole component is replaced, so folders that merely contain the letters
    "raw" (``drawings``, ``raw_backup``) are left alone. If no component is
    named ``raw`` the file is written next to the input.

    The output file is named ``<input stem>_cleaned.csv``; the directory is
    created when missing and the final path is printed.

    Args:
        df: DataFrame to save (written without the index).
        input_path: Path of the raw input file the frame was derived from.
    """
    input_dir = os.path.dirname(input_path)

    # Split on the platform's separators but keep them (capturing group), so
    # the rebuilt directory uses exactly the separators the caller supplied.
    separators = os.sep + (os.altsep or "")
    parts = re.split(f"([{re.escape(separators)}])", input_dir)
    for idx in reversed(range(len(parts))):
        if parts[idx] == "raw":
            parts[idx] = "processed"
            break
    base_dir = "".join(parts)

    # A bare file name has no directory part; os.makedirs("") would raise.
    if base_dir:
        os.makedirs(base_dir, exist_ok=True)

    file_name = os.path.splitext(os.path.basename(input_path))[0] + "_cleaned.csv"
    output_path = os.path.join(base_dir, file_name)

    df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"✅ Processed file saved at: {output_path}")

# 🛠️ Input path
# Raw export to process. load_data picks the parser from the extension and
# accepts either a .json or a .csv file.
input_path = "../data/raw/telegram_messages_20250621_052911.json"  # or .csv

# 🚀 Run pipeline
# load -> clean + tokenize -> write "<input name>_cleaned.csv".
df_raw = load_data(input_path)
# df_processed holds only rows whose cleaned text is non-empty, with the
# extra "cleaned_text" and "tokens" columns.
df_processed = preprocess_dataframe(df_raw)
save_processed(df_processed, input_path)

# 🔍 Preview
# Last expression of the cell: shows raw text beside its cleaned and
# tokenized forms for the first three rows.
df_processed[["text", "cleaned_text", "tokens"]].head(3)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Processed file saved at: ../data/processed\telegram_messages_20250621_052911_cleaned.csv


Unnamed: 0,text,cleaned_text,tokens
0,💥💥...................................💥💥\n\n📌Im...,Imitation Volcano Humidifier with LED Light በኤ...,"[Imitation, Volcano, Humidifier, with, LED, Li..."
1,💥💥...................................💥💥\n\n📌 B...,Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...,"[Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ..."
2,💥💥...................................💥💥\n\n📌Sm...,Smart Usb Ultrasonic Car And Home Air Humidifi...,"[Smart, Usb, Ultrasonic, Car, And, Home, Air, ..."


In [25]:
from itertools import chain

# --- Row-level check -------------------------------------------------------
# Flag rows whose cleaned text still contains a run of five or more
# decorative characters (💥 or dots).
decor_mask = df_processed['cleaned_text'].str.contains(r'[💥\.]{5,}', regex=True)
df_with_symbols = df_processed[decor_mask]

print(f"Rows with decorative symbols: {len(df_with_symbols)}")
if df_with_symbols.empty:
    print("✅ No decorative symbols found in cleaned_text.")
else:
    # Show the raw text of the offending rows.
    print(df_with_symbols[['text']].head(3))

# --- Token-level check -----------------------------------------------------
# Flatten every row's token list and keep tokens made up entirely of three or
# more decorative characters.
tokens_flat = list(chain.from_iterable(df_processed['tokens']))
suspicious_tokens = list(filter(re.compile(r'[💥\.]{3,}').fullmatch, tokens_flat))

if not suspicious_tokens:
    print("✅ No decorative symbols in tokens.")
else:
    print("⚠️ Found suspicious tokens:", suspicious_tokens[:5])


Rows with decorative symbols: 0
✅ No decorative symbols found in cleaned_text.
✅ No decorative symbols in tokens.
