In [10]:
import pandas as pd

# Load the scraped Telegram data
df = pd.read_csv("../telegram_data.csv",encoding='utf-8')

# Show structure
print("Total rows:", len(df))
df.head(5)


Total rows: 2500


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,Zemen Express®,@ZemenExpress,6982,💥💥...................................💥💥\n\n📌Im...,2025-06-18 06:01:10+00:00,
1,Zemen Express®,@ZemenExpress,6981,💥💥...................................💥💥\n\n📌 B...,2025-06-16 12:21:00+00:00,
2,Zemen Express®,@ZemenExpress,6980,,2025-06-16 05:11:57+00:00,photos\@ZemenExpress_6980.jpg
3,Zemen Express®,@ZemenExpress,6979,,2025-06-16 05:11:57+00:00,photos\@ZemenExpress_6979.jpg
4,Zemen Express®,@ZemenExpress,6978,,2025-06-16 05:11:57+00:00,photos\@ZemenExpress_6978.jpg


In [2]:
import re

# Drop rows with empty messages
df = df.dropna(subset=['Message'])

# Remove duplicates (based on message text)
df = df.drop_duplicates(subset=['Message'])

# Function to clean Amharic messages
def clean_message(text):
    if not isinstance(text, str):
        return ""
    # Remove emojis and special characters (optional)
    text = re.sub(r'[^\w\s፡።፣፤፥፦፧፨ብርመዝናብዋትናን]', '', text)
    
    # Normalize multiple spaces/newlines
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

# Apply cleaning function
df['Cleaned_Message'] = df['Message'].apply(clean_message)

# Show sample
df[['Message', 'Cleaned_Message']].head(5)


Unnamed: 0,Message,Cleaned_Message
0,💥💥...................................💥💥\n\n📌Im...,Imitation Volcano Humidifier with LED Light በኤ...
1,💥💥...................................💥💥\n\n📌 B...,Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...
9,💥💥...................................💥💥\n\n📌Sm...,Smart Usb Ultrasonic Car And Home Air Humidifi...
12,💥💥...................................💥💥\n\n📌Ba...,Baby Head Helmet Cotton Walk Safety Hat Breath...
14,💥💥...............🌞.................💥💥\n\n❓ በረፍ...,በረፍት ቀንዎ ሱቅ ላይ መስተናገድ ለምትፈልጉ ውድ ደንበኞቻችን ነገ ከጠዋ...


In [3]:
# Tokenize the cleaned Amharic message
def tokenize_amharic(text):
    # Basic whitespace tokenizer
    return text.split()

# Apply tokenization
df['Tokens'] = df['Cleaned_Message'].apply(tokenize_amharic)

# Preview tokens
df[['Cleaned_Message', 'Tokens']].head(5)


Unnamed: 0,Cleaned_Message,Tokens
0,Imitation Volcano Humidifier with LED Light በኤ...,"[Imitation, Volcano, Humidifier, with, LED, Li..."
1,Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...,"[Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ..."
9,Smart Usb Ultrasonic Car And Home Air Humidifi...,"[Smart, Usb, Ultrasonic, Car, And, Home, Air, ..."
12,Baby Head Helmet Cotton Walk Safety Hat Breath...,"[Baby, Head, Helmet, Cotton, Walk, Safety, Hat..."
14,በረፍት ቀንዎ ሱቅ ላይ መስተናገድ ለምትፈልጉ ውድ ደንበኞቻችን ነገ ከጠዋ...,"[በረፍት, ቀንዎ, ሱቅ, ላይ, መስተናገድ, ለምትፈልጉ, ውድ, ደንበኞቻች..."


In [4]:
# Select and reorder relevant columns
structured_df = df[[
    'Channel Title',
    'Channel Username',
    'Date',
    'Cleaned_Message',
    'Tokens',
    'Media Path'
]]

# Preview structure
structured_df.head()


Unnamed: 0,Channel Title,Channel Username,Date,Cleaned_Message,Tokens,Media Path
0,Zemen Express®,@ZemenExpress,2025-06-18 06:01:10+00:00,Imitation Volcano Humidifier with LED Light በኤ...,"[Imitation, Volcano, Humidifier, with, LED, Li...",
1,Zemen Express®,@ZemenExpress,2025-06-16 12:21:00+00:00,Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...,"[Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ...",
9,Zemen Express®,@ZemenExpress,2025-06-16 05:11:57+00:00,Smart Usb Ultrasonic Car And Home Air Humidifi...,"[Smart, Usb, Ultrasonic, Car, And, Home, Air, ...",photos\@ZemenExpress_6973.jpg
12,Zemen Express®,@ZemenExpress,2025-06-16 05:09:03+00:00,Baby Head Helmet Cotton Walk Safety Hat Breath...,"[Baby, Head, Helmet, Cotton, Walk, Safety, Hat...",photos\@ZemenExpress_6970.jpg
14,Zemen Express®,@ZemenExpress,2025-06-14 14:40:03+00:00,በረፍት ቀንዎ ሱቅ ላይ መስተናገድ ለምትፈልጉ ውድ ደንበኞቻችን ነገ ከጠዋ...,"[በረፍት, ቀንዎ, ሱቅ, ላይ, መስተናገድ, ለምትፈልጉ, ውድ, ደንበኞቻች...",photos\@ZemenExpress_6968.jpg


In [6]:
import os

# Create the clean data directory if it doesn't exist
os.makedirs("../data/clean", exist_ok=True)


In [7]:
# Export to raw CoNLL-style file for manual tagging

with open("../data/clean/unlabeled_conll.txt", "w", encoding='utf-8') as f:
    for tokens in df['Tokens']:
        for token in tokens:
            f.write(f"{token} O\n")  # Default tag: O
        f.write("\n")  # Blank line between sentences/messages


In [8]:
import os

# Step 1a: Create folder (if missing)
os.makedirs("../data/clean", exist_ok=True)

# Step 1b: Export tokens with default 'O' tags for manual labeling
with open("../data/clean/unlabeled_conll.txt", "w", encoding='utf-8') as f:
    for tokens in df['Tokens']:
        for token in tokens:
            f.write(f"{token} O\n")  # tag O = no entity
        f.write("\n")  # blank line separates messages


In [12]:
import pandas as pd
import re


def clean_text(text):
    if not isinstance(text, str):
        return ''
    # Keep Amharic Unicode (\u1200-\u137F), Latin letters, numbers, and spaces only
    text = re.sub(r'[^\w\s\u1200-\u137F]', '', text)
    text = re.sub(r'\s+', ' ', text)  # collapse multiple spaces
    return text.strip()

def tokenize(text):
    return text.split()

# Apply cleaning and tokenizing
df['Cleaned_Message'] = df['Message'].apply(clean_text)
df['Tokens'] = df['Cleaned_Message'].apply(tokenize)

# Check result
print(df[['Message', 'Cleaned_Message', 'Tokens']].head())

# Save the processed data for next steps
df.to_csv('../data/clean/processed_telegram_data.csv', index=False, encoding='utf-8')



                                             Message  \
0  💥💥...................................💥💥\n\n📌Im...   
1  💥💥...................................💥💥\n\n📌 B...   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                     Cleaned_Message  \
0  Imitation Volcano Humidifier with LED Light በኤ...   
1  Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...   
2                                                      
3                                                      
4                                                      

                                              Tokens  
0  [Imitation, Volcano, Humidifier, with, LED, Li...  
1  [Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ...  
2                                                 []  
3                                                 []  
4                                                 [