In [17]:
import pandas as pd

channels_df = pd.read_excel(r"channels_to_crawl.xlsx")
channel_names = channels_df.iloc[:, 0].tolist()

print(f"Total channels available: {len(channels_df)}\n")

# List every channel (first column of the sheet) with a 1-based position
# so that channels can be picked manually
for position, handle in enumerate(channel_names, start=1):
    print(f"{position}. {handle}")

Total channels available: 22

1. @ZemenExpress
2. @nevacomputer
3. @meneshayeofficial
4. @ethio_brand_collection
5. @Leyueqa
6. @sinayelj
7. @Shewabrand
8. @helloomarketethiopia
9. @modernshoppingcenter
10. @qnashcom
11. @Fashiontera
12. @kuruwear
13. @gebeyaadama
14. @MerttEka
15. @forfreemarket
16. @classybrands
17. @marakibrand
18. @aradabrand2
19. @marakisat2
20. @belaclassic
21. @AwasMart
22. @qnashcom


In [1]:
from telethon import TelegramClient
import csv
import os
from dotenv import load_dotenv

# Read the Telegram credentials from the local .env file (done once per kernel)
load_dotenv('.env')
api_id, api_hash, phone = (
    os.getenv(variable) for variable in ('TG_API_ID', 'TG_API_HASH', 'phone')
)

# Function to scrape data from a single channel
async def scrape_channel(client, channel_username, writer, media_dir, limit=10000):
    """Write one CSV row per message of a channel, downloading attached photos.

    Args:
        client: Client object providing ``get_entity``, ``iter_messages`` and
            ``download_media`` (a Telethon ``TelegramClient`` in this notebook).
        channel_username: Channel handle such as ``'@nevacomputer'``; it is also
            used as the prefix of every downloaded photo's filename.
        writer: Object with a ``writerow`` method (``csv.writer`` in this
            notebook); it receives exactly one row per message.
        media_dir: Directory in which photos are stored. It must already exist.
        limit: Maximum number of messages to request from the channel.
            Defaults to 10000, the value that used to be hard-coded.

    Each row is ``[channel title, channel username, message id, message text,
    message date, media path]``. The media path is ``None`` when the message
    has no photo or when the download did not produce a file.
    """
    entity = await client.get_entity(channel_username)
    channel_title = entity.title  # Human-readable title stored next to the handle

    async for message in client.iter_messages(entity, limit=limit):
        media_path = None
        if message.media and hasattr(message.media, 'photo'):
            # One file per message: <channel>_<message id>.jpg
            target_path = os.path.join(media_dir, f"{channel_username}_{message.id}.jpg")
            # Trust the downloader's return value rather than the requested
            # path: it is None when nothing was saved, so the CSV never
            # references a file that does not exist.
            media_path = await client.download_media(message.media, target_path)

        # Write the channel title along with other data
        writer.writerow([
            channel_title,
            channel_username,
            message.id,
            message.message,
            message.date,
            media_path,
        ])

# Build a single shared client; the later cells reuse this same object
client = TelegramClient(
    session='scraping_session',
    api_id=api_id,
    api_hash=api_hash,
)

async def main(channels=None):
    """Scrape a list of Telegram channels into a single ``telegram_data.csv``.

    Args:
        channels: Iterable of channel handles (e.g. ``'@nevacomputer'``) to
            scrape, in order. When omitted, the notebook's original five
            channels are used, so calling ``main()`` behaves as before.

    Side effects:
        Starts the module-level ``client``, creates the ``photos`` directory if
        needed, overwrites ``telegram_data.csv`` in the working directory, and
        prints one progress line per channel.
    """
    if channels is None:
        # Default selection: the channels this notebook was originally run on
        channels = [
            '@nevacomputer',
            '@meneshayeofficial',
            '@ethio_brand_collection',
            '@Leyueqa',
            '@sinayelj',
        ]

    await client.start()

    # Create a directory for media files
    media_dir = 'photos'
    os.makedirs(media_dir, exist_ok=True)

    # Open the CSV file and prepare the writer; all channels share this one file
    with open('telegram_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header order must match the rows written by scrape_channel
        writer.writerow(['Channel Title', 'Channel Username', 'ID', 'Message', 'Date', 'Media Path'])

        # Iterate over channels and scrape data into the single CSV file
        for channel in channels:
            await scrape_channel(client, channel, writer, media_dir)
            print(f"Scraped data from {channel}")


In [None]:
import nest_asyncio
import asyncio

# Patch asyncio before awaiting anything in this cell.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop — confirm against the nest_asyncio documentation.
nest_asyncio.apply()  

async def wrapper():
    """Run ``main()`` inside the shared client's async context manager.

    NOTE(review): ``main()`` also awaits ``client.start()`` itself; the context
    manager presumably starts and later disconnects the client as well —
    confirm against the Telethon documentation.
    """
    async with client:
        await main()

# Top-level await: relies on the notebook kernel allowing `await` at cell level
await wrapper()


In [10]:
import pandas as pd

# Load the scraped messages. The path uses forward slashes so it resolves on
# Windows, macOS and Linux alike (the previous backslash form only worked on
# Windows) and matches the "../src/..." paths used by the later cells.
# NOTE(review): main() writes telegram_data.csv into the working directory,
# while this cell reads it from the parent directory — confirm the location.
df = pd.read_csv("../telegram_data.csv", encoding="utf-8", engine="python")
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,NEVA COMPUTER®,@nevacomputer,8775,,2025-06-11 13:56:52+00:00,photos\@nevacomputer_8775.jpg
1,NEVA COMPUTER®,@nevacomputer,8774,LENOVO X1 YOGA\nProcessor: 11th‑Gen Intel Core...,2025-06-11 13:56:52+00:00,photos\@nevacomputer_8774.jpg
2,NEVA COMPUTER®,@nevacomputer,8773,,2025-06-11 13:56:52+00:00,photos\@nevacomputer_8773.jpg
3,NEVA COMPUTER®,@nevacomputer,8772,🔥 Acer Nitro 5 – Power Meets Performance\n\n💻 ...,2025-06-09 21:37:09+00:00,photos\@nevacomputer_8772.jpg
4,NEVA COMPUTER®,@nevacomputer,8771,,2025-06-09 21:37:09+00:00,photos\@nevacomputer_8771.jpg


In [11]:
import re
import unicodedata
from nltk.tokenize import word_tokenize


# Normalize Amharic Text
def normalize_amharic(text):
    """Reduce a message to space-separated Ethiopic (Ge'ez script) words.

    Args:
        text: Message text, or a missing value (``None`` / ``NaN``).

    Returns:
        The cleaned string; ``""`` for missing input. Only characters from the
        Ethiopic block remain, separated by single spaces.
    """
    if pd.isnull(text):
        return ""

    # Ethiopic punctuation (U+1360-U+1368, e.g. the word space and full stop)
    # lies inside the Ethiopic block kept below, so it was never removed and
    # stayed glued to the neighbouring word. Treat it as a word separator.
    text = re.sub(r'[\u1360-\u1368]+', ' ', text)

    # Remove emojis, English text, digits and non-Ethiopic punctuation
    text = re.sub(r'[^\u1200-\u137F\s]+', '', text)

    # Remove diacritics (combining marks, Unicode category Mn)
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

    # Normalize whitespace last: stripping a free-standing combining mark can
    # leave two adjacent spaces, which an earlier collapse would miss
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Simple Tokenizer (by whitespace)
def tokenize_amharic(text):
    """Split already-cleaned text into a list of whitespace-delimited tokens."""
    tokens = text.split()
    return tokens

# Derive the cleaned text first, then tokenize that cleaned text
cleaned_messages = df['Message'].apply(normalize_amharic)
df['cleaned_text'] = cleaned_messages
df['tokens'] = cleaned_messages.apply(tokenize_amharic)

# Persist the frame, including the two derived columns, without the index
df.to_csv("../src/messages_cleaned.csv", index=False)

print("✅ Cleaned and tokenized Amharic text saved.")


✅ Cleaned and tokenized Amharic text saved.


In [14]:
import os

# Reload the messages saved by the cleaning step (the CSV still carries the
# original 'Message' column, which is what gets re-normalized below)
cleaned_csv_path = "../src/messages_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

# ========== NORMALIZATION FUNCTION ==========
def normalize_amharic(text):
    """Reduce a message to space-separated Ethiopic words and numbers.

    Args:
        text: Message text, or a missing value (``None`` / ``NaN``).

    Returns:
        The cleaned string; ``""`` for missing input. Ethiopic letters,
        Ethiopic numerals and ASCII digits are kept; everything else is
        removed or treated as a separator.
    """
    if pd.isnull(text):
        return ""

    # Ethiopic punctuation (U+1360-U+1368, e.g. the word space and full stop)
    # lies inside the Ethiopic block kept below, so it was never removed and
    # stayed glued to the neighbouring word. Treat it as a word separator.
    text = re.sub(r'[\u1360-\u1368]+', ' ', text)

    # Remove emojis, foreign characters, and punctuation (retain only Ge'ez
    # script and numbers). The Ethiopic numerals U+1369-U+137C are already
    # covered by U+1200-U+137F, so no separate range is needed.
    text = re.sub(r'[^\u1200-\u137F0-9\s]', '', text)

    # Remove diacritics (combining marks, Unicode category Mn)
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# ========== TOKENIZATION FUNCTION ==========
def tokenize_amharic(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    pieces = text.split()
    return pieces

# ========== CLEANING AND STRUCTURING ==========
normalized_messages = df['Message'].apply(normalize_amharic)
df['cleaned_text'] = normalized_messages
df['tokens'] = normalized_messages.apply(tokenize_amharic)

# Keep only the columns needed downstream (names are left unchanged)
export_columns = ['Channel Username', 'Date', 'Media Path', 'cleaned_text', 'tokens']
final_df = df.loc[:, export_columns].copy()

# Write the structured subset to disk without the index column
output_path = "../src/structured_telegram_data.csv"
final_df.to_csv(output_path, index=False)
print(f"✅ Structured data saved to: {output_path}")


✅ Structured data saved to: ../src/structured_telegram_data.csv
