In [1]:
# Cell 1 - Import necessary libraries
import os
from telethon import TelegramClient, events, sync
from datetime import datetime
import pandas as pd
import logging
from dotenv import load_dotenv

# Configure logging: every record at INFO level or above goes to both a log
# file and a stream handler, using one shared line format.
_log_handlers = [
    logging.FileHandler('telegram_scraper.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load environment variables (the Telegram credentials are read from the
# environment with os.getenv in the client setup cell)
load_dotenv()

True

In [2]:
# Cell 2 - Setup Telegram Client
# Get your API credentials from https://my.telegram.org/
API_ID = os.getenv('TELEGRAM_API_ID')
API_HASH = os.getenv('TELEGRAM_API_HASH')

# Fail fast and name the missing variable(s) instead of passing None / ''
# on to the client constructor.
missing_settings = [
    name
    for name, value in (('TELEGRAM_API_ID', API_ID), ('TELEGRAM_API_HASH', API_HASH))
    if not value
]
if missing_settings:
    raise ValueError(
        "Missing Telegram credentials: set "
        + ", ".join(missing_settings)
        + " in the environment or in a .env file"
    )

# os.getenv always returns a string, but the API ID is a number.
# The offending value is deliberately not echoed: it is a credential.
try:
    API_ID = int(API_ID)
except ValueError:
    raise ValueError("TELEGRAM_API_ID must be an integer") from None

# Define target channels
CHANNELS = [
    'DoctorsET',
    'lobelia4cosmetics',
    'yetenaweg',
    'EAHCI',
    # Add more channels as needed
]

# Initialize the client
client = TelegramClient('session_name', API_ID, API_HASH)

In [3]:
# Cell 3 - Define scraping functions
async def scrape_channel(channel_username, limit=100):
    """
    Scrape messages from a specific channel.

    For every message that carries a photo, the photo is downloaded to
    ``images/<channel_username>_<message_id>.jpg``.

    Args:
        channel_username (str): Username of the channel
        limit (int): Maximum number of messages to scrape

    Returns:
        list: One dictionary per message with the keys ``channel``,
        ``date``, ``text``, ``has_image``, ``message_id`` and
        ``image_path``. ``image_path`` is the value returned by the
        download call, or ``None`` when the message has no photo.

        Errors are logged, never raised. If scraping fails part-way
        through, the messages collected before the failure are returned
        (an empty list if the failure happened before the first message).
    """
    # Created before the try block so that messages already collected (whose
    # images are already on disk) survive a failure later in the iteration.
    messages = []

    try:
        channel = await client.get_entity(channel_username)

        async for message in client.iter_messages(channel, limit=limit):
            has_image = message.photo is not None

            # Every record carries an image_path key so that all rows share
            # one schema, whether or not any message has a photo.
            image_path = None
            if has_image:
                target = f'images/{channel_username}_{message.id}.jpg'
                # Record what the download call reports, not what was requested.
                image_path = await client.download_media(message.photo, target)

            messages.append({
                'channel': channel_username,
                'date': message.date,
                'text': message.text,
                'has_image': has_image,
                'message_id': message.id,
                'image_path': image_path,
            })

    except Exception as e:
        # logger.exception keeps the traceback, which is otherwise lost.
        logger.exception(
            f"Error scraping channel {channel_username}: {e} "
            f"(keeping {len(messages)} messages scraped before the failure)"
        )
        return messages

    logger.info(f"Successfully scraped {len(messages)} messages from {channel_username}")
    return messages

In [7]:
# Cell 4 - Alternative version for Jupyter notebooks
async def main():
    """
    Scrape every channel in CHANNELS and save the combined result.

    Creates the ``images`` directory if needed, calls ``scrape_channel``
    for each channel in turn, and writes all collected messages to
    ``telegram_data.csv``.

    Returns:
        pandas.DataFrame: One row per scraped message. The columns
        ``channel``, ``date``, ``text``, ``has_image``, ``message_id`` and
        ``image_path`` are always present, even when no message (or no
        photo) was scraped; any additional keys found in the records are
        kept after them.
    """
    expected_columns = ['channel', 'date', 'text', 'has_image', 'message_id', 'image_path']

    # Create images directory if it doesn't exist
    os.makedirs('images', exist_ok=True)

    # Scrape each channel
    all_messages = []
    for channel in CHANNELS:
        all_messages.extend(await scrape_channel(channel))

    if not all_messages:
        # scrape_channel logs its own errors and returns a list instead of
        # raising, so an empty result would otherwise pass unnoticed.
        logger.warning("No messages were scraped from any channel")

    # Convert to DataFrame. Without the reindex, an empty result has no
    # columns at all, and 'image_path' is missing whenever no record carries
    # it, so later column lookups such as df['channel'] raise KeyError.
    df = pd.DataFrame(all_messages)
    extra_columns = [name for name in df.columns if name not in expected_columns]
    df = df.reindex(columns=expected_columns + extra_columns)

    # Save to CSV
    df.to_csv('telegram_data.csv', index=False)
    logger.info(f"Saved {len(df)} messages to telegram_data.csv")

    return df

# For Jupyter notebooks, use this syntax: `async with` / `await` are written
# directly at the top level of the cell. Outside a notebook these statements
# would have to live inside a coroutine that is run on an event loop.
# The client is used as an async context manager around the whole scrape, and
# the result is bound to the notebook-level name `df`, which the analysis
# cell below reads.
async with client:
    df = await main()

2025-01-31 11:02:03,381 - telethon.network.mtprotosender - INFO - Connecting to 149.154.167.51:443/TcpFull...
2025-01-31 11:02:05,562 - telethon.network.mtprotosender - INFO - Connection to 149.154.167.51:443/TcpFull complete!
2025-01-31 11:02:48,260 - telethon.client.users - INFO - Phone migrated to 4
2025-01-31 11:02:48,439 - telethon.client.telegrambaseclient - INFO - Reconnecting to new data center 4
2025-01-31 11:02:48,674 - telethon.network.mtprotosender - INFO - Disconnecting from 149.154.167.51:443/TcpFull...
2025-01-31 11:02:48,680 - telethon.network.mtprotosender - INFO - Disconnection from 149.154.167.51:443/TcpFull complete!
2025-01-31 11:02:48,685 - telethon.network.mtprotosender - INFO - Connecting to 149.154.167.92:443/TcpFull...
2025-01-31 11:02:50,714 - telethon.network.mtprotosender - INFO - Connection to 149.154.167.92:443/TcpFull complete!


Signed in successfully as Selam; remember to not break the ToS or you will risk an account ban!


2025-01-31 11:03:25,567 - telethon.client.downloads - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-01-31 11:03:27,359 - telethon.client.downloads - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-01-31 11:03:29,044 - telethon.client.downloads - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-01-31 11:03:31,008 - telethon.client.downloads - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-01-31 11:03:35,508 - telethon.client.downloads - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-01-31 11:03:37,416 - telethon.client.downloads - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-01-31 11:03:39,449 - telethon.client.downloads - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2025-01-31 11:03:42,844 - telethon.client.downloads - INFO - Starting direct file d

In [9]:
# Cell 6 - Data Analysis and Preview
# Headline numbers: overall size, rows per channel, rows flagged as having an image
total_messages = len(df)
print("Total messages scraped:", total_messages)

print("\nMessages per channel:")
per_channel_counts = df['channel'].value_counts()
print(per_channel_counts)

image_message_count = df['has_image'].sum()
print("\nMessages with images:", image_message_count)

# Preview the first two rows of every configured channel
print("\nSample messages from each channel:")
for channel in CHANNELS:
    print(f"\n=== Messages from {channel} ===")
    belongs_to_channel = df['channel'] == channel
    channel_sample = df[belongs_to_channel].head(2)
    display(channel_sample)

# Optional: the same two-rows-per-channel preview, restricted to a few
# columns and shown as a single table with a fresh index
selected_columns = ['channel', 'date', 'text', 'has_image']
print("\nCompact view of messages from all channels:")
compact_preview = (
    df[selected_columns]
    .groupby('channel')
    .head(2)
    .reset_index(drop=True)
)
display(compact_preview)

Total messages scraped: 400

Messages per channel:
channel
DoctorsET            100
lobelia4cosmetics    100
yetenaweg            100
EAHCI                100
Name: count, dtype: int64

Messages with images: 350

Sample messages from each channel:

=== Messages from DoctorsET ===


Unnamed: 0,channel,date,text,has_image,message_id,image_path
0,DoctorsET,2023-12-18 17:04:02+00:00,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,True,864,images/DoctorsET_864.jpg
1,DoctorsET,2023-11-03 16:14:39+00:00,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,True,863,images/DoctorsET_863.jpg



=== Messages from lobelia4cosmetics ===


Unnamed: 0,channel,date,text,has_image,message_id,image_path
100,lobelia4cosmetics,2025-01-31 12:40:44+00:00,NEUROPRO CARE \nPrice 5500 birr \nTelegram @Lo...,True,15208,images/lobelia4cosmetics_15208.jpg
101,lobelia4cosmetics,2025-01-31 12:40:44+00:00,ENSURE 850GM\nPrice 3800 birr \nTelegram https...,True,15207,images/lobelia4cosmetics_15207.jpg



=== Messages from yetenaweg ===


Unnamed: 0,channel,date,text,has_image,message_id,image_path
200,yetenaweg,2025-01-22 09:50:47+00:00,,True,1247,images/yetenaweg_1247.jpg
201,yetenaweg,2025-01-22 09:50:46+00:00,,True,1246,images/yetenaweg_1246.jpg



=== Messages from EAHCI ===


Unnamed: 0,channel,date,text,has_image,message_id,image_path
300,EAHCI,2025-01-31 13:45:58+00:00,#**Adult_ECHO_Training****@****#ADISS_ABABA_By...,True,2607,images/EAHCI_2607.jpg
301,EAHCI,2025-01-31 12:31:56+00:00,#OBGYN_Ultrasound_Training@HOSSANA!\n#የካቲት 2 ይ...,True,2606,images/EAHCI_2606.jpg



Compact view of messages from all channels:


Unnamed: 0,channel,date,text,has_image
0,DoctorsET,2023-12-18 17:04:02+00:00,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,True
1,DoctorsET,2023-11-03 16:14:39+00:00,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,True
2,lobelia4cosmetics,2025-01-31 12:40:44+00:00,NEUROPRO CARE \nPrice 5500 birr \nTelegram @Lo...,True
3,lobelia4cosmetics,2025-01-31 12:40:44+00:00,ENSURE 850GM\nPrice 3800 birr \nTelegram https...,True
4,yetenaweg,2025-01-22 09:50:47+00:00,,True
5,yetenaweg,2025-01-22 09:50:46+00:00,,True
6,EAHCI,2025-01-31 13:45:58+00:00,#**Adult_ECHO_Training****@****#ADISS_ABABA_By...,True
7,EAHCI,2025-01-31 12:31:56+00:00,#OBGYN_Ultrasound_Training@HOSSANA!\n#የካቲት 2 ይ...,True
