# Task 1: Data Scraping and Collection
This notebook scrapes messages and images from public Telegram channels related to Ethiopian medical businesses, saving them in a partitioned directory structure for downstream processing.

**Instructions:**
- Run each cell in order.
- You will need your Telegram API ID and API Hash. Get them from https://my.telegram.org/apps.
- Download the resulting data from the Colab file browser when finished.


In [1]:
# Install dependencies
%pip install telethon nest_asyncio tqdm


Collecting telethon
  Downloading Telethon-1.40.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyaes (from telethon)
  Downloading pyaes-1.6.1.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Telethon-1.40.0-py3-none-any.whl (722 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m722.0/722.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyaes
  Building wheel for pyaes (setup.py) ... [?25l[?25hdone
  Created wheel for pyaes: filename=pyaes-1.6.1-py3-none-any.whl size=26347 sha256=e660f8b86007caebe8eaad4fbda7878607b4cf6e0fda26b3a798d21206462f9a
  Stored in directory: /root/.cache/pip/wheels/4e/52/33/010d0843550bffb6a591b11629070ae140c0ad4f53e68a3bd3
Successfully built pyaes
Installing collected packages: pyaes, telethon
Successfully installed pyaes-1.6.1 telethon-1.40.0


In [2]:
import nest_asyncio
nest_asyncio.apply()

from telethon.sync import TelegramClient
from telethon.tl.types import MessageMediaPhoto
import os
import json
from datetime import datetime
from tqdm import tqdm
import logging

# Set up logging (stream only)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')


## Enter your Telegram API credentials
You can get these from https://my.telegram.org/apps.


In [None]:
api_id = input('Enter your Telegram API ID: ')
api_hash = input('Enter your Telegram API Hash: ')
client = TelegramClient('session', api_id, api_hash)

## Define channels to scrape
Add or remove channel usernames as needed.


In [4]:
channels = [
    'lobelia4cosmetics',
    'tikvahpharma',
    # Add more channel usernames as needed
]
limit = 100  # Number of messages to scrape per channel


## Scrape messages and images
This will save messages and images in a partitioned directory structure under `data/raw/telegram_messages/YYYY-MM-DD/channel_name.json`. Images will be saved in the same folder.


In [5]:
async def scrape_channel(channel_username, limit=100):
    await client.start()
    logging.info(f'Scraping messages from {channel_username}')
    messages = await client.get_messages(channel_username, limit=limit)
    today = datetime.now().strftime('%Y-%m-%d')
    base_dir = f'data/raw/telegram_messages/{today}/{channel_username}'
    os.makedirs(base_dir, exist_ok=True)
    data = []
    for i, message in enumerate(tqdm(messages, desc=channel_username)):
        msg_dict = {
            'id': message.id,
            'date': str(message.date),
            'text': message.text,
            'has_media': bool(message.media),
            'media_path': None
        }
        if message.media and isinstance(message.media, MessageMediaPhoto):
            file_path = os.path.join(base_dir, f'photo_{message.id}.jpg')
            try:
                await message.download_media(file=file_path)
                msg_dict['media_path'] = file_path
                logging.info(f'Saved image to: {file_path}')
            except Exception as e:
                logging.error(f'Failed to download image: {e}')
        data.append(msg_dict)
    # Save messages as JSON
    json_path = os.path.join(base_dir, f'{channel_username}.json')
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    logging.info(f'Saved messages to: {json_path}')
    await client.disconnect()


In [None]:
# Run scraping for all channels
import asyncio
for channel in channels:
    asyncio.run(scrape_channel(channel, limit=limit))

## Download your data
After scraping, use the Colab file browser (left sidebar) to download the `data/` folder to your local machine for further processing.


In [10]:
%cd /content/data/
!zip -r raw.zip raw/
from google.colab import files
files.download('raw.zip')

/content/data
updating: raw/ (stored 0%)
updating: raw/telegram_messages/ (stored 0%)
updating: raw/telegram_messages/2025-07-15/ (stored 0%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/ (stored 0%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/tikvahpharma.json (deflated 79%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172676.jpg (deflated 4%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172730.jpg (deflated 5%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172422.jpg (deflated 14%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172431.jpg (deflated 3%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172448.jpg (deflated 2%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172350.jpg (deflated 1%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172566.jpg (deflated 4%)
updating: raw/telegram_messages/2025-07-15/tikvahpharma/photo_172546.jpg (deflated 2%)
upd

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>