<a href="https://colab.research.google.com/github/Tanveer707/NewsBot707/blob/main/Web_Scrapper_Telegram_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Prothom Alo News Scraper with Multi-User Telegram Bot

# Install required packages
!pip install requests beautifulsoup4 pyTelegramBotAPI schedule

# Import libraries
import os
import re
import csv
import json
import datetime
import logging
import time
import threading
from typing import List, Dict, Any, Set
import requests
from bs4 import BeautifulSoup
from google.colab import drive
import telebot
import schedule

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class UserManager:
    """Manages subscribed users for the bot."""

    def __init__(self, file_path: str = "/content/drive/MyDrive/prothom_alo_articles/users.json"):
        """
        Initialize the user manager.

        Args:
            file_path: Path to store user data
        """
        self.file_path = file_path
        self.users_file = file_path
        self.ensure_directory_exists()

    def ensure_directory_exists(self):
        """Ensure the directory for user file exists."""
        os.makedirs(os.path.dirname(self.users_file), exist_ok=True)

    def load_users(self) -> Set[int]:
        """Load subscribed users from file."""
        try:
            if os.path.exists(self.users_file):
                with open(self.users_file, 'r') as f:
                    data = json.load(f)
                    return set(data.get('users', []))
            return set()
        except Exception as e:
            logging.error(f"Error loading users: {e}")
            return set()

    def save_users(self, users: Set[int]):
        """Save subscribed users to file."""
        try:
            with open(self.users_file, 'w') as f:
                json.dump({'users': list(users)}, f)
        except Exception as e:
            logging.error(f"Error saving users: {e}")

    def add_user(self, user_id: int) -> bool:
        """Add a user to the subscription list."""
        users = self.load_users()
        if user_id not in users:
            users.add(user_id)
            self.save_users(users)
            return True
        return False

    def remove_user(self, user_id: int) -> bool:
        """Remove a user from the subscription list."""
        users = self.load_users()
        if user_id in users:
            users.remove(user_id)
            self.save_users(users)
            return True
        return False

    def get_all_users(self) -> Set[int]:
        """Get all subscribed users."""
        return self.load_users()

class ProthomAloScraper:
    """Scrapes news articles from Prothom Alo using HTML parsing."""

    SITE_CONFIG = {
        "prothom_alo": {
            "url": "https://www.prothomalo.com/",
            "article_pattern": r"https://www\.prothomalo\.com/(?:bangladesh|international|sports|opinion|entertainment|[^/]+)/(?!video|gallery|photo)[^/]+$",
            "title_selector": "h1.title, h1, h1[class*='title'], h1.story-title",
            "content_selector": "div.story-element-text p, div.article-content p, div[class*='content'] p, div.story-body p"
        }
    }

    def __init__(self, max_articles: int = 10):
        """
        Initialize the scraper.

        Args:
            max_articles: Maximum number of articles to scrape
        """
        self.max_articles = max_articles
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    def scrape_prothom_alo(self, category: str = None) -> List[Dict[str, Any]]:
        """
        Scrape articles from Prothom Alo using BeautifulSoup.

        Args:
            category: Optional category to filter articles (bangladesh, international, sports, etc.)

        Returns:
            List of dictionaries containing article data
        """
        newspaper = self.SITE_CONFIG["prothom_alo"]
        articles = []

        # Modify URL if category is specified
        url = newspaper["url"]
        if category and category not in ["all", "latest"]:
            url = f"{newspaper['url']}{category}"

        logging.info(f"Crawling Prothom Alo ({url})...")

        # Get the homepage or category page
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'  # Ensure correct encoding
            soup = BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            logging.error(f"Error fetching homepage: {str(e)}")
            return []

        # Find article links
        article_urls = set()
        for a_tag in soup.find_all('a', href=True):
            url = a_tag['href']
            if not url.startswith('http'):
                url = 'https://www.prothomalo.com' + url
            if re.match(newspaper["article_pattern"], url):
                article_urls.add(url)
            if len(article_urls) >= self.max_articles:
                break

        # Scrape each article
        for url in article_urls:
            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()
                response.encoding = 'utf-8'  # Ensure correct encoding
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract title
                title_elem = soup.select_one(newspaper["title_selector"])
                title = title_elem.get_text().strip() if title_elem else ""

                # Extract content
                content_elems = soup.select(newspaper["content_selector"])
                content = " ".join(elem.get_text() for elem in content_elems).strip()
                content = re.sub(r'\s+', ' ', content)

                if not title or len(content) < 100:
                    logging.warning(f"Skipping article with missing title/short content: {url}")
                    continue

                # Summarize content (first 3 sentences)
                summary = self.summarize_text(content)
                if len(summary) < 50:
                    logging.warning(f"Skipping article with short summary: {url}")
                    continue

                article = {
                    "source": "prothom_alo",
                    "url": url,
                    "title": title,
                    "content": content,
                    "summary": summary,
                    "timestamp": datetime.datetime.now().isoformat()
                }

                articles.append(article)
                time.sleep(1)  # Be polite to the server

            except requests.RequestException as e:
                logging.error(f"Error scraping {url}: {str(e)}")
                continue

        logging.info(f"Found {len(articles)} articles from Prothom Alo")
        return articles

    def summarize_text(self, text: str, max_sentences: int = 3) -> str:
        """
        Summarize text by taking the first few sentences.

        Args:
            text: The text to summarize
            max_sentences: Number of sentences to include in summary

        Returns:
            Summarized text
        """
        # Split text into sentences using a simple regex
        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
        # Take first max_sentences, ensuring we don't exceed available sentences
        summary_sentences = sentences[:min(max_sentences, len(sentences))]
        return " ".join(summary_sentences).strip()

class MultiUserNewsBot:
    """Multi-user Telegram bot for news updates."""

    def __init__(self, token: str):
        """
        Initialize the bot.

        Args:
            token: Telegram bot token
        """
        self.token = token
        self.bot = telebot.TeleBot(token)
        self.user_manager = UserManager()
        self.scraper = ProthomAloScraper(max_articles=5)
        self.setup_handlers()

    def setup_handlers(self):
        """Set up message handlers."""

        @self.bot.message_handler(commands=['start'])
        def handle_start(message):
            user_id = message.from_user.id
            username = message.from_user.username or message.from_user.first_name

            welcome_message = f"""
🗞️ Welcome to Prothom Alo News Bot, {username}!

This bot sends you the latest news from Prothom Alo.

Available commands:
• /start - Start receiving news updates
• /stop - Stop receiving news updates
• /news - Get latest news manually
• /status - Check your subscription status
• /help - Show this help message

You'll receive news updates automatically every 6 hours.
            """

            self.bot.reply_to(message, welcome_message)

            # Add user to subscription list
            was_new = self.user_manager.add_user(user_id)
            if was_new:
                self.bot.reply_to(message, "✅ You've been subscribed to news updates!")
                logging.info(f"New user subscribed: {user_id} ({username})")
            else:
                self.bot.reply_to(message, "📝 You're already subscribed to news updates!")

        @self.bot.message_handler(commands=['stop'])
        def handle_stop(message):
            user_id = message.from_user.id
            username = message.from_user.username or message.from_user.first_name

            removed = self.user_manager.remove_user(user_id)
            if removed:
                self.bot.reply_to(message, "❌ You've been unsubscribed from news updates.")
                logging.info(f"User unsubscribed: {user_id} ({username})")
            else:
                self.bot.reply_to(message, "❌ You weren't subscribed to news updates.")

        @self.bot.message_handler(commands=['news'])
        def handle_news(message):
            self.bot.reply_to(message, "📰 Fetching latest news...")

            articles = self.scraper.scrape_prothom_alo()
            if articles:
                self.send_articles_to_user(message.chat.id, articles[:3])  # Send top 3 articles
            else:
                self.bot.reply_to(message, "❌ Unable to fetch news at this time. Please try again later.")

        @self.bot.message_handler(commands=['status'])
        def handle_status(message):
            user_id = message.from_user.id
            users = self.user_manager.get_all_users()

            if user_id in users:
                self.bot.reply_to(message, "✅ You are subscribed to news updates!")
            else:
                self.bot.reply_to(message, "❌ You are not subscribed to news updates. Use /start to subscribe.")

        @self.bot.message_handler(commands=['help'])
        def handle_help(message):
            help_message = """
🤖 Prothom Alo News Bot Commands:

• /start - Start receiving news updates
• /stop - Stop receiving news updates
• /news - Get latest news manually
• /status - Check your subscription status
• /help - Show this help message

The bot automatically sends news updates every 6 hours to all subscribed users.
            """
            self.bot.reply_to(message, help_message)

    def send_articles_to_user(self, chat_id: int, articles: List[Dict[str, Any]]):
        """Send articles to a specific user."""
        for article in articles:
            message = (
                f"<b>{article['title']}</b>\n\n"
                f"<i>Source: Prothom Alo</i>\n"
                f"<a href='{article['url']}'>Read full article</a>\n\n"
                f"<b>Summary:</b>\n{article['summary']}\n\n"
            )

            # Telegram has a message length limit
            if len(message) > 4000:
                message = message[:3997] + "..."

            try:
                self.bot.send_message(
                    chat_id=chat_id,
                    text=message,
                    parse_mode='HTML',
                    disable_web_page_preview=False
                )
                time.sleep(1)  # Rate limiting
            except Exception as e:
                logging.error(f"Error sending message to {chat_id}: {e}")

    def send_news_to_all_users(self):
        """Send news updates to all subscribed users."""
        logging.info("🔄 Starting automated news update...")

        articles = self.scraper.scrape_prothom_alo()
        if not articles:
            logging.warning("No articles scraped for broadcast")
            return

        users = self.user_manager.get_all_users()
        successful_sends = 0

        for user_id in users:
            try:
                self.send_articles_to_user(user_id, articles)
                successful_sends += 1
                logging.info(f"✅ Sent news to user {user_id}")
            except Exception as e:
                logging.error(f"❌ Failed to send news to user {user_id}: {e}")

        logging.info(f"📊 News update complete: {successful_sends}/{len(users)} users reached")

    def start_polling(self):
        """Start the bot polling."""
        print("🤖 Starting bot...")
        print("✅ Bot is now running and accepting users!")
        print("Users can now find your bot and use /start to subscribe.")
        print("🔄 Automatic news updates will be sent every 6 hours.")

        # Schedule automatic news updates
        schedule.every(6).hours.do(self.send_news_to_all_users)

        # Run scheduler in a separate thread
        def run_scheduler():
            while True:
                schedule.run_pending()
                time.sleep(1)

        scheduler_thread = threading.Thread(target=run_scheduler)
        scheduler_thread.daemon = True
        scheduler_thread.start()

        # Start polling
        try:
            self.bot.infinity_polling(none_stop=True)
        except Exception as e:
            logging.error(f"Bot polling error: {e}")
            # Restart polling
            time.sleep(5)
            self.start_polling()

# Global bot instance
bot_instance = None

# Function to run the multi-user bot
def run_multi_user_bot():
    global bot_instance

    # Telegram bot token - Replace with your actual token
    TOKEN = "7741600382:AAHGR4cEy8YmrS_kYyD8JgCYieK56aYHJ2A"

    print("🚀 Initializing Multi-User Prothom Alo News Bot...")
    print("This bot will:")
    print("• Accept any user who starts it with /start")
    print("• Send news updates every 6 hours to all subscribed users")
    print("• Allow users to unsubscribe with /stop")
    print("• Let users get news manually with /news")
    print("\n" + "="*50)

    bot_instance = MultiUserNewsBot(TOKEN)
    bot_instance.start_polling()

# For Google Colab execution
# -------------------------------------------

# Enter your Telegram Bot Token below:
telegram_bot_token = "7741600382:AAHGR4cEy8YmrS_kYyD8JgCYieK56aYHJ2A"  # @@ Replace with your actual bot token @@

def execute_multi_user_bot():
    """Execute the multi-user bot."""
    global bot_instance

    if bot_instance is not None:
        print("⚠️ Bot is already running!")
        return

    # Replace with the provided token
    TOKEN = telegram_bot_token

    print("🚀 Initializing Multi-User Prothom Alo News Bot...")
    print("This bot will:")
    print("• Accept any user who starts it with /start")
    print("• Send news updates every 6 hours to all subscribed users")
    print("• Allow users to unsubscribe with /stop")
    print("• Let users get news manually with /news")
    print("\n" + "="*50)

    bot_instance = MultiUserNewsBot(TOKEN)

    # Start in a separate thread to prevent blocking
    def start_bot():
        try:
            bot_instance.start_polling()
        except Exception as e:
            print(f"❌ Error starting bot: {e}")
            logging.error(f"Bot startup error: {e}")

    bot_thread = threading.Thread(target=start_bot)
    bot_thread.daemon = True
    bot_thread.start()

    print("✅ Bot started successfully!")
    print("🔗 Your bot is now live and accepting users!")

# Create button for Colab
from IPython.display import HTML, display

def create_bot_button():
    button_html = """
    <div style="text-align: center; margin: 20px;">
        <button style="height:50px; background-color:#2196F3; color:white; border:none;
               padding:0 30px; text-align:center; text-decoration:none;
               display:inline-block; font-size:18px; font-weight:bold;
               cursor:pointer; border-radius:8px; transition:0.3s;"
               onclick="window.start_multi_user_bot()"
               onmouseover="this.style.backgroundColor='#1976D2'"
               onmouseout="this.style.backgroundColor='#2196F3'">
          🤖 Start Multi-User News Bot
        </button>
        <div id="bot-status" style="margin-top:15px; font-size:16px;"></div>
    </div>

    <div style="background-color:#f5f5f5; padding:20px; border-radius:8px; margin-top:20px;">
        <h3>🔧 How to use this Multi-User Bot:</h3>
        <ol style="text-align: left;">
            <li><strong>Start the bot</strong> by clicking the button above</li>
            <li><strong>Any user</strong> can now find your bot on Telegram and use <code>/start</code></li>
            <li><strong>Users will receive news</strong> automatically every 6 hours</li>
            <li><strong>Available commands for users:</strong>
                <ul>
                    <li><code>/start</code> - Subscribe to news updates</li>
                    <li><code>/stop</code> - Unsubscribe from updates</li>
                    <li><code>/news</code> - Get latest news manually</li>
                    <li><code>/status</code> - Check subscription status</li>
                    <li><code>/help</code> - Show help message</li>
                </ul>
            </li>
        </ol>
        <p><strong>⚠️ Note:</strong> Keep this Colab notebook running for the bot to work!</p>
    </div>

    <script>
    window.start_multi_user_bot = function() {
        document.getElementById('bot-status').innerHTML = '🔄 Starting bot... Please wait...';
        google.colab.kernel.invokeFunction('notebook.start_multi_user_bot', [], {})
            .then(function(result) {
                document.getElementById('bot-status').innerHTML = '✅ Bot started successfully! It is now accepting users.';
            })
            .catch(function(error) {
                document.getElementById('bot-status').innerHTML = '❌ Error starting bot: ' + error.message;
            });
    }
    </script>
    """
    return HTML(button_html)

# Register the function for Colab
from google.colab import output
output.register_callback('notebook.start_multi_user_bot', execute_multi_user_bot)

# Display the button and instructions
display(create_bot_button())

# Manual execution option
print("\n" + "="*60)
print("🚀 QUICK START:")
print("="*60)
print("Option 1: Click the button above")
print("Option 2: Run this command manually:")
print("execute_multi_user_bot()")
print("="*60)

# Instructions for setting up the bot
print("\n" + "="*60)
print("🛠️ SETUP INSTRUCTIONS:")
print("="*60)
print("1. Make sure you have created a Telegram bot using @BotFather")
print("2. Replace the TOKEN above with your actual bot token")
print("3. Click the 'Start Multi-User News Bot' button above")
print("4. Your bot will now accept any user who sends /start")
print("5. Keep this notebook running for continuous operation")
print("6. Share your bot username with others so they can subscribe")
print("="*60)

Mounted at /content/drive



🚀 QUICK START:
Option 1: Click the button above
Option 2: Run this command manually:
execute_multi_user_bot()

🛠️ SETUP INSTRUCTIONS:
1. Make sure you have created a Telegram bot using @BotFather
2. Replace the TOKEN above with your actual bot token
3. Click the 'Start Multi-User News Bot' button above
4. Your bot will now accept any user who sends /start
5. Keep this notebook running for continuous operation
6. Share your bot username with others so they can subscribe
🚀 Initializing Multi-User Prothom Alo News Bot...
This bot will:
• Accept any user who starts it with /start
• Send news updates every 6 hours to all subscribed users
• Allow users to unsubscribe with /stop
• Let users get news manually with /news

🤖 Starting bot...
✅ Bot is now running and accepting users!
Users can now find your bot and use /start to subscribe.
🔄 Automatic news updates will be sent every 6 hours.
✅ Bot started successfully!
🔗 Your bot is now live and accepting users!
