In [5]:
import requests
from bs4 import BeautifulSoup
import html2text
from urllib.parse import urljoin
import os
import re
import schedule
import time
from datetime import datetime, timedelta
import random

invalid_urls = [
    'tg://', 'tel:', 'qz', 'ru', 'en', 'cn', 'oz', 'ar'
]

unique_links = set()
unique_links_scraped = set()

converter = html2text.HTML2Text()
converter.ignore_links = True
converter.ignore_images = True
converter.single_line_break = True

alphanumeric_pattern = re.compile(r'\b[A-Za-z0-9]+\b')
unwanted_chars_pattern = re.compile(r'[#*:\-©]')

def main(url, output_file, unique_links, already_scraped_links):
    url_parts = url.split('/')
    if any(x in url_parts for x in invalid_urls):
        print(f'Skipping invalid URL: {url}')
        return

    try:
        response = requests.get(url)
    except requests.exceptions.SSLError as e:
        print(f'SSLError fetching {url}')
        return

    with open(output_file, 'a', encoding='utf-8') as f:
        # Fetch the main page content
        main_page_response = requests.get(url)
        main_page_content = main_page_response.text

        # Parse the main page and extract unique links
        soup = BeautifulSoup(main_page_content, 'html.parser')

        for link in soup.find_all('a', href=True):
            link_url = link['href']
            absolute_link_url = urljoin(url, link_url)
            url_parts = absolute_link_url.split('/')
            if not any(x in url_parts for x in invalid_urls):
                if absolute_link_url not in unique_links:
                    unique_links.add(absolute_link_url)

        # Scrape the content of the main page first and write it to the output file
        try:
            plain_text_content = converter.handle(main_page_content)
            f.write(f'URL: {url}\n\n')
            # Remove unwanted characters from the plain text content
            plain_text_content = unwanted_chars_pattern.sub('', plain_text_content)
            # Remove alphanumeric words from the plain text content
            plain_text_content = alphanumeric_pattern.sub('', plain_text_content)
            f.write(plain_text_content)
            f.write('\n' + '-' * 80 + '\n')
        except Exception as e:
            print(f'Error fetching main page {url}: {e}')

        # Scrape the content of each linked page and write it to the output file
        for link_url in unique_links:
            if link_url not in unique_links_scraped:
                try:
                    response = requests.get(link_url)
                    content = response.text
                    plain_text_content = converter.handle(content)
                    f.write(f'URL: {link_url}\n\n')
                    # Remove unwanted characters from the plain text content
                    plain_text_content = unwanted_chars_pattern.sub('', plain_text_content)
                    # Remove alphanumeric words from the plain text content
                    plain_text_content = alphanumeric_pattern.sub('', plain_text_content)
                    f.write(plain_text_content)
                    f.write('\n' + '-' * 80 + '\n')
                    unique_links_scraped.add(link_url)
                except requests.exceptions.SSLError as e:
                    print(f'SSLError fetching {link_url}: {e}')
                    continue
                except Exception as e:
                    print(f'Error fetching {link_url}: {e}')
                    continue

def get_next_run_time():
    now = datetime.now()
    random_offset = random.randint(1440, 2000)  # Random offset between 1 and 1440 minutes (24 hours)
    next_run_time = now + timedelta(minutes=random_offset)
    return next_run_time

def execute_main():
    main_urls = [
        'https://www.bbc.com/sinhala',
        'https://www.bbc.com/sinhala/topics/cg7267dz901t',
        'https://www.bbc.com/sinhala/topics/c83plvepnq1t',
        'https://www.lankadeepa.lk/latest-news/1',
        'https://www.kelimandala.lk/',
        'https://www.lankadeepa.lk/tharunaya/272',
        'https://www.lankadeepa.lk/business/9',
        'https://www.lankadeepa.lk/politics/13',
        'https://www.inform.kz/kz',
        'https://www.inform.kz/kz/bilik_g21',
        'https://www.inform.kz/kz/aymak_g25',
        'https://www.inform.kz/kz/medicina_t271',
        'https://www.inform.kz/kz/ekonomika_g22',
        'https://www.inform.kz/kz/search?sword=sports',
        'https://www.inform.kz/kz/kogam_g23',
        'https://www.inform.kz/kz/tagayyndau_s24904'
    ]

    output_files = [
        'sinhala_general_news(1).txt',
        'sinhala_srilankan_news.txt',
        'sinhala_world_news.txt',
        'sinhala_general_news(2).txt',
        'sinhala_sports.txt',
        'sinhala_general_news(3).txt',
        'sinhala_business.txt',
        'sinhala_politics.txt',
        'kazakh_general_news.txt',
        'kazakh_politics.txt',
        'kazakh_region.txt',
        'kazakh_world.txt',
        'kazakh_economy.txt',
        'kazakh_sports.txt',
        'kazakh_society.txt',
        'kazakh_accidents.txt'
    ]

    for url, output_file in zip(main_urls, output_files):
        main(url, output_file, unique_links, unique_links_scraped)

def schedule_script():
    # Run the  execute_main function immediately
    execute_main()

    # Schedule the `execute_main` function to run every day at the next_run_time
    next_run_time = get_next_run_time()
    print(f"Next run time for all URLs: {next_run_time.strftime('%Y-%m-%d %H:%M')}")
    schedule.every(1).day.at(next_run_time.strftime('%H:%M')).do(execute_main)

    # Run the pending scheduled tasks
    while True:
        schedule.run_pending()
        time.sleep(1)

# Call the `schedule_script` function to start scheduling
schedule_script()


Error fetching mailto:subs@wijeya.lk: No connection adapters were found for 'mailto:subs@wijeya.lk'
Error fetching http://epaper.ada.lk/: HTTPConnectionPool(host='epaper.ada.lk', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8967dc9a20>: Failed to establish a new connection: [Errno -2] Name or service not known'))
Error fetching http://life.dailymirror.lk/: HTTPConnectionPool(host='life.dailymirror.lk', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8967dc97e0>: Failed to establish a new connection: [Errno -5] No address associated with hostname'))
Error fetching http://epaper.tamilmirror.lk/: HTTPConnectionPool(host='epaper.tamilmirror.lk', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8967dc9720>: Failed to establish a new connection: [Errno -2] Name or serv

KeyboardInterrupt: ignored

In [2]:
#Install the libraries
!pip install html2text
!pip install schedule
!pip install requests
!pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting html2text
  Downloading html2text-2020.1.16-py3-none-any.whl (32 kB)
Installing collected packages: html2text
Successfully installed html2text-2020.1.16
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting schedule
  Downloading schedule-1.2.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.0
