In [None]:
from pathlib import Path
import requests
import logging

# Base URL under which each RFC is published as ``rfc<number>.html``.
BASE_RFC_URL = "https://www.rfc-editor.org/rfc/"
# Directory that receives the downloaded HTML files.
RAW_PATH_DATASET = Path("data/raw_dataset")
# File that receives the scraper's log records.
LOG_FILE = Path("logs/scrapping_logger.log")

# basicConfig(filename=...) opens the log file right away and raises
# FileNotFoundError when its directory is missing, so create the output
# directories up front.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
RAW_PATH_DATASET.mkdir(parents=True, exist_ok=True)

logging.basicConfig(filename=LOG_FILE, level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


def download_website(url, output_file, logger=None, timeout=30):
    """Download *url* and save the response body as UTF-8 text.

    Failures are logged and swallowed so that one bad URL does not abort
    a batch of downloads; the function returns ``None`` in every case.

    Args:
        url: Address to fetch with an HTTP GET request.
        output_file: Destination path (``str`` or ``Path``). Missing parent
            directories are created.
        logger: Optional logger-like object exposing ``info`` and ``error``.
            When omitted, the module-level ``logging`` functions are used.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.
    """
    log = logging if logger is None else logger

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        log.error("HTTP Error: %s", errh)
        return
    except requests.exceptions.ConnectionError as errc:
        log.error("Error Connecting: %s", errc)
        return
    except requests.exceptions.Timeout as errt:
        log.error("Timeout Error: %s", errt)
        return
    except requests.exceptions.RequestException as err:
        log.error("Request Exception: %s", err)
        return

    # The file write is handled separately from the request: requests'
    # exceptions derive from OSError, so a shared handler could not tell a
    # network failure apart from a file-system failure.
    try:
        target = Path(output_file)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, 'w', encoding='utf-8') as file:
            file.write(response.text)
    except OSError as erro:
        log.error("File Error: %s", erro)
        return

    log.info("Website content downloaded successfully and saved to %s", output_file)


def download_rfc(rfc_number: int, logger=None):
    """Download the HTML rendering of one RFC into ``RAW_PATH_DATASET``.

    The page ``rfc<rfc_number>.html`` is fetched from ``BASE_RFC_URL`` and
    stored under the same file name.

    Args:
        rfc_number: Number of the RFC to download.
        logger: Optional logger handed on to ``download_website``. It is
            optional so the function can be used directly as a one-argument
            callable, e.g. with ``Executor.map``.
    """
    rfc_name = f"rfc{rfc_number}.html"
    rfc_url = BASE_RFC_URL + rfc_name
    output_file = RAW_PATH_DATASET / rfc_name

    # Forward the logger only when one was actually supplied; otherwise use
    # the plain two-argument call.
    if logger is None:
        download_website(rfc_url, output_file)
    else:
        download_website(rfc_url, output_file, logger)


In [None]:
import concurrent.futures
from tqdm import tqdm

# RFC numbering starts at 1, so start there instead of requesting an
# "rfc0.html" page.
FIRST_RFC = 1
LAST_RFC = 9999
rfc_numbers = range(FIRST_RFC, LAST_RFC + 1)

# The work is network-bound, so a thread pool lets the waits overlap.
with concurrent.futures.ThreadPoolExecutor(max_workers=24) as executor:
    # Iterating the map() results only drives the progress bar; the
    # results themselves are not used.
    for _ in tqdm(executor.map(download_rfc, rfc_numbers), total=len(rfc_numbers)):
        pass