You can make web requests to invoke the Flask-based HTTP functions as needed to load and clean more data.

In [1]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlencode

import requests

def dispatch_etl_requests(
    start_row_index,
    end_row_index,
    source,
    batch_size=20,
    max_workers=10,
    base_url="http://127.0.0.1:8181/etl",
    timeout=None,
):
    """
    Dispatches concurrent ETL requests to the specified endpoint.

    The inclusive row range is split into consecutive batches of at most
    ``batch_size`` rows; one GET request is issued per batch. A progress line
    is printed for each request as soon as it completes.

    Args:
        start_row_index (int): Start index (inclusive)
        end_row_index (int): End index (inclusive)
        source (str): Specifies the data source.
            - 'training': Uses the knowledge base or an external database.
            - 'testing': Uses the Heymate internal database.
        batch_size (int): Number of rows per batch (must be >= 1)
        max_workers (int): Number of concurrent workers
        base_url (str): Base URL of the ETL endpoint
        timeout: Optional timeout forwarded to ``requests.get``. The default
            ``None`` waits indefinitely, matching the previous behaviour.

    Returns:
        List of tuples: (status_code, url, response_text), one per batch, in
        batch (submission) order. If a request raised, ``status_code`` is
        ``None`` and ``response_text`` holds the exception message.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() would fail obscurely on 0 and silently yield nothing on
        # negative steps; fail loudly instead.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Build one URL per batch; urlencode escapes `source` so that characters
    # such as spaces or '&' cannot corrupt the query string.
    urls = []
    for batch_start in range(start_row_index, end_row_index + 1, batch_size):
        batch_end = min(batch_start + batch_size - 1, end_row_index)
        query = urlencode(
            {
                "start_row_index": batch_start,
                "end_row_index": batch_end,
                "source": source,
            }
        )
        urls.append(f"{base_url}?{query}")

    def fetch(url):
        # Best effort: a failing request is reported in the result tuple
        # rather than aborting the whole dispatch.
        try:
            response = requests.get(url, timeout=timeout)
            return response.status_code, url, response.text
        except Exception as e:
            return None, url, str(e)

    results = [None] * len(urls)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember each future's batch position so the returned list is in
        # batch order even though completion order is arbitrary.
        future_to_position = {
            executor.submit(fetch, url): position
            for position, url in enumerate(urls)
        }
        for future in as_completed(future_to_position):
            status, url, result = future.result()
            print(f"[{status}] {url} → {result[:200]}...")
            results[future_to_position[future]] = (status, url, result)

    return results



In [None]:
# Example usage of dispatch_etl_requests

# This will dispatch ETL requests for rows 500 to 600 (inclusive),
# using the 'training' source, in batches of 10 rows each.
# Up to 5 requests will be processed concurrently.
# The requests will be sent to the specified ETL endpoint.

# dispatch_etl_requests(500, 600, "training", batch_size=10, max_workers=5)