# 1. ThreadPoolExecutor

A thread pool is generally used for I/O-bound tasks.

TODO: add more description


# Example: Wikipedia scraper
Scenario: for a given list of terms (`TERMS`), get the content of the first paragraph of each term's Wikipedia article.
Given:
 - `WIKI_URL`: base URL of the English Wikipedia. In each task the term is appended to the end of this string (e.g. `'https://en.wikipedia.org/wiki/' + 'kite'` gives a [URL for an article about a kite](https://en.wikipedia.org/wiki/kite)).
 - `TERMS`: a list of terms to search
 - `get_from_wiki`: a function which requests the article for a given term, checks the response status code and returns a tuple of the status code and the text of the first paragraph of that Wikipedia article.
 - `get_first_paragraph`: a helper function for parsing the html content and extracting the first paragraph's text.
 - `timeit`: a decorator function for measuring the execution time of wrapped function.

In [None]:
from lxml import html
import time
from typing import *

import requests

# Prefix shared by every article URL of the English Wikipedia; a term is
# appended to it to build the address of that term's article.
WIKI_URL = 'https://en.wikipedia.org/wiki/'

# Terms looked up by the examples below.
TERMS = [
    'family', 'measurement', 'leader', 'atmosphere',
    'possibility', 'housing', 'payment', 'sympathy',
    'meal', 'description', 'intention', 'community',
    'preference', 'menu', 'volume', 'brewery',
    'abcdefgh',  # no article
    'assumption', 'patience', 'recipe',
]


def timeit(func):
    """
    Wraps the function for measuring its execution time.

    Every call of the wrapped function prints how long the call took and
    returns the result of `func` unchanged. If `func` raises, the exception
    propagates and nothing is printed for that call.

    Note: for a generator function only the creation of the generator
    object is timed, because its body does not run until it is iterated.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)  # preserve the name and docstring of `func`
    def wrapped(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by adjustments of the system clock.
        t_start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f'Executed `{func.__name__}` in {(time.perf_counter() - t_start):.2f}s')
        return result

    return wrapped


def get_first_paragraph(html_text: Union[str, bytes]) -> str:
    """
    Returns a text from first paragraph of given html content.

    The document may be given as text or as raw bytes (the form in which
    `get_from_wiki` passes the response body). An empty string is returned
    for blank input and when no `<p>` element is found inside `<body>`.
    """
    # lxml refuses to parse an empty document, so answer blank input directly.
    if not html_text or not html_text.strip():
        return ''
    tree = html.fromstring(html_text)
    paragraph = tree.find('body//p')
    if isinstance(paragraph, html.HtmlElement):
        return paragraph.text_content().strip()
    return ''
    

def get_from_wiki(term: str, timeout: float = 10.0) -> Tuple[int, str]:
    """
    Returns the status code and text of first paragraph
    from wikipedia article in form of a tuple.

    `term` is appended to `WIKI_URL` to build the requested address.
    `timeout` is the number of seconds `requests` waits for the server
    before giving up and raising; without it a stalled connection would
    block the calling thread indefinitely. For any response status other
    than 200 the returned text is an empty string.
    """
    res = requests.get(WIKI_URL + term, timeout=timeout)
    status = res.status_code
    if status != 200:
        return status, ''
    # The raw bytes are handed over so the parser can work out the encoding.
    return status, get_first_paragraph(res.content)


### Sample output for the [article about data scraping](https://en.wikipedia.org/wiki/Data_scraping), obtained with the `get_from_wiki` function:


In [None]:
# Fetch a single article and show both parts of the returned tuple.
code, text = get_from_wiki('data_scraping')
print(f'response code: {code}', f'text: {text}', sep='\n')



## A standard sequential task which executes function for each term:


In [None]:
@timeit
def task_sequential(terms):
    """Fetches the terms one after another and returns the list of results."""
    return list(map(get_from_wiki, terms))



#### Test `task_sequential`:


In [None]:
# Fetch all terms one by one; the `timeit` wrapper prints how long it took.
result_sequential = task_sequential(TERMS)

# Show every term next to the status code and text fetched for it.
for term, (code, text) in zip(TERMS, result_sequential):
    print(f'{term}, response code: {code}', f'{text} \n', sep='\n')

## Parallel task: take 1
Parallelized task using the `ThreadPoolExecutor` and its `submit()` method. Returns a list of results.

In [None]:
from concurrent.futures import ThreadPoolExecutor


@timeit
def task_parallel(terms, n_workers=10):
    """
    Fetches the terms concurrently with `submit()` and returns the results
    as a list ordered like `terms`.
    """
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        pending = [executor.submit(get_from_wiki, term) for term in terms]
    # Leaving the `with` block waits for every submitted job, so the
    # results below are collected in submission order from finished futures.
    results = []
    for job in pending:
        results.append(job.result())
    return results

#### Test `task_parallel`:


In [None]:
# Run the thread-pool version over the same terms. `task_parallel` collects
# its futures in submission order, so the two lists can be compared directly.
result_parallel = task_parallel(TERMS)
result_sequential == result_parallel  # same result?


## Parallel task: take 2
Parallelized task using the `ThreadPoolExecutor` and its `map()` method. Returns a list of results.

In [None]:
@timeit
def task_parallel_2(terms, n_workers=10):
    """
    Fetches the terms concurrently with `map()` and returns the results
    as a list ordered like `terms`.
    """
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        fetched = executor.map(get_from_wiki, terms)
    # The iterator is turned into a list only after the pool has been closed.
    return [*fetched]

In [None]:
# Repeat the terms three times to get a larger batch of requests.
longer_list = TERMS * 3
# The results are discarded; what matters here is the duration printed by the
# `timeit` wrapper for the default pool of 10 workers versus 30 workers.
_ = task_parallel_2(longer_list)
_ = task_parallel_2(longer_list, n_workers=30)

## Parallel task: take 3
Parallelized task using the `ThreadPoolExecutor` and its `map()` method. Lazy.

In [None]:
@timeit
def task_parallel_3(terms, n_workers=10):
    """
    Lazily yields the result of `get_from_wiki` for each of the given terms.

    This is a generator function: calling it only creates a generator, and
    neither the pool nor any request is started before the first item is
    requested.

    NOTE: for the same reason the `timeit` wrapper reports only the time
    needed to create the generator, not the time spent fetching.
    """
    with ThreadPoolExecutor(n_workers) as pool:
        # The pool stays open for as long as the results are being consumed.
        yield from pool.map(get_from_wiki, terms)

In [None]:
# Calling the generator function only creates the generator object.
res = task_parallel_3(TERMS, n_workers=2)
res
# Pause before the generator is consumed.
time.sleep(1)
# Iterating the generator is what drives `pool.map`; only the status codes
# are printed, the paragraph text is ignored.
for code, text in res:
    print(code)