In [1]:
from multiprocessing import Pool
from lxml import etree
import aiohttp
import asyncio
import time

In [2]:
!pip install nest_asyncio --quiet
import nest_asyncio
nest_asyncio.apply()

Exercise: Use asyncio for the HTTP requests and multiprocessing for parsing the HTML. Compare the performance with the practices in the lecture note.



In [3]:
# arXiv abstract pages for identifiers 2201.00001 through 2201.00010.
urls = [f'https://arxiv.org/abs/2201.{paper_no:05d}' for paper_no in range(1, 11)]

In [4]:
# Shared buffer that get_html() appends every downloaded page to. It is bound to a
# real list here (a bare `htmls: list` annotation creates no variable), so get_html()
# works even when no later cell has assigned `htmls` yet.
htmls: list = []


async def get_html(url, session=None):
    '''
    Download `url` with an HTTP GET and store the raw body in the global `htmls`.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    session : aiohttp.ClientSession, optional
        An already-open session to reuse for the request. When omitted, a
        temporary session is opened for this single request and closed afterwards.

    Returns
    -------
    bytes
        The response body; the same object is also appended to `htmls`.
    '''
    if session is None:
        # No caller-supplied session: open one just for this request.
        async with aiohttp.ClientSession() as own_session:
            return await get_html(url, session=own_session)

    async with session.request('GET', url) as resp:
        html = await resp.read()
    # Pages are appended as each download finishes, so when several get_html()
    # coroutines run concurrently the order in `htmls` follows completion order.
    htmls.append(html)
    return html

def parse_html(html):
  '''
  Parse an HTML document and return the text nodes that sit directly inside
  every <h1> element whose class attribute contains "title".

  The result is whatever list the XPath query yields (empty when nothing matches).
  '''
  title_query = '//h1[contains(@class, "title")]/text()'
  document = etree.HTML(html)
  return document.xpath(title_query)


Complete the following routines

In [5]:
def main_get_html():
  '''
  Download every page listed in `urls` concurrently with asyncio.

  One get_html() coroutine is created per URL and all of them are awaited
  together, so the requests overlap instead of running one after another.
  Each coroutine appends its page to the global `htmls` list; nothing is returned.

  Inside a notebook the kernel's event loop is already running, so this relies on
  nest_asyncio.apply() having been called beforehand (as the setup cell does).
  '''
  async def fetch_all():
    # gather() is awaited inside a coroutine so it always has a running loop.
    await asyncio.gather(*(get_html(url) for url in urls))

  # asyncio.run() replaces the get_event_loop()/run_until_complete() pair, which
  # is deprecated when no event loop is running.
  asyncio.run(fetch_all())

def main_parse_html(processes=10):
  '''
  Parse every page in the global `htmls` list in parallel with multiprocessing.

  Parameters
  ----------
  processes : int, optional
      Number of worker processes in the pool. Defaults to 10, the value that
      was previously hard-coded.

  Returns
  -------
  list
      One parse_html() result per entry of `htmls`, in the same order as `htmls`.
  '''
  with Pool(processes) as pool:
    # map() blocks until every page is parsed, so the results are complete
    # before the `with` block tears the pool down.
    return pool.map(parse_html, htmls)

In [6]:
%%time
# Start from an empty list: get_html() appends to `htmls`, so re-running this cell
# without the reset would accumulate duplicate pages.
htmls = []
# NOTE: the measured time covers both the downloads and the parsing.
main_get_html()
main_parse_html()

CPU times: user 89.6 ms, sys: 75.1 ms, total: 165 ms
Wall time: 820 ms


How about using multiple threads for parsing the HTML? Try it out.

In [7]:
import threading

In [8]:
%%time
# One thread per page already stored in `htmls`. Only parsing is timed here, so the
# figure is not directly comparable with a cell that also times the downloads.
# threading.Thread ignores its target's return value, so the parsed titles are discarded.
threads = [threading.Thread(target=parse_html, args=(html,)) for html in htmls]
for thread in threads:
    thread.start()
# Wait for every parser thread to finish before the timer stops.
for thread in threads:
    thread.join()

CPU times: user 21.4 ms, sys: 4.43 ms, total: 25.8 ms
Wall time: 29.3 ms
