In [1]:
from multiprocessing import Pool
from lxml import etree
import aiohttp
import asyncio
import time

In [2]:
!pip install nest_asyncio --quiet
import nest_asyncio
nest_asyncio.apply()

Exercise: Use async for http requests and multiprocessing for parsing html. Compare the performance with the practices in the lecture note



In [3]:
urls = ['https://arxiv.org/abs/2201.000%02d'%i for i in range(1, 11)]

In [4]:
htmls: list
async def get_html(url):
      async with aiohttp.ClientSession() as session:
          async with session.request('GET', url) as resp:
              html = await resp.read()
              htmls.append(html)

def parse_html(html, cnt):
  title = etree.HTML(html).xpath('//h1[contains(@class, "title")]/text()')
  print('Title %d: %s' % (cnt,''.join(title)))

Complete the following routines

In [5]:
def main_get_html():
  '''
  use asyncio
  '''
  loop = asyncio.get_event_loop()
  tasks = [get_html(url) for url in urls]
  loop.run_until_complete(asyncio.gather(*tasks))

def main_parse_html():
  '''
  use multiprocessing
  '''
  p = Pool()
  for i, html in enumerate(htmls):
    p.apply_async(parse_html, args=(html, i))
  p.close()
  p.join() 


In [6]:
%%time
htmls = []
main_get_html()
main_parse_html()

Title 0: A Literature Review on Length of Stay Prediction for Stroke Patients using Machine Learning and Statistical ApproachesTitle 1: Locally finite free space as limiting case of PT-symmetric medium

Title 2: A Lightweight and Accurate Spatial-Temporal Transformer for Traffic ForecastingTitle 3: Time-Dependent Duhamel Renormalization method with Multiple Conservation and Dissipation Laws

Title 4: Robust reliability-based topology optimization under random-field material modelTitle 5: Improving Deep Neural Network Classification Confidence using Heatmap-based eXplainable AI

Title 6: Modeling Advection on Directed Graphs using Matérn Gaussian Processes for Traffic Flow
Title 7: Simulating local fields in carbon nanotube reinforced composites for infinite strip with voids
Title 9: Confidence-Aware Multi-Teacher Knowledge DistillationTitle 8: AttentionLight: Rethinking queue length and attention mechanism for traffic signal control

CPU times: user 167 ms, sys: 50.5 ms, total: 218 ms


How about using multi threads for parsing html? Try it out

In [7]:
import threading

In [8]:
def main_parse_html_mt():
  '''
  use multithreading
  '''
  p = threading.Pool()
  for i, html in enumerate(htmls):
    p.apply_async(parse_html, args=(html, i))
  p.close()
  p.join() 

In [9]:
%%time
main_parse_html()

Title 1: Locally finite free space as limiting case of PT-symmetric mediumTitle 0: A Literature Review on Length of Stay Prediction for Stroke Patients using Machine Learning and Statistical Approaches

Title 3: Time-Dependent Duhamel Renormalization method with Multiple Conservation and Dissipation LawsTitle 2: A Lightweight and Accurate Spatial-Temporal Transformer for Traffic Forecasting
Title 4: Robust reliability-based topology optimization under random-field material model

Title 5: Improving Deep Neural Network Classification Confidence using Heatmap-based eXplainable AI
Title 6: Modeling Advection on Directed Graphs using Matérn Gaussian Processes for Traffic Flow
Title 7: Simulating local fields in carbon nanotube reinforced composites for infinite strip with voidsTitle 8: AttentionLight: Rethinking queue length and attention mechanism for traffic signal control
Title 9: Confidence-Aware Multi-Teacher Knowledge Distillation

CPU times: user 74.8 ms, sys: 36.5 ms, total: 111 ms