In [20]:
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://mofanpy.com/"
# base_url = 'https://mofanpy.com/'

# DON'T OVER CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
if base_url != "http://127.0.0.1:4000/":
    restricted_crawl = True
else:
    restricted_crawl = False

In [21]:
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)             # slightly delay for downloading
    return response.read().decode('utf-8')

In [24]:
def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all("a", {"href": re.compile('/tutorials/.*/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

In [25]:
unseen = set([base_url,])
seen = set()

count, t1 = 1, time.time()

while len(unseen) != 0:                 # still get some url to visit
    if restricted_crawl and len(seen) > 19:
            break
        
    print('\nDistributed Crawling...')
    htmls = [crawl(url) for url in unseen]

    print('\nDistributed Parsing...')
    results = [parse(html) for html in htmls]

    print('\nAnalysing...')
    seen.update(unseen)         # seen the crawled
    unseen.clear()              # nothing unseen

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1, ))    # 53 s


Distributed Crawling...

Distributed Parsing...

Analysing...
1 莫烦Python主页 https://mofanpy.com/

Distributed Crawling...

Distributed Parsing...

Analysing...
2 多线程 (Threading) | 莫烦Python https://mofanpy.com/tutorials/python-basic/threading/
3 有趣的机器学习 | 莫烦Python https://mofanpy.com/tutorials/machine-learning/ML-intro/
4 Linux 简易教学 | 莫烦Python https://mofanpy.com/tutorials/others/linux-basic/
5 Tensorflow | 莫烦Python https://mofanpy.com/tutorials/machine-learning/tensorflow/
6 自然语言处理 | 莫烦Python https://mofanpy.com/tutorials/machine-learning/nlp/
7 Numpy & Pandas 数据处理 | 莫烦Python https://mofanpy.com/tutorials/data-manipulation/np-pd/
8 Theano | 莫烦Python https://mofanpy.com/tutorials/machine-learning/theano/
9 SciKit-Learn | 莫烦Python https://mofanpy.com/tutorials/machine-learning/sklearn/
10 窗口视窗 (Tkinter) | 莫烦Python https://mofanpy.com/tutorials/python-basic/tkinter/
11 多进程 (Multiprocessing) | 莫烦Python https://mofanpy.com/tutorials/python-basic/multiprocessing/
12 机器学习实战 | 莫烦Python https:/

In [26]:
from urllib.request import urlopen
#import ssl
#import requests
#ssl._create_default_https_context = ssl._create_unverified_context
#print(requests.get("https://mofanpy.com/").url)
print(urlopen("https://mofanpy.com/tutorials/machine-learning/nlp").read().decode())

HTTPError: HTTP Error 308: PERMANENT REDIRECT

In [27]:
import time


def job(t):
    print('Start job ', t)
    time.sleep(t)               # wait for "t" seconds
    print('Job ', t, ' takes ', t, ' s')
    

def main():
    [job(t) for t in range(1, 3)]
    
    
t1 = time.time()
main()
print("NO async total time : ", time.time() - t1)

Start job  1
Job  1  takes  1  s
Start job  2
Job  2  takes  2  s
NO async total time :  3.0120837688446045


In [28]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

async def job(t):
    print('Start job ', t)
    await asyncio.sleep(t)          # wait for "t" seconds, it will look for another job while await
    print('Job ', t, ' takes ', t, ' s')
    

async def main(loop):
    tasks = [loop.create_task(job(t)) for t in range(1, 3)]     # just create, not run job
    await asyncio.wait(tasks)                                   # run jobs and wait for all tasks done

t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# loop.close()                          # Ipython notebook gives error if close loop
print("Async total time : ", time.time() - t1)

Start job  1
Start job  2
Job  1  takes  1  s
Job  2  takes  2  s
Async total time :  2.0161380767822266


In [31]:
import requests

URL = 'https://mofanpy.com/'


def normal():  
    for i in range(2):
        r = requests.get(URL)
        url = r.url
        print(url)
    
t1 = time.time()
normal()
print("Normal total time:", time.time()-t1)

https://mofanpy.com/
https://mofanpy.com/
Normal total time: 0.25673961639404297


In [32]:
import aiohttp


async def job(session):
    response = await session.get(URL)
    return str(response.url)


async def main(loop):
    async with aiohttp.ClientSession() as session:
        tasks = [loop.create_task(job(session)) for _ in range(2)]
        finished, unfinished = await asyncio.wait(tasks)
        all_results = [r.result() for r in finished]        # get return from job
        print(all_results)
    
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# loop.close()                      # Ipython notebook gives error if close loop
print("Async total time:", time.time() - t1)

['https://mofanpy.com/', 'https://mofanpy.com/']
Async total time: 0.11934757232666016
