# 第十七章

## 17.1 示例：网络下载的三种风格

https://www.youtube.com/watch?v=A9e9Cy1UkME

### 17.1.1 依序下载的脚本

In [10]:
#flags.py
import os
import time
import sys

import requests  # <1>

POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
            'MX PH VN ET EG DE IR TR CD FR').split()  # <2>

BASE_URL = 'http://flupy.org/data/flags'  # <3>

DEST_DIR = 'downloads/'  # <4>


def download_many(cc_list):
    """Download the flag for every country code, strictly one at a time.

    Each code is fetched with ``get_flag``, echoed with ``show`` and written
    to disk with ``save_flag`` before the next code is started.

    Returns the number of entries in ``cc_list``.
    """
    for country_code in cc_list:
        image_bytes = get_flag(country_code)
        show(country_code)
        file_name = '{}.gif'.format(country_code.lower())
        save_flag(image_bytes, file_name)
    return len(cc_list)

def get_flag(cc):
    """Fetch the GIF for one country code and return the response body.

    The URL is built from the module-level ``BASE_URL`` and the lower-cased
    code. The HTTP status of the response is not inspected here.
    """
    code = cc.lower()
    flag_url = '{}/{cc}/{cc}.gif'.format(BASE_URL, cc=code)
    response = requests.get(flag_url)
    return response.content

def show(cc):
    """Print ``cc`` followed by two spaces, without a newline.

    Standard output is flushed right away so progress shows up as each flag
    is handled instead of when the output buffer happens to fill.
    """
    print(cc, end='  ', flush=True)

def save_flag(img, filename):
    """Write the bytes ``img`` to ``filename`` inside ``DEST_DIR``.

    The destination directory is created on demand, so a first run on a
    machine where it does not exist yet no longer fails with
    ``FileNotFoundError``.
    """
    # exist_ok=True makes this a no-op when the directory is already there,
    # including when another thread created it a moment ago.
    os.makedirs(DEST_DIR, exist_ok=True)
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)

def main(download_many):
    """Run ``download_many`` over ``POP20_CC`` and report how long it took.

    ``download_many`` is any callable that accepts the list of country codes
    and returns a count; that count and the elapsed wall-clock seconds are
    printed when it returns.
    """
    started = time.time()
    downloaded = download_many(POP20_CC)
    seconds = time.time() - started
    print('\n{} flags downloaded in {:.2f}s'.format(downloaded, seconds))

if __name__ =='__main__':
    main(download_many)
        

CN  IN  US  ID  BR  PK  NG  BD  RU  JP  MX  PH  VN  ET  EG  DE  IR  TR  CD  FR  
20 flags downloaded in 8.42s


### 17.1.2 使用concurrent.futures模块下载

In [12]:
#flags_threadpool.py
from concurrent import futures
MAX_workers = 20
def download_one(cc):
    """Fetch, announce and save the flag for a single country code.

    This is the unit of work handed to each worker. Returns ``cc`` unchanged.
    """
    flag_bytes = get_flag(cc)
    show(cc)
    target_name = '{}.gif'.format(cc.lower())
    save_flag(flag_bytes, target_name)
    return cc

def download_many(cc_list):
    """Download every flag in ``cc_list`` concurrently with a thread pool.

    The pool never has more threads than there are flags to fetch, and never
    more than ``MAX_workers``.

    Returns the number of results produced, which is 0 for an empty
    ``cc_list``.
    """
    if not cc_list:
        # ThreadPoolExecutor raises ValueError for max_workers=0, so there is
        # nothing to schedule and nothing to count.
        return 0
    workers = min(MAX_workers, len(cc_list))  # number of worker threads
    with futures.ThreadPoolExecutor(workers) as executor:
        # download_one is called concurrently from several threads; map()
        # returns a generator that yields the results in submission order.
        res = executor.map(download_one, sorted(cc_list))
    return len(list(res))

if __name__ == '__main__':
    main(download_many)

BD  DE  BRID  EG    CN  FR  RU  NG  TR  VN  IN  JP  ET  PH  CD  MX  US  IR  PK  
20 flags downloaded in 0.63s


### 17.1.3 期物在哪里

In [11]:
#flags_threadpool_ac.py
from concurrent import futures
MAX_workers = 20
def download_one(cc):
    """Download one flag: fetch it, echo the code, store the image.

    Returns the country code it was given, so the caller can tell which
    download a finished future belongs to.
    """
    payload = get_flag(cc)
    show(cc)
    save_flag(payload, '{}.gif'.format(cc.lower()))
    return cc

def download_many(cc_list):
    """Show futures at work: submit five downloads and report each one.

    Only the first five codes of ``cc_list`` are used and the pool has three
    threads. Every future is printed when it is scheduled and again, with its
    result, when it completes.

    Returns the number of results collected.
    """
    cc_list = cc_list[:5]
    scheduled_msg = 'Scheduled for {}:{}'
    result_msg = '{} result:{!r}'
    with futures.ThreadPoolExecutor(max_workers=3) as executor:
        pending = []
        for code in sorted(cc_list):
            # submit() schedules the callable and immediately hands back a
            # future representing the pending operation.
            scheduled = executor.submit(download_one, code)
            # Keep every future so it can be passed to as_completed below.
            pending.append(scheduled)
            print(scheduled_msg.format(code, scheduled))

        collected = []
        # as_completed yields each future as soon as it has finished.
        for finished in futures.as_completed(pending):
            outcome = finished.result()
            print(result_msg.format(finished, outcome))
            collected.append(outcome)
    return len(collected)

if __name__ == '__main__':
    main(download_many)

Scheduled for BR:<Future at 0x1c3353eb0b8 state=running>
Scheduled for CN:<Future at 0x1c3353645f8 state=running>
Scheduled for ID:<Future at 0x1c3353fa940 state=running>
Scheduled for IN:<Future at 0x1c3353fa860 state=pending>
Scheduled for US:<Future at 0x1c3353fa128 state=pending>
BR  <Future at 0x1c3353eb0b8 state=finished returned str> result:'BR'
ID  <Future at 0x1c3353fa940 state=finished returned str> result:'ID'
CN  <Future at 0x1c3353645f8 state=finished returned str> result:'CN'
IN  <Future at 0x1c3353fa860 state=finished returned str> result:'IN'
US  <Future at 0x1c3353fa128 state=finished returned str> result:'US'

5 flags downloaded in 0.95s


严格来说，我们目前测试的并发脚本都不能并行下载。使用concurrent.futures库实现的那两个示例受GIL（Global Interpreter Lock，全局解释器锁）的限制。
故有此疑问：既然Python线程受GIL限制，任何时候都只允许运行一个线程，那么flags_threadpool.py脚本的下载速度为什么会比flags.py脚本快5倍？

## 17.2 阻塞型I/O和GIL

![chapter17-2](images/chapter17-2.jpg)

## 17.3 使用concurrent.futures模块启动进程

concurrent.futures模块可以使用ProcessPoolExecutor类把工作分配给多个python进程处理。因此，如果需要CPU密集型处理，使用这个模块能绕开GIL，利用所有可用的cpu核心。
下载国旗的示例或其他I/O密集型作业使用ProcessPoolExecutor类得不到任何好处。

In [None]:
from concurrent import futures
def download_one(cc):
    """Fetch the flag for ``cc``, print the code and write the GIF to disk.

    Returns ``cc`` so the executor's results identify what was downloaded.
    """
    gif_data = get_flag(cc)
    show(cc)
    gif_name = '{}.gif'.format(cc.lower())
    save_flag(gif_data, gif_name)
    return cc

def download_many(cc_list):
    """Download every flag in ``cc_list`` using a pool of worker processes.

    Returns the number of results produced by the pool.
    """
    ordered_codes = sorted(cc_list)
    # max_workers is left unset, so ProcessPoolExecutor picks its default
    # based on the machine's CPU count.
    with futures.ProcessPoolExecutor() as pool:
        # download_one runs in the pool's worker processes (not threads);
        # map() returns a generator over the results.
        outcomes = pool.map(download_one, ordered_codes)
    return sum(1 for _ in outcomes)

if __name__ == '__main__':
    main(download_many)

![chapter17-3](images/chapter17-3.png)

可见，ProcessPoolExecutor类在I/O密集型作业中不占优。它的价值主要体现在cpu密集型作业上。

## 17.4 使用executor.map方法

In [14]:
#demo_executor_map.py
from time import sleep,strftime
from concurrent import futures

def display(*args):
    """Print ``args`` on one line, prefixed with a ``[HH:MM:SS]`` timestamp.

    The timestamp and the arguments are written by two separate ``print``
    calls, joined by a single space.
    """
    stamp = strftime('[%H:%M:%S]')
    print(stamp, end=' ')
    print(*args)

def loiter(n):
    """Sleep for ``n`` seconds, reporting before and after, then return ``n * 10``.

    Both messages are indented with ``n`` tab characters so the output of
    different calls lines up in separate columns.
    """
    indent = '\t' * n
    display('{}loiter({}):doing nothing for {}s...'.format(indent, n, n))
    sleep(n)
    display('{}loiter({}): done.'.format(indent, n))
    return n * 10

def main():
    """Run five ``loiter`` tasks on a three-thread pool and print the results.

    The results are shown in the order the tasks were submitted, each one as
    soon as the iterator returned by ``map`` yields it.
    """
    display('Script starting.')
    # A pool with three worker threads.
    pool = futures.ThreadPoolExecutor(max_workers=3)
    # Hand five tasks to the pool. With only three threads, loiter(0),
    # loiter(1) and loiter(2) start right away. This call does not block.
    outcomes = pool.map(loiter, range(5))
    display('results:', outcomes)
    display('Waiting for individual results:')
    for numbered in enumerate(outcomes):
        display('result {}: {}'.format(*numbered))
main()

[10:48:45] Script starting.
[10:48:45] loiter(0):doing nothing for 0s...
[10:48:45] 	loiter(1):doing nothing for 1s...
[10:48:45] loiter(0): done.
[10:48:45] 		loiter(2):doing nothing for 2s...
[10:48:45][10:48:45]  			loiter(3):doing nothing for 3s...
results: <generator object Executor.map.<locals>.result_iterator at 0x000001C33584B2B0>
[10:48:45] Waiting for individual results:
[10:48:45] result 0: 0
[10:48:46] 	loiter(1): done.
[10:48:46] 				loiter(4):doing nothing for 4s...
[10:48:46] result 1: 10
[10:48:47] 		loiter(2): done.
[10:48:47] result 2: 20
[10:48:48] 			loiter(3): done.
[10:48:48] result 3: 30
[10:48:50] 				loiter(4): done.
[10:48:50] result 4: 40


## 17.5 显示下载进度并处理错误

### 17.5.1 flags2系列示例处理错误的方式

In [15]:
#flags2_common.py

import os
import time
import sys
import string
import argparse
from collections import namedtuple
from enum import Enum


Result = namedtuple('Result', 'status data')

HTTPStatus = Enum('Status', 'ok not_found error')

POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
            'MX PH VN ET EG DE IR TR CD FR').split()

DEFAULT_CONCUR_REQ = 1
MAX_CONCUR_REQ = 1

SERVERS = {
    'REMOTE': 'http://flupy.org/data/flags',
    'LOCAL':  'http://localhost:8001/flags',
    'DELAY':  'http://localhost:8002/flags',
    'ERROR':  'http://localhost:8003/flags',
}
DEFAULT_SERVER = 'REMOTE'

DEST_DIR = 'downloads/'
COUNTRY_CODES_FILE = 'country_codes.txt'


def save_flag(img, filename):
    """Write the bytes ``img`` to ``filename`` inside ``DEST_DIR``.

    The destination directory is created on demand, so a first run on a
    machine where it does not exist yet no longer fails with
    ``FileNotFoundError``.
    """
    # exist_ok=True keeps this safe when several threads save flags at once:
    # whichever gets there first creates the directory, the rest are no-ops.
    os.makedirs(DEST_DIR, exist_ok=True)
    path = os.path.join(DEST_DIR, filename)
    with open(path, 'wb') as fp:
        fp.write(img)


def initial_report(cc_list, actual_req, server_label):
    """Print a three-line summary of the download about to start.

    The lines give the server label with its URL from ``SERVERS``, the
    number of flags being searched for, and how many concurrent connections
    will be used. Up to ten codes are listed individually; longer lists are
    summarised as "from <first> to <last>".
    """
    total = len(cc_list)
    if total > 10:
        cc_msg = 'from {} to {}'.format(cc_list[0], cc_list[-1])
    else:
        cc_msg = ', '.join(cc_list)
    print('{} site: {}'.format(server_label, SERVERS[server_label]))

    flag_suffix = '' if total == 1 else 's'
    print('Searching for {} flag{}: {}'.format(total, flag_suffix, cc_msg))

    conn_suffix = '' if actual_req == 1 else 's'
    print('{} concurrent connection{} will be used.'
          .format(actual_req, conn_suffix))


def final_report(cc_list, counter, start_time):
    """Print the closing summary: per-status totals and the elapsed time.

    ``counter`` maps ``HTTPStatus`` members to counts. The "not found" and
    "error" lines are printed only when their count is non-zero.
    ``cc_list`` is accepted but not used by this function.
    """
    elapsed = time.time() - start_time
    print('-' * 20)

    ok_count = counter[HTTPStatus.ok]
    ok_suffix = '' if ok_count == 1 else 's'
    print('{} flag{} downloaded.'.format(ok_count, ok_suffix))

    missing = counter[HTTPStatus.not_found]
    if missing:
        print(missing, 'not found.')

    failed = counter[HTTPStatus.error]
    if failed:
        failed_suffix = '' if failed == 1 else 's'
        print('{} error{}.'.format(failed, failed_suffix))

    print('Elapsed time: {:.2f}s'.format(elapsed))


def expand_cc_args(every_cc, all_cc, cc_args, limit):
    """Turn the command-line selection into a sorted list of country codes.

    Exactly one source is used, in this order of precedence:

    * ``every_cc``: every two-letter combination from AA to ZZ;
    * ``all_cc``: the whitespace-separated codes in ``COUNTRY_CODES_FILE``;
    * otherwise ``cc_args``, upper-cased: a single letter expands to all 26
      codes starting with it, and a two-letter code is taken as is.

    Returns at most ``limit`` codes, in sorted order.

    Raises ``ValueError`` when an entry of ``cc_args`` is neither one nor
    two ASCII letters.
    """
    letters = string.ascii_uppercase
    if every_cc:
        found = {first + second for first in letters for second in letters}
    elif all_cc:
        with open(COUNTRY_CODES_FILE) as fp:
            found = set(fp.read().split())
    else:
        found = set()
        for raw in cc_args:
            code = raw.upper()
            if len(code) == 1 and code in letters:
                found.update(code + second for second in letters)
                continue
            if len(code) == 2 and all(ch in letters for ch in code):
                found.add(code)
                continue
            msg = 'each CC argument must be A to Z or AA to ZZ.'
            raise ValueError('*** Usage error: '+msg)
    return sorted(found)[:limit]


def process_args(default_concur_req):
    """Parse and validate the command line of the flags2 scripts.

    ``default_concur_req`` is the default for ``--max_req`` and is shown in
    that option's help text.

    Returns ``(args, cc_list)``: the parsed namespace, with ``args.server``
    upper-cased, and the sorted country codes to download. When the
    selection is empty, the sorted ``POP20_CC`` list is used.

    On an invalid value the error is printed, followed by the usage line,
    and the process exits with status 1.
    """
    server_options = ', '.join(sorted(SERVERS))
    max_req_help = ('maximum concurrent requests (default={})'
                    .format(default_concur_req))
    server_help = ('Server to hit; one of {} (default={})'
                   .format(server_options, DEFAULT_SERVER))

    parser = argparse.ArgumentParser(
        description='Download flags for country codes. '
                    'Default: top 20 countries by population.')
    add = parser.add_argument
    add('cc', metavar='CC', nargs='*',
        help='country code or 1st letter (eg. B for BA...BZ)')
    add('-a', '--all', action='store_true',
        help='get all available flags (AD to ZW)')
    add('-e', '--every', action='store_true',
        help='get flags for every possible code (AA...ZZ)')
    add('-l', '--limit', metavar='N', type=int,
        help='limit to N first codes', default=sys.maxsize)
    add('-m', '--max_req', metavar='CONCURRENT', type=int,
        default=default_concur_req, help=max_req_help)
    add('-s', '--server', metavar='LABEL',
        default=DEFAULT_SERVER, help=server_help)
    add('-v', '--verbose', action='store_true',
        help='output detailed progress info')
    args = parser.parse_args()

    def usage_error(*message):
        # Shared exit path: show the problem, then the usage line, then stop.
        print(*message)
        parser.print_usage()
        sys.exit(1)

    if args.max_req < 1:
        usage_error('*** Usage error: --max_req CONCURRENT must be >= 1')
    if args.limit < 1:
        usage_error('*** Usage error: --limit N must be >= 1')

    # Server labels are matched case-insensitively.
    args.server = args.server.upper()
    if args.server not in SERVERS:
        usage_error('*** Usage error: --server LABEL must be one of',
                    server_options)

    try:
        cc_list = expand_cc_args(args.every, args.all, args.cc, args.limit)
    except ValueError as exc:
        usage_error(exc.args[0])

    return args, (cc_list or sorted(POP20_CC))


def main(download_many, default_concur_req, max_concur_req):
    """Entry point shared by the flags2 scripts.

    Parses the command line, prints the initial report, times a call to
    ``download_many(cc_list, base_url, verbose, actual_req)`` and prints the
    final report from the counter it returns.

    The number of concurrent requests is the smallest of the requested
    value, ``max_concur_req`` and the number of flags to download.
    """
    args, cc_list = process_args(default_concur_req)
    actual_req = min(args.max_req, max_concur_req, len(cc_list))
    initial_report(cc_list, actual_req, args.server)

    server_url = SERVERS[args.server]
    started = time.time()
    tally = download_many(cc_list, server_url, args.verbose, actual_req)

    # Every requested code must end up under exactly one status.
    assert sum(tally.values()) == len(cc_list), \
        'some downloads are unaccounted for'
    final_report(cc_list, tally, started)

In [2]:
#flags2_sequential.py:负责下载的基本函数
import requests
import collections
import tqdm


def get_flag(base_url, cc):
    """Fetch the flag GIF for ``cc`` from ``base_url``.

    Returns the body of the response. For any status other than 200,
    ``raise_for_status()`` is called first, so error statuses surface as
    ``requests.exceptions.HTTPError`` instead of being saved as an image.
    """
    flag_url = '{}/{cc}/{cc}.gif'.format(base_url, cc=cc.lower())
    response = requests.get(flag_url)
    if response.status_code == 200:
        return response.content
    response.raise_for_status()
    return response.content

def download_one(cc, base_url, verbose=False):
    """Download one flag and classify the outcome.

    A successful download is saved to disk and reported as
    ``HTTPStatus.ok``. An HTTP 404 is reported as ``HTTPStatus.not_found``.
    Any other ``HTTPError`` is re-raised for the caller to handle.

    With ``verbose`` set, the code and its outcome are printed.

    Returns ``Result(status, cc)``.
    """
    try:
        img = get_flag(base_url, cc)
    except requests.exceptions.HTTPError as exc:
        if exc.response.status_code != 404:
            raise
        status, msg = HTTPStatus.not_found, 'not found'
    else:
        save_flag(img, cc.lower() + '.gif')
        status, msg = HTTPStatus.ok, 'ok'

    if verbose:
        print(cc, msg)
    return Result(status, cc)

def download_many(cc_list, base_url, verbose, max_req):
    """Download the flags for ``cc_list`` one after another and tally results.

    Codes are processed in sorted order. Without ``verbose`` a tqdm progress
    bar is shown. With ``verbose``, each failure is printed as
    ``*** Error for <cc>: <reason>``.

    ``max_req`` is not used by this sequential version; it is kept so the
    signature matches the call made by ``main``.

    Returns a ``collections.Counter`` keyed by ``HTTPStatus`` member. HTTP
    and connection errors are counted under ``HTTPStatus.error``.
    """
    counter = collections.Counter()
    cc_iter = sorted(cc_list)
    if not verbose:
        # Wrap the iterable so tqdm draws a progress bar as the loop advances.
        cc_iter = tqdm.tqdm(cc_iter)
    for cc in cc_iter:
        try:
            res = download_one(cc, base_url, verbose)
        except requests.exceptions.HTTPError as exc:
            error_msg = 'HTTP error {res.status_code}-{res.reason}'
            error_msg = error_msg.format(res=exc.response)
        except requests.exceptions.ConnectionError:
            error_msg = 'Connection Error'
        else:
            error_msg = ''
            status = res.status
        if error_msg:
            status = HTTPStatus.error
        counter[status] += 1
        if verbose and error_msg:
            # Fixed the 'Errir' typo in the message shown to the user.
            print('*** Error for {}: {}'.format(cc, error_msg))
    return counter
if __name__ == '__main__':
    main(download_many, DEFAULT_CONCUR_REQ, MAX_CONCUR_REQ)

### 17.5.2 使用futures.as_completed函数

In [None]:
"""Download flags of countries (with error handling).

ThreadPool version

Sample run::

    $ python3 flags2_threadpool.py -s ERROR -e
    ERROR site: http://localhost:8003/flags
    Searching for 676 flags: from AA to ZZ
    30 concurrent connections will be used.
    --------------------
    150 flags downloaded.
    361 not found.
    165 errors.
    Elapsed time: 7.46s

"""

# BEGIN FLAGS2_THREADPOOL
import collections
from concurrent import futures

import requests
import tqdm  # <1>

from flags2_common import main, HTTPStatus  # <2>
from flags2_sequential import download_one  # <3>

DEFAULT_CONCUR_REQ = 30  # <4>
MAX_CONCUR_REQ = 1000  # <5>


def download_many(cc_list, base_url, verbose, concur_req):
    """Download the flags for ``cc_list`` with a thread pool and tally results.

    One ``download_one`` task is submitted per code, in sorted order, to a
    pool of ``concur_req`` threads. Results are consumed as tasks complete.
    Without ``verbose`` a tqdm progress bar is shown; with it, each failure
    is printed together with its country code.

    Returns a ``collections.Counter`` keyed by ``HTTPStatus`` member. HTTP
    and connection errors are counted under ``HTTPStatus.error``.
    """
    tally = collections.Counter()
    with futures.ThreadPoolExecutor(max_workers=concur_req) as executor:
        # Map each future to its country code, so a failure can be reported
        # by code later; the future alone does not say which one it was.
        cc_by_future = {
            executor.submit(download_one, code, base_url, verbose): code
            for code in sorted(cc_list)
        }
        finished = futures.as_completed(cc_by_future)
        if not verbose:
            # as_completed returns an iterator with no length, so tqdm needs
            # the expected total passed explicitly.
            finished = tqdm.tqdm(finished, total=len(cc_list))
        for future in finished:
            try:
                # The future is already done here; result() returns the value
                # or re-raises the exception raised inside the worker.
                outcome = future.result()
            except requests.exceptions.HTTPError as exc:
                error_msg = ('HTTP {res.status_code} - {res.reason}'
                             .format(res=exc.response))
            except requests.exceptions.ConnectionError:
                error_msg = 'Connection error'
            else:
                error_msg = ''
                status = outcome.status

            if error_msg:
                status = HTTPStatus.error
            tally[status] += 1
            if verbose and error_msg:
                failed_cc = cc_by_future[future]
                print('*** Error for {}: {}'.format(failed_cc, error_msg))

    return tally


if __name__ == '__main__':
    main(download_many, DEFAULT_CONCUR_REQ, MAX_CONCUR_REQ)
# END FLAGS2_THREADPOOL