# 1. 确定当前使用的Python版本

In [1]:
import sys
print(sys.version_info)
print(sys.version)

sys.version_info(major=3, minor=6, micro=4, releaselevel='final', serial=0)
3.6.4 (v3.6.4:d48eceb, Dec 19 2017, 06:54:40) [MSC v.1900 64 bit (AMD64)]


# 3. 了解bytes, str与Unicode的区别

In [3]:
# 接受str或bytes，并总是返回str的方法
def to_str(bytes_or_str):
    """Return ``bytes_or_str`` as text.

    A ``bytes`` argument is decoded as UTF-8; any other argument is
    returned unchanged.
    """
    if not isinstance(bytes_or_str, bytes):
        return bytes_or_str
    return bytes_or_str.decode('utf-8')    # instance of str

# 接受str或bytes，并总是返回bytes的方法
def to_byte(bytes_or_str):
    """Return ``bytes_or_str`` as bytes.

    A ``str`` argument is encoded as UTF-8; any other argument is
    returned unchanged.
    """
    needs_encoding = isinstance(bytes_or_str, str)
    return bytes_or_str.encode('utf-8') if needs_encoding else bytes_or_str

In [None]:
# Build a 64-bit integer by flipping a coin for every bit position.
# randint must be imported here: nothing earlier in the notebook brings it
# into scope, so the cell would otherwise fail with NameError.
from random import randint

random_bits = 0
for i in range(64):
    if randint(0, 1):
        random_bits |= 1 << i    # set bit i when the coin comes up 1
random_bits

# 36. 用subprocess模块来管理子进程

In [2]:
# 用Popen构造器来启动进程。然后用communicate方法读取子进程的输出信息，并等待终止
import subprocess
proc = subprocess.Popen(['echo', 'Hello from the child'], stdout=subprocess.PIPE)

In [3]:
proc

<subprocess.Popen at 0x1e8b0a1f470>

In [4]:
out, err = proc.communicate()
out, err

(b'Hello from the child\n', None)

In [5]:
print(out.decode('utf-8'))

Hello from the child



In [None]:
# 一边定期查询子进程的状态，一边处理其他事务
proc = subprocess.Popen(['sleep', '0.01'])
while proc.poll() is None:
    print('Working...')
    
print('Exit status', proc.poll())

In [10]:
# 把子进程从父进程中剥离(decouple， 解耦)，意味着父进程可以随意运行很多条平行的子进程。
from time import time

def run_sleep(period):
    """Start an external ``sleep`` command and return its Popen handle.

    ``period`` is converted with ``str()`` and passed as the command's only
    argument. The child is not waited for here; the caller decides when to
    collect it.
    """
    command = ['sleep', str(period)]
    return subprocess.Popen(command)

start = time()
procs = []
for _ in range(10):
    proc = run_sleep(0.1)
    procs.append(proc)
    
for proc in procs:
    proc.communicate()
end = time()
print('Finished in %.3f seconds' % (end - start))

Finished in 0.275 seconds


In [None]:
import os
os.environ

> 可以用subprocess模块运行子进程，并管理其输入流与输出流

# 37. 可以用线程来执行阻塞式I/O，但不要用它做平行计算
标准的Python实现叫做CPython。CPython分两步来运行Python程序。首先，把文本形式的源代码解析并编译成字节码。然后，用一种基于栈的解释器来运行这份字节码。执行Python程序时，字节码解释器必须保持协调一致的状态。Python采用GIL机制来确保这种协调性。

GIL实际上就是一把互斥锁(mutual-exclusion lock, mutex)，用以防止CPython受到抢占式多线程切换(preemptive multithreading)操作的干扰。

GIL有一个显著的负面影响：尽管Python支持多线程，但由于受到GIL的保护，同一时刻只有一条线程可以向前执行。这意味着，无法使用多线程做平行计算(parallel computation)。

In [13]:
# 0
def factorize(number):
    """Yield every divisor of ``number`` from 1 up to ``number``, ascending.

    Every candidate in ``range(1, number + 1)`` is tried in turn, so the
    work grows linearly with ``number``.
    """
    yield from (
        candidate
        for candidate in range(1, number + 1)
        if number % candidate == 0
    )
            
numbers = [2134134, 2112345, 234534, 654647]
start = time()
for number in numbers:
    list(factorize(number))
end = time()
print('Took %.3f second' % (end-start))

Took 0.696 second


In [15]:
# 1
from threading import Thread

class FactorizeThread(Thread):
    """Thread that factorizes one number and keeps the result on itself."""

    def __init__(self, number):
        """Remember ``number``; no work is done until the thread is started."""
        super().__init__()
        self.number = number

    def run(self):
        """Collect all values produced by ``factorize`` into ``self.factors``."""
        self.factors = [*factorize(self.number)]
        
start = time()
threads = []
for number in numbers:
    thread = FactorizeThread(number)
    thread.start()
    threads.append(thread)

for thread in threads:
    thread.join()
    
end = time()
print('Took %.3f seconds' % (end-start))

Took 0.707 seconds


In [None]:
# 使用系统调用select模拟I/O操作
import select

def slow_systemcall():
    """Block the calling thread for about 0.1 seconds to emulate slow I/O.

    ``select.select`` with a timeout and nothing to watch is used as the
    blocking system call. Not every platform accepts three empty lists
    (the Python docs single out Windows), so a plain sleep of the same
    length is used when the call is rejected.
    """
    try:
        select.select([], [], [], 0.1)
    except OSError:
        # Imported locally so the fallback does not depend on what the
        # notebook has bound to the name ``time``.
        from time import sleep
        sleep(0.1)
    
start = time()
for _ in range(5):
    slow_systemcall()
end = time()
print('Took %.3f seconds' % (end-start))


# 38 在线程中使用Lock来防止数据竞争
GIL并不会保护开发者自己编写的代码。Python解释器在执行两个连续的字节码指令时，其他线程可能会在中途突然插进来。
为了防止诸如此类的数据竞争(data race)行为，Python在内置threading模块里提供了一套健壮的工具，包括Lock类，该类相当于互斥锁。
我们可以用互斥锁保护**数据**对象，使得多个线程同时访问该**数据**时，不会将其破坏。同一时刻，只有一个线程能够获得锁。

In [22]:
class Counter:
    """Running total that is updated without any synchronization."""

    def __init__(self):
        """Start the total at zero."""
        self.count = 0

    def increment(self, offset):
        """Add ``offset`` to the total (a plain read-modify-write, no lock)."""
        self.count = self.count + offset
        
def worker(sensor_index, how_many, counter):
    """Call ``counter.increment(1)`` exactly ``how_many`` times.

    ``sensor_index`` is accepted so the signature matches what
    ``run_threads`` passes in; it is not used in the body.
    """
    step = 1
    for _ in range(how_many):
        counter.increment(step)

def run_threads(func, how_many, counter, num_threads=5):
    """Run ``func`` on several threads at once and wait for all of them.

    Each thread calls ``func(i, how_many, counter)`` where ``i`` is the
    thread's index, starting at 0.

    Args:
        func: Callable taking ``(index, how_many, counter)``.
        how_many: Passed through to ``func`` unchanged.
        counter: Object shared by every thread; passed through unchanged.
        num_threads: Number of threads to start. Defaults to 5, the value
            that used to be hard-coded, so existing callers are unaffected.
    """
    threads = []
    for i in range(num_threads):
        thread = Thread(target=func, args=(i, how_many, counter))
        threads.append(thread)
        thread.start()
    # Join only after every thread has been started so they overlap.
    for thread in threads:
        thread.join()

from threading import Lock
class LockingCounter:
    """Running total whose updates are serialized by a ``Lock``."""

    def __init__(self):
        """Create the lock and start the total at zero."""
        self.lock = Lock()
        self.count = 0

    def increment(self, offset):
        """Add ``offset`` to the total while holding ``self.lock``."""
        with self.lock:
            self.count = self.count + offset

In [23]:
# 平行地执行这5条线程
# 1. 不加锁
how_many = 10**5
counter = Counter()
run_threads(worker, how_many, counter)
print('Counter shoud be %d, found %d' % (5*how_many, counter.count))

Counter shoud be 500000, found 299072


In [24]:
# 2. 加锁
lock_counter = LockingCounter()
run_threads(worker, how_many, lock_counter)
print('Counter shoud be %d, found %d' % (5*how_many, lock_counter.count))

Counter shoud be 500000, found 500000


# 39. 用Queue来协调各线程之间的工作
如果Python程序同时要执行许多事务，那么开发者经常需要协调这些事务。而在各种协调方式中，较为高效的一种，则是采用函数管线(pipeline)。

In [25]:
# Queue类使得工作线程无需再频繁地查询输入队列的状态，它的get方法会持续阻塞，直到有新的数据加入。
from queue import Queue
queue = Queue()

def consumer():
    """Announce the wait, block until the module-level ``queue`` yields an
    item, then announce completion. The retrieved item is discarded."""
    print('Consumer waiting')
    queue.get(block=True)    # same as the default: wait until an item arrives
    print('Consumer done')

thread = Thread(target=consumer)
thread

<Thread(Thread-31, initial)>

In [26]:
thread.start()

Consumer waiting


In [27]:
queue.put(object())

Consumer done


In [28]:
thread

<Thread(Thread-31, stopped 14780)>

# 40. 考虑用协程来并发地运行多个函数
用python线程来实现并发，有三个显著的缺点
1. 为了确保数据安全，需要工具协调这些线程，加锁或者通信
2. 线程需要占用大量内存，每个正在执行的线程，大约占据8MB内存
3. 线程启动时的开销比较大。
python的协程(coroutine)可以避免上述问题。协程的实现方式，实际上是对生成器的一种扩展。
协程的工作原理：
每当生成器函数执行到yield表达式的时候，消耗生成器的那段代码，就通过send方法给生成器回传一个值。而生成器在收到了经由send函数所传进来的这个值之后，会将其视为yield表达式的执行结果。

In [29]:
def my_coroutine():
    """Generator-based coroutine that prints every value sent to it.

    Prints 'gen' when first advanced, then loops forever: each bare
    ``yield`` hands ``None`` back to the caller and evaluates to whatever
    the caller passes to ``send``.
    """
    print('gen')
    while True:
        print('Recieved:', (yield))
it = my_coroutine()
it

<generator object my_coroutine at 0x000001E8B0BD9308>

In [32]:
next(it)

gen


In [33]:
it.send('hello')

Recieved: hello


In [34]:
it.send('world')

Recieved: world


In [42]:
def minimize():
    """Coroutine that yields the smallest value it has been sent so far.

    After priming with ``next()``, the first ``send`` supplies the initial
    minimum; every ``send`` (including the first) returns the minimum of
    all values received up to that point.
    """
    print('gen')
    smallest = yield
    while True:
        # The yield hands out the current minimum and evaluates to the next
        # value sent in, which is compared against that minimum.
        smallest = min((yield smallest), smallest)
        
it = minimize()
next(it)

gen


In [43]:
it.send(10)

10

In [44]:
it.send(34)

10

In [45]:
it.send(3)

3

# 41. 考虑用concurrent.futures来实现真正的平行计算
concurrent.futures模块借助内置的multiprocessing模块，会以子进程的形式，平行地运行多个解释器，从而令Python程序能够利用多核心CPU来提升执行速度。由于子进程与主解释器相分离，所以它们的全局解释器锁也是相互独立的。每个子进程都可以完整地利用一个CPU内核，而且这些子进程，都与主进程之间有联系，通过这条联系渠道，子进程可以接收主进程发过来的指令，并把计算结果返回给主进程。

In [56]:
from functools import wraps

def timer(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's arguments and return value pass through
    untouched, and ``functools.wraps`` keeps its metadata. Nothing is
    printed when ``func`` raises, because the exception propagates before
    the report line is reached.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        began = time()
        outcome = func(*args, **kwargs)
        print('func:', func.__name__, ' used time:', time()-began)
        return outcome

    return wrapper

# 查找两数最大公约数的算法
@timer
def gcd(pair):
    """Return the greatest common divisor of the two integers in ``pair``.

    Candidates are tried from ``min(a, b)`` downwards and the first one
    dividing both numbers wins. Returns ``None`` when there is no positive
    candidate to try (the smaller number is 0 or negative).
    """
    a, b = pair
    upper = min(a, b)
    return next(
        (d for d in range(upper, 0, -1) if a % d == 0 and b % d == 0),
        None,
    )

In [None]:
numbers = [(12341324, 123415), (234139898, 1234997)]
results = list(map(gcd, numbers))
results

In [63]:
# 使用ThreadPoolExecutor类及两个工作线程来实现
from concurrent.futures import ThreadPoolExecutor
start = time()
# max_workers is the number of worker threads; it should match the number of
# CPU cores. The with-block shuts the pool down once the results are in, so
# its worker threads are not left behind after the cell finishes.
with ThreadPoolExecutor(max_workers=2) as pool:
    results = list(pool.map(gcd, numbers))
print('Took %.3f seconds' % (time()-start))

func: gcd  used time: 0.01999831199645996
func: gcd  used time: 0.17899656295776367
Took 0.208 seconds


In [None]:
# 使用ProcessPoolExecutor类, won't work in jupyter
# It is now mentioned in the docs for ProcessPoolExecutor that it will not work in interactive consoles. 
# It says The main module must be importable by worker subprocesses. 
# This means that ProcessPoolExecutor will not work in the interactive interpreter
from concurrent.futures import ProcessPoolExecutor
start = time()
# No max_workers given: the pool sizes itself from the machine's processor
# count. The workers are processes, not threads. The with-block shuts the
# pool down once the results are in, so no worker processes are left behind.
with ProcessPoolExecutor() as pool:
    results = list(pool.map(gcd, numbers))
print('Took %.3f seconds' % (time()-start))

对于某些较为孤立，且数据利用率较高的应用，上面的方案合适。否则multiprocessing所产生的开销，可能使我们无法通过parallelization来提升程序速度。

# 45. 应该用datetime模块来处理本地时间，而不是用time模块

In [65]:
# time
now = time()
now

1583930133.6037855

In [67]:
from time import localtime
local_tuple = localtime(now)

In [68]:
local_tuple

time.struct_time(tm_year=2020, tm_mon=3, tm_mday=11, tm_hour=20, tm_min=35, tm_sec=33, tm_wday=2, tm_yday=71, tm_isdst=0)

In [70]:
from time import strftime
time_format = '%Y-%m-%d %H:%M:%S'
time_str = strftime(time_format, local_tuple)
time_str

'2020-03-11 20:35:33'

In [71]:
from time import mktime, strptime
time_tuple = strptime(time_str, time_format)
time_tuple

time.struct_time(tm_year=2020, tm_mon=3, tm_mday=11, tm_hour=20, tm_min=35, tm_sec=33, tm_wday=2, tm_yday=71, tm_isdst=-1)

In [72]:
utc_now = mktime(time_tuple)

In [73]:
utc_now

1583930133.0

In [74]:
# datetime
time_str

'2020-03-11 20:35:33'

In [76]:
import datetime
now = datetime.datetime.strptime(time_str, time_format)
now

datetime.datetime(2020, 3, 11, 20, 35, 33)

In [77]:
time_tuple = now.timetuple()
time_tuple

time.struct_time(tm_year=2020, tm_mon=3, tm_mday=11, tm_hour=20, tm_min=35, tm_sec=33, tm_wday=2, tm_yday=71, tm_isdst=-1)

In [78]:
utc_now = mktime(time_tuple)
utc_now

1583930133.0

# 46. 使用内置算法与数据结构

In [87]:
# 双向队列
from collections import deque
queue = deque()
queue.append(1)
queue

deque([1])

In [88]:
queue.append(2)
queue

deque([1, 2])

In [89]:
queue.popleft()

1

In [90]:
# 有序字典
from collections import OrderedDict
a = OrderedDict()
a['foo'] = 1
a['bar'] = 2
b = OrderedDict()
b['foo'] = 'red'
b['bar'] = 'blue'
for value1, value2 in zip(a.values(), b.values()):
    print(value1, value2)

1 red
2 blue


In [91]:
# 带有默认值的字典
from collections import defaultdict
stats = defaultdict(int)
stats['my_counter'] += 1

In [92]:
stats

defaultdict(int, {'my_counter': 1})

In [95]:
# 堆队列，优先队列
import heapq
a = []
for value in (5, 3, 7):
    heapq.heappush(a, value)
# Drain the heap; items come out smallest first. Looping on the heap itself
# avoids the off-by-one of range(len(a)+1), which popped once too often.
while a:
    print(heapq.heappop(a))
# Popping from an empty heap raises IndexError. Report it explicitly instead
# of letting the exception abort the cell.
try:
    heapq.heappop(a)
except IndexError as exc:
    print('IndexError:', exc)

3
5
7


IndexError: index out of range

In [96]:
# 二分查找
import bisect
# 在list上使用index方法来搜索元素，线性复杂度
x = list(range(10**2))
i = x.index(99)
i


99

In [98]:
i = bisect.bisect_left(x, 99)   # O(logn)
i

99

In [101]:
i = bisect.bisect_right(x, 99)  # bisect.bisect() is an alias for bisect_right, not bisect_left
i

100