# 11 多线程，多进程，线程池

## 11.1 GIL

In [1]:
# GIL: global interpreter lock (CPython).
# A thread in Python corresponds to one thread at the C level.
# The GIL allows only one thread at a time to execute bytecode on one CPU, so threads cannot be mapped onto multiple CPUs.
import dis
def add(a):  # small function whose bytecode is disassembled below
    a = a + 1
    return a
print(dis.dis(add))  # dis.dis() writes the listing itself and returns None, so "None" is printed after it

  6           0 LOAD_FAST                0 (a)
              2 LOAD_CONST               1 (1)
              4 BINARY_ADD
              6 STORE_FAST               0 (a)

  7           8 LOAD_FAST                0 (a)
             10 RETURN_VALUE
None


In [2]:
total = 0  # shared counter that both threads modify without any lock
def add():
    """Increment the global ``total`` one million times."""
    global total
    for i in range(1000000):
        total += 1  # unlocked read-modify-write on the shared global

def desc():
    """Decrement the global ``total`` one million times."""
    global total
    for i in range(1000000):
        total -= 1  # unlocked read-modify-write on the shared global

import threading
threading1 = threading.Thread(target=add)
threading2 = threading.Thread(target=desc)
threading1.start()
threading2.start()

threading1.join()
threading2.join()  # wait for both threads to finish
print(total)  # the recorded run printed -33888 rather than 0: the two loops interfered

-33888


GIL 会在线程遇到 IO 操作或者时间片消耗完时被释放，因此线程可能在 `total += 1` 对应的多条字节码之间被切换——这就是上面的结果不为 0 的原因。

## 11.2 多线程编程

In [3]:
# 线程是操作系统能切换调度的最小单元
# 线程依赖于进程
# 对于多IO操作来说，多线程与多进程性能差别不大
import time

def get_detail_html(url):
    """Pretend to download one article detail page.

    ``url`` is accepted but never used: the download is simulated by a
    two-second sleep bracketed by a start and an end message.
    """
    pause_seconds = 2  # stand-in for the time a real request would take
    print("get detail html started")
    time.sleep(pause_seconds)
    print("get detail html end")

def get_detail_url(url):
    """Pretend to download the article list page.

    ``url`` is accepted but never used: the download is simulated by a
    two-second sleep bracketed by a start and an end message.
    """
    pause_seconds = 2  # stand-in for the time a real request would take
    print("get detail url started")
    time.sleep(pause_seconds)
    print("get detail url end")

import threading

# Run the two simulated downloads concurrently and time them.
# Both workers are daemon threads, i.e. they are killed once the main thread
# finishes. ``daemon=True`` is passed to the constructor because
# ``Thread.setDaemon()`` has been deprecated since Python 3.10.
threading1 = threading.Thread(target=get_detail_html, args=("",), daemon=True)
threading2 = threading.Thread(target=get_detail_url, args=("",), daemon=True)
start_time = time.time()
threading1.start()
threading2.start()  # three threads now exist: the two workers plus the main thread
threading1.join()
threading2.join()   # join() makes the main thread wait; without it the main thread could finish first
print("last time:{}".format(time.time() - start_time))

get detail html startedget detail url started

get detail html end
get detail url end
last time:2.0044353008270264


In [4]:
# 也可以通过继承Thread实现多线程
class GetDetailHtml(threading.Thread):
    """Thread subclass that simulates downloading an article detail page."""

    def __init__(self, name):
        # Only the thread name is forwarded to threading.Thread.
        super().__init__(name=name)

    def run(self):
        """Print a start message, sleep two seconds, then print an end message."""
        pause_seconds = 2  # stand-in for the time a real request would take
        print("get detail html started")
        time.sleep(pause_seconds)
        print("get detail html end")
        

class GetDetailUrl(threading.Thread):
    """Thread subclass that simulates downloading the article list page."""

    def __init__(self, name):
        # Only the thread name is forwarded to threading.Thread.
        super().__init__(name=name)

    def run(self):
        """Print a start message, sleep two seconds, then print an end message."""
        pause_seconds = 2  # stand-in for the time a real request would take
        print("get detail url started")
        time.sleep(pause_seconds)
        print("get detail url end")

In [5]:
# Run both Thread subclasses concurrently and measure the wall-clock time.
thread1 = GetDetailHtml("get_detail_html")
thread2 = GetDetailUrl("get_detail_url")
start_time = time.time()
thread1.start()
thread2.start()
thread1.join()  # wait for both workers before reading the clock again
thread2.join()
print("last time:{}".format(time.time()-start_time))  # recorded run: about 2.0 s, so the two 2-second sleeps overlapped

get detail html started
get detail url started
get detail html end
get detail url end
last time:2.0055091381073


## 11.3 线程间通信

线程间通信方式：

1、共享变量

In [19]:
detail_list = []  # module-level list handed to the crawler functions as their shared variable


def get_detail_html(detail_list):
    """Simulate fetching the detail page of every URL in ``detail_list``.

    The URLs are handled one after another; each fetch is a two-second sleep
    bracketed by a start and an end message. The URL value itself is not used.
    """
    # Crawl the article detail pages.
    # This design is unreasonable: detail pages are slower to process than
    # list pages, so handling them all in one thread drags down overall speed.
    pause_seconds = 2
    for _page_url in detail_list:
        print("get detail html started")
        time.sleep(pause_seconds)
        print("get detail html end")
    
    
def get_detail_url(detail_list):
    """Simulate crawling the article list page and collect detail-page URLs.

    Every two seconds a batch of 20 generated URLs is appended to
    ``detail_list`` (the caller's list is mutated in place). The function
    returns once the list holds at least 20 entries at the time of the check.
    """
    # Crawl the article list page.
    print("get detail url started")
    while True:
        time.sleep(2)
        for i in range(20):  # parse twenty entries in one go
            detail_list.append("http://projectend.com/{id}".format(id=i))
        # Stop when at least 20 URLs are in the list; otherwise produce another batch.
        if len(detail_list)>19:
            break
    print("get detail url end")

    
def get_detail_html2(detail_list):
    """Consume URLs from a shared list forever, simulating one fetch per URL.

    Intended to run in several worker threads that share ``detail_list``.
    Each iteration removes the last URL from the list, prints it and sleeps
    two seconds to simulate downloading the detail page. This function never
    returns.

    Args:
        detail_list: Mutable list of URL strings shared with the producer
            and the other consumer threads.
    """
    while True:
        # Pop first and handle the empty case afterwards. Checking
        # ``len(detail_list)`` before ``pop()`` is a check-then-act race:
        # another consumer can take the last item between the two calls,
        # and the resulting IndexError would kill this thread.
        try:
            url = detail_list.pop()
        except IndexError:
            # Nothing to do yet: back off briefly instead of busy-spinning,
            # which would burn CPU and compete with the producer thread.
            time.sleep(0.1)
            continue
        print(url)
        time.sleep(2)  # stand-in for the time a real request would take

In [20]:
# Producer thread that fills detail_list; it is created here but started last.
thread_detail_url = threading.Thread(target=get_detail_url,args=(detail_list,))
# Ten consumer threads, each running get_detail_html2 on the same shared list.
for i in range(10):
    html_thead = threading.Thread(target=get_detail_html2,args=(detail_list,))
    html_thead.start()
thread_detail_url.start()  # the consumers are already running when the producer starts; no thread is joined here

get detail url started
get detail url endhttp://projectend.com/19http://projectend.com/18http://projectend.com/17http://projectend.com/16http://projectend.com/15http://projectend.com/14





http://projectend.com/13http://projectend.com/12http://projectend.com/11http://projectend.com/7

http://projectend.com/9
http://projectend.com/10http://projectend.com/8



http://projectend.com/6http://projectend.com/5http://projectend.com/4http://projectend.com/3




http://projectend.com/2http://projectend.com/1

http://projectend.com/0


小技巧：可以将共享的变量移到一个py文件中统一管理

但是，这种情况下不要使用（from 文件夹.文件 import 变量）这种方式：这样导入得到的只是变量当时所指向对象的一个本地名字，之后别处对该变量重新赋值时，这里是看不到的；应当 import 模块本身，再通过“模块.变量”来访问。

2、通过queue来同步

In [28]:

def get_detail_url(queue, delay=2, count=20):
    """Simulate crawling the article list page and enqueue detail-page URLs.

    Sleeps ``delay`` seconds to simulate the request, then puts ``count``
    generated URLs on ``queue`` in ascending id order.

    Args:
        queue: Object with a ``put(item)`` method, e.g. a ``queue.Queue``.
            The parameter name shadows the ``queue`` module inside this
            function; it is kept for backward compatibility.
        delay: Seconds to sleep before producing the batch (default 2).
        count: Number of URLs to enqueue (default 20).
    """
    # Crawl the article list page. The former ``while True: ... break``
    # wrapper always ran exactly once, so the body is written straight-line.
    print("get detail url started")
    time.sleep(delay)
    for i in range(count):  # parse the whole batch in one go
        queue.put("http://projectend.com/{id}".format(id=i))
    # Other queue.Queue methods worth knowing:
    #   queue.qsize()
    #   queue.empty()
    #   queue.full()
    #   queue.put_nowait()
    #   queue.get_nowait()
    #   queue.task_done()   # signal that one fetched item has been processed
    #   queue.join()        # block until task_done() was called for every item put
    print("get detail url end")

    
def get_detail_html2(queue):
    """Consume URLs from ``queue`` forever, simulating one fetch per URL.

    Each iteration takes one item with ``queue.get()``, prints it and then
    sleeps two seconds to simulate downloading the detail page. This
    function never returns.
    """
    # Crawl the article detail pages, one URL per iteration.
    pause_seconds = 2
    while True:
        # get() blocks: while the queue is empty the thread simply waits here.
        print(queue.get())
        time.sleep(pause_seconds)

In [29]:
import queue
detail_url_queue = queue.Queue(maxsize=1000)  # queue shared by producer and consumers, bounded to 1000 items
# Producer thread that fills the queue; it is created here but started last.
thread_detail_url = threading.Thread(target=get_detail_url,args=(detail_url_queue,))
# Ten consumer threads, each running get_detail_html2 on the same queue.
for i in range(10):
    html_thead = threading.Thread(target=get_detail_html2,args=(detail_url_queue,))
    html_thead.start()
thread_detail_url.start()  # the consumers are already waiting in get() when the producer starts; no thread is joined here

get detail url started
get detail url endhttp://projectend.com/0http://projectend.com/1http://projectend.com/2http://projectend.com/3http://projectend.com/4http://projectend.com/5http://projectend.com/8http://projectend.com/10http://projectend.com/6









http://projectend.com/14
http://projectend.com/17http://projectend.com/7http://projectend.com/9http://projectend.com/11http://projectend.com/12http://projectend.com/13




http://projectend.com/19http://projectend.com/16

http://projectend.com/15http://projectend.com/18


