# 实例一：邮政编码查询网站
- 网址：http://www.ip138.com/post/
- 目标：获取各省份的链接编号

## 补充：
- xml库：XML 是一种天然分层的数据格式，最自然的表示方式就是树结构。`xml.etree.ElementTree` 模块为此提供了两个类，用途各不相同：
 - ElementTree：该类将整个XML文档表示为一棵树；
 - Element：该类表示这棵树中的单个节点。

In [1]:
import requests
import xml.etree.ElementTree as ET
from xml.parsers.expat import ParserCreate

In [17]:
# Create an expat parser and print the built-in help for its Parse method.
parser = ParserCreate()
help(parser.Parse)

Help on built-in function Parse:

Parse(data, isfinal=False, /) method of pyexpat.xmlparser instance
    Parse XML data.
    
    `isfinal' should be true at end of input.



In [46]:
class DefaultSaxHandler(object):
    """Expat callback holder that collects ``(title, href)`` pairs.

    Every start tag other than ``map`` that carries both a ``title`` and an
    ``href`` attribute contributes one tuple to ``provinces``.
    """

    def __init__(self, provinces):
        """Remember the list that collected entries are appended to.

        Args:
            provinces: Mutable list; ``start_element`` appends to it in place.
        """
        self.provinces = provinces

    def start_element(self, name, attrs):
        """Record the ``title``/``href`` attributes of a non-``map`` start tag.

        Args:
            name: Tag name reported by the parser.
            attrs: Mapping of attribute names to values for this tag.
        """
        # The enclosing <map> tag is only a container and yields no entry.
        if name == 'map':
            return

        title = attrs.get('title')
        href = attrs.get('href')
        # Skip tags lacking either attribute rather than raising KeyError,
        # which would abort the whole parse.
        if title is None or href is None:
            return
        self.provinces.append((title, href))

    def end_element(self, name):
        """Ignore end tags; there is nothing to finalize."""

    def char_data(self, text):
        """Ignore character data; only tag attributes are collected."""
 
def get_province_entry(url, timeout=10):
    """Download the page at *url* and return its province ``(title, href)`` pairs.

    The ``<map name="map_86" id="map_86">`` ... ``</map>`` fragment is cut out
    of the page and fed to an expat parser whose callbacks come from a
    ``DefaultSaxHandler``.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The list filled by the handler, in the order the tags were parsed;
        an empty list when the page has no complete map fragment.
    """
    # The site is served as gb2312, so decode the raw bytes explicitly
    # (alternatively, set the response's encoding from apparent_encoding).
    content = requests.get(url, timeout=timeout).content.decode('gb2312')

    opening = '<map name="map_86" id="map_86">'
    closing = '</map>'

    start = content.find(opening)
    if start == -1:
        return []
    # Search for the closing tag only after the opening one, so an earlier
    # </map> elsewhere on the page cannot be picked up by mistake.
    end = content.find(closing, start)
    if end == -1:
        return []

    # Keep the closing tag itself so the fragment is a complete element.
    fragment = content[start:end + len(closing)].strip()

    province = []
    handle = DefaultSaxHandler(province)

    # Wire the handler's callbacks into a fresh expat parser.
    parser = ParserCreate()
    parser.StartElementHandler = handle.start_element
    parser.EndElementHandler = handle.end_element
    parser.CharacterDataHandler = handle.char_data

    # isfinal=True marks the end of input, so a truncated fragment is
    # reported as an error instead of being silently accepted.
    parser.Parse(fragment, True)
    return province
   
# Fetch the province entries; the returned list is the cell output shown below.
get_province_entry('http://www.ip138.com/post')

[('新疆', '/83/'),
 ('西藏', '/85/'),
 ('青海', '/81/'),
 ('甘肃', '/73/'),
 ('四川', '/61/'),
 ('云南', '/65/'),
 ('宁夏', '/75/'),
 ('内蒙古', '/01/'),
 ('黑龙江', '/15/'),
 ('吉林', '/13/'),
 ('辽宁', '/11/'),
 ('河北', '/50/'),
 ('北京', '/10/'),
 ('天津', '/30/'),
 ('陕西', '/71/'),
 ('山西', '/03/'),
 ('山东', '/25/'),
 ('河南', '/45/'),
 ('重庆', '/40/'),
 ('湖北', '/43/'),
 ('安徽', '/23/'),
 ('江苏', '/21/'),
 ('上海', '/20/'),
 ('贵州', '/55/'),
 ('广西', '/53/'),
 ('湖南', '/41/'),
 ('江西', '/33/'),
 ('浙江', '/31/'),
 ('福建', '/35/'),
 ('广东', '/51/'),
 ('海南', '/57/'),
 ('台湾', '/taiwang/'),
 ('澳门', '/aomen/'),
 ('香港', '/xianggang/')]

# 实例二：股票数据抓取
- 新浪股票数据接口：http://hq.sinajs.cn/list=sh600001

In [33]:
import requests
import threading

In [47]:
# 发起请求，获取每个股票的页面内容
def display_info(code):
    """Request the quote page for one stock *code* and print the response text."""
    endpoint = 'http://hq.sinajs.cn/list='
    response = requests.get(endpoint + code)
    print(response.text)
    
def single_thread(codes):
    """Display each stock code in *codes* in turn, stripped of surrounding whitespace."""
    for cleaned in (raw.strip() for raw in codes):
        display_info(cleaned)

def multi_thread(tasks):
    """Run ``single_thread`` on every code list in *tasks*, one thread per list.

    Returns only after all of the threads have finished.
    """
    workers = []
    for codes in tasks:
        # args must be a tuple, hence the trailing comma after codes.
        workers.append(threading.Thread(target=single_thread, args=(codes,)))

    # Start every thread before joining any of them so they run concurrently.
    for worker in workers:
        worker.start()

    # Block until each thread has completed.
    for worker in workers:
        worker.join()

if __name__ == '__main__':
    codes = ['sh600001', 'sh600002', 'sh600003', 'sh600004', 'sh600005']

    # Size of each of the first three shares; the fourth share takes
    # whatever remains after them.
    thread_len = len(codes) // 4
    t1, t2, t3 = (codes[i * thread_len:(i + 1) * thread_len] for i in range(3))
    t4 = codes[3 * thread_len:]

    # Hand the four shares to the thread launcher.
    multi_thread([t1, t2, t3, t4])

var hq_str_sh600001="";
var hq_str_sh600002="";

var hq_str_sh600004="白云机场,16.670,16.750,16.820,17.050,16.670,16.820,16.830,20900116,352790872.000,4000,16.820,10987,16.810,16900,16.800,42000,16.790,42176,16.780,55200,16.830,24500,16.840,100231,16.850,39705,16.860,117805,16.870,2017-06-02,15:00:00,00";
var hq_str_sh600003="";



var hq_str_sh600005="武钢股份,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,0,0.000,2017-06-02,09:14:42,00";

