# 开源爬虫工具OpenDataTools及其使用样例

## 项目介绍

+ OpenDataTools是一个开源爬虫工具,通过爬虫将各种数据接口简化,方便用户使用. 由QuantOS团队开发. 

+ 目前的版本是:0.0.5

+ 目前只支持 空气质量AQI 的数据获取.

+ 项目地址: https://github.com/PKUJohnson/OpenData, 感兴趣的同学可以去研究代码.

## 样例介绍

空气质量数据AQI, 数据来源于 环保部网站. http://datacenter.mep.gov.cn/

## 准备工作

安装opendatatools(开源的数据爬虫工具)
+ pip install opendatatools 

安装pyecharts 
+ pip install pyecharts
+ pip install echarts-countries-pypkg  
+ pip install echarts-china-provinces-pypkg  
+ pip install echarts-china-cities-pypkg  

## Case 1: API介绍(OpenDataTools获取空气质量数据)

In [66]:
from opendatatools import aqi

In [68]:
# 获取历史某日全国各大城市的AQI数据
# 返回DataFrame
df_aqi = aqi.get_daily_aqi('2018-05-27')
df_aqi

Unnamed: 0,aqi,city,code,date,indicator,level
0,90,北京市,110000,2018-05-27,颗粒物(PM10),良
1,99,天津市,120000,2018-05-27,颗粒物(PM10),良
2,89,石家庄市,130100,2018-05-27,颗粒物(PM10),良
3,104,唐山市,130200,2018-05-27,颗粒物(PM10),轻度污染
4,95,秦皇岛市,130300,2018-05-27,臭氧8小时,良
5,101,邯郸市,130400,2018-05-27,臭氧8小时,轻度污染
6,88,邢台市,130500,2018-05-27,颗粒物(PM10),良
7,97,保定市,130600,2018-05-27,臭氧8小时,良
8,82,承德市,130800,2018-05-27,颗粒物(PM10),良
9,100,沧州市,130900,2018-05-27,臭氧8小时,良


In [69]:
# 获取实时全国各大城市的AQI数据
#df_aqi = aqi.get_hour_aqi('2018-05-28 11:00:00')

# 如果不指定时间点,会尝试获取最近的数据
df_aqi = aqi.get_hour_aqi()
df_aqi

Unnamed: 0,aqi,city,code,indicator,level,time
0,69,北京市,110000,颗粒物(PM10),良,2018-05-28 12:00:00
1,367,天津市,120000,颗粒物(PM10),严重污染,2018-05-28 12:00:00
2,73,石家庄市,130100,颗粒物(PM10),良,2018-05-28 12:00:00
3,249,唐山市,130200,颗粒物(PM10),重度污染,2018-05-28 12:00:00
4,88,秦皇岛市,130300,颗粒物(PM10),良,2018-05-28 12:00:00
5,65,邯郸市,130400,臭氧1小时,良,2018-05-28 12:00:00
6,93,邢台市,130500,颗粒物(PM10),良,2018-05-28 12:00:00
7,126,保定市,130600,颗粒物(PM10),轻度污染,2018-05-28 12:00:00
8,60,承德市,130800,颗粒物(PM10),良,2018-05-28 12:00:00
9,176,沧州市,130900,颗粒物(PM10),中度污染,2018-05-28 12:00:00


In [70]:
# 获取单个城市的AQI历史数据
aqi.get_daily_aqi_onecity('北京市')

Unnamed: 0,aqi,date,indicator,level
0,90,2018-05-27,颗粒物(PM10),良
1,133,2018-05-26,细颗粒物(PM2.5),轻度污染
2,162,2018-05-25,臭氧8小时,中度污染
3,151,2018-05-24,臭氧8小时,中度污染
4,138,2018-05-23,颗粒物(PM10),轻度污染
5,135,2018-05-22,颗粒物(PM10),轻度污染
6,83,2018-05-21,细颗粒物(PM2.5),良
7,63,2018-05-20,臭氧8小时,良
8,110,2018-05-19,臭氧8小时,轻度污染
9,94,2018-05-18,臭氧8小时,良


In [71]:
#获取单个城市某日的AQI小时数据
aqi_hour = aqi.get_hour_aqi_onecity('北京市', '2018-05-26')
aqi_hour.set_index('time', inplace=True)
aqi_hour

time,aqi,city,indicator,level
2018-05-26 23:00:00,88,北京市,细颗粒物(PM2.5),良
2018-05-26 22:00:00,91,北京市,颗粒物(PM10),良
2018-05-26 21:00:00,100,北京市,颗粒物(PM10),良
2018-05-26 20:00:00,119,北京市,颗粒物(PM10),轻度污染
2018-05-26 19:00:00,137,北京市,颗粒物(PM10),轻度污染
2018-05-26 18:00:00,165,北京市,颗粒物(PM10),中度污染
2018-05-26 17:00:00,144,北京市,颗粒物(PM10),轻度污染
2018-05-26 16:00:00,178,北京市,细颗粒物(PM2.5),中度污染
2018-05-26 15:00:00,178,北京市,细颗粒物(PM2.5),中度污染
2018-05-26 14:00:00,183,北京市,细颗粒物(PM2.5),中度污染


## Case 2 : 获取实时全国AQI数据并画地图展示

In [72]:
# encoding: utf-8

from opendatatools import aqi
from pyecharts import Geo

import pandas as pd

def draw_realtime_aqi_map(time=None):
    """Build a pyecharts ``Geo`` chart of hourly AQI values per city.

    The data comes from ``aqi.get_hour_aqi(time)``. Cities listed in the
    local skip set are left off the chart.

    Args:
        time: Passed unchanged to ``aqi.get_hour_aqi``. When it is None and
            the returned frame is non-empty, the ``time`` value of the first
            returned row is used in the chart title instead.

    Returns:
        The populated ``Geo`` chart object.
    """
    from opendatatools import aqi

    df_aqi = aqi.get_hour_aqi(time)

    # Cities that echarts cannot process; a frozenset gives O(1) membership
    # tests inside the row loop below.
    echart_unsupported_city = frozenset([
        "菏泽市", "襄阳市", "恩施州", "湘西州", "阿坝州", "延边州",
        "甘孜州", "凉山州", "黔西南州", "黔东南州", "黔南州", "普洱市", "楚雄州", "红河州",
        "文山州", "西双版纳州", "大理州", "德宏州", "怒江州", "迪庆州", "昌都市", "山南市",
        "林芝市", "临夏州", "甘南州", "海北州", "黄南州", "海南州", "果洛州", "玉树州", "海西州",
        "昌吉州", "博州", "克州", "伊犁哈萨克州",
    ])

    if time is None and len(df_aqi) > 0:
        # Positional access: take the first row whatever the index labels
        # are (a label lookup of 0 fails when the index does not contain 0).
        time = df_aqi['time'].iloc[0]

    # The per-row AQI value is not stored under the name ``aqi`` so that the
    # imported ``aqi`` module stays usable for the rest of the function.
    data = [
        (row['city'], row['aqi'])
        for _, row in df_aqi.iterrows()
        if row['city'] not in echart_unsupported_city
    ]

    geo = Geo("全国最新主要城市空气质量（AQI) - %s" % time, "数据来源于环保部网站",
              title_color="#fff",
              title_pos="center", width=1000,
              height=600, background_color='#404a59')

    attr, value = geo.cast(data)

    geo.add("", attr, value, visual_range=[0, 150],
            maptype='china', visual_text_color="#fff",
            symbol_size=10, is_visualmap=True,
            label_formatter='{b}',             # label shows only the city name
            tooltip_formatter='{c}',           # format: longitude, latitude, value
            label_emphasis_textsize=15,        # label font size when highlighted
            label_emphasis_pos='right'         # label position when highlighted
           )

    return geo


In [73]:
draw_realtime_aqi_map()

## Case 3: 获取历史某日全国AQI数据并画地图展示

In [74]:
# encoding: utf-8

from opendatatools import aqi
from pyecharts import Geo

import pandas as pd


def draw_his_aqi_map(date):
    """Build a pyecharts ``Geo`` chart of the daily AQI values for ``date``.

    Rows are fetched with ``aqi.get_daily_aqi(date)``; cities in the skip
    list are dropped before plotting.

    Args:
        date: Passed unchanged to ``aqi.get_daily_aqi`` and interpolated
            into the chart title.

    Returns:
        The populated ``Geo`` chart object.
    """
    from opendatatools import aqi

    daily_frame = aqi.get_daily_aqi(date)

    # Cities that echarts cannot process.
    skipped_cities = (
        "菏泽市", "襄阳市", "恩施州", "湘西州", "阿坝州", "延边州",
        "甘孜州", "凉山州", "黔西南州", "黔东南州", "黔南州", "普洱市", "楚雄州", "红河州",
        "文山州", "西双版纳州", "大理州", "德宏州", "怒江州", "迪庆州", "昌都市", "山南市",
        "林芝市", "临夏州", "甘南州", "海北州", "黄南州", "海南州", "果洛州", "玉树州", "海西州",
        "昌吉州", "博州", "克州", "伊犁哈萨克州",
    )

    # One (city, aqi) pair per plottable row.
    points = [
        (record['city'], record['aqi'])
        for _, record in daily_frame.iterrows()
        if record['city'] not in skipped_cities
    ]

    title = "全国主要城市空气质量（AQI) - %s" % date
    chart = Geo(
        title,
        "数据来源于环保部网站",
        title_color="#fff",
        title_pos="center",
        width=1000,
        height=600,
        background_color='#404a59',
    )

    names, readings = chart.cast(points)

    chart.add(
        "",
        names,
        readings,
        visual_range=[0, 150],
        maptype='china',
        visual_text_color="#fff",
        symbol_size=10,
        is_visualmap=True,
        label_formatter='{b}',         # label shows only the city name
        tooltip_formatter='{c}',       # format: longitude, latitude, value
        label_emphasis_textsize=15,    # label font size when highlighted
        label_emphasis_pos='right',    # label position when highlighted
    )

    return chart


SyntaxError: invalid syntax (<ipython-input-74-6549a965af95>, line 43)

In [13]:
draw_his_aqi_map('2018-05-27')

In [14]:
draw_his_aqi_map('2017-05-27')

In [15]:
draw_his_aqi_map('2016-05-27')

## Case 4 : 看某几个城市历史一段时间的走势图

In [16]:
# encoding: utf-8

from pyecharts import Line
import pandas as pd

def draw_city_aqi(cities, start_date=None, end_date=None):
    """Build a pyecharts ``Line`` chart of daily AQI, one series per city.

    For each city the history from ``aqi.get_daily_aqi_onecity`` is indexed
    by its ``date`` column, sorted ascending and optionally clipped to
    ``[start_date, end_date]`` (both bounds inclusive).

    Args:
        cities: Iterable of city names passed to ``get_daily_aqi_onecity``.
        start_date: Optional lower bound compared against the ``date`` index
            with ``>=``. None means no lower bound.
        end_date: Optional upper bound compared against the ``date`` index
            with ``<=``. None means no upper bound.

    Returns:
        The populated ``Line`` chart object.
    """
    from opendatatools import aqi

    line = Line("城市AQI趋势图")

    for city in cities:
        print("getting data for %s" % city)

        # Build a new frame rather than modifying the one the library
        # returned in place.
        df_aqi = (
            aqi.get_daily_aqi_onecity(city)
            .set_index('date')
            .sort_index(ascending=True)
        )

        # NOTE(review): the bounds are compared directly with the index
        # values; presumably both are 'YYYY-MM-DD' strings so that lexical
        # order matches chronological order - confirm.
        if start_date is not None:
            df_aqi = df_aqi[df_aqi.index >= start_date]

        if end_date is not None:
            df_aqi = df_aqi[df_aqi.index <= end_date]

        axis_x = df_aqi.index
        axis_y = df_aqi['aqi']

        line.add("aqi curve for %s" % (city), axis_x, axis_y, mark_point=["average"])

    return line


In [17]:
draw_city_aqi(['北京市','上海市'], start_date = '2018-01-01', end_date = '2018-05-31')

getting data for 北京市
getting data for 上海市


## Case 5 : 看某几个城市日内小时走势图

In [19]:
from pyecharts import Line
import pandas as pd

def draw_city_aqi_hour(cities, date):
    """Build a pyecharts ``Line`` chart of hourly AQI, one series per city.

    For each city the rows from ``aqi.get_hour_aqi_onecity(city, date)`` are
    indexed by their ``time`` column and sorted ascending before plotting.

    Args:
        cities: Iterable of city names passed to ``get_hour_aqi_onecity``.
        date: Passed unchanged to ``get_hour_aqi_onecity``.

    Returns:
        The populated ``Line`` chart object.
    """
    from opendatatools import aqi

    line = Line("城市AQI小时趋势图")

    for city in cities:
        print("getting data for %s" % city)

        # Build a new frame rather than modifying the one the library
        # returned in place.
        df_aqi = (
            aqi.get_hour_aqi_onecity(city, date)
            .set_index('time')
            .sort_index(ascending=True)
        )

        axis_x = df_aqi.index
        axis_y = df_aqi['aqi']

        line.add("aqi curve for %s" % (city), axis_x, axis_y, mark_point=["average"])

    return line

In [22]:
draw_city_aqi_hour(['北京市', '上海市'], '2018-05-28')

getting data for 北京市
getting data for 上海市
