# 清洗水库数据

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

In [None]:
# Load the reservoir table; the first CSV column becomes the index.
reservoir = pd.read_csv(r"../data/hydrology/reservoir.csv", index_col=0)
# Preview the first rows.
reservoir.head()

In [None]:
import requests
import json


def convert_name_to_gps(address):
    """Look up the coordinates of a place name with the Baidu geocoding API.

    Args:
        address: Place name or address text to geocode.

    Returns:
        A ``(lng, lat)`` tuple read from ``result.location`` of the response.
        When the API answers with a non-zero ``status`` the name is reported
        on stdout and ``(0.0, 0.0)`` is returned instead.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # NOTE(review): the API key is committed in source; consider loading it
    # from configuration instead.
    secret_key = "tSdDxv3r1hRtOZ2LGQG5pSyavKSx4Ia3"  # my Baidu API token
    api = "http://api.map.baidu.com/geocoding/v3/"
    # Passing the query through ``params`` lets requests percent-encode the
    # address, so names containing "&", "#" or spaces cannot corrupt the URL.
    query = {"address": address, "output": "json", "ak": secret_key}
    # A timeout keeps one stalled request from hanging a whole batch lookup.
    response = requests.get(api, params=query, timeout=10)
    content = json.loads(response.text)

    # Compare by value: ``is 0`` only works through small-int caching and
    # triggers a SyntaxWarning on current Python versions.
    if content["status"] != 0:
        print("{} is not match, please check".format(address))
        return 0.0, 0.0

    result = content["result"]
    gps = result["location"]
    lng, lat = gps["lng"], gps["lat"]
    # Warn (but still return the point) when the match is flagged as
    # imprecise and its confidence score is low.
    if result["precise"] == 0 and result["confidence"] < 30:
        print(
            "{} is not precise, error > 5km, please check".format(address)
        )
    return lng, lat


def convert_name_to_province(name):
    """Return the province Baidu reports for a place name.

    The name is geocoded with ``convert_name_to_gps`` and the resulting point
    is sent to the Baidu reverse-geocoding API.

    Args:
        name: Place name to resolve.

    Returns:
        The ``result.addressComponent.province`` value of the response.

    Raises:
        KeyError: If the response lacks ``result``, ``addressComponent`` or
            ``province``.
        requests.RequestException: If the HTTP request fails or times out.
    """
    lng, lat = convert_name_to_gps(name)
    # NOTE(review): the API key is committed in source; consider loading it
    # from configuration instead.
    secret_key = "tSdDxv3r1hRtOZ2LGQG5pSyavKSx4Ia3"  # Baidu API key
    # Original author's note: coordtype must be wgs84ll (GPS lat/lng),
    # otherwise the location is offset.
    # NOTE(review): the geocoding request above does not ask for a specific
    # coordinate system - confirm that its output really is WGS84.
    api = "http://api.map.baidu.com/reverse_geocoding/v3/?ak={2}&output=json&coordtype=wgs84ll&location={0},{1}"
    # The API expects "lat,lng" order; both values are numeric, so plain
    # string formatting is safe here.
    baidu_map_api = api.format(lat, lng, secret_key)
    # A timeout keeps one stalled request from hanging a whole batch lookup.
    content = requests.get(baidu_map_api, timeout=10).text
    gps_address = json.loads(content)
    # Only the province is needed; the formatted address, country and city
    # are no longer read, so their absence cannot raise a KeyError.
    return gps_address["result"]["addressComponent"]["province"]


# Try the lookup on a single reservoir name (this performs live API requests).
convert_name_to_province("红领巾水库")

In [None]:
# Gather the distinct, non-missing names from each of the four name columns
# (names are de-duplicated per column, not across columns).
name_columns = ("名称", "名称.1", "名称.2", "名称.3")
reservoir_names = [
    reservoir_name
    for column in name_columns
    for reservoir_name in reservoir[column].dropna().unique()
]

# Resolve every name to a province through the API helper.
name_province = {}
for reservoir_name in reservoir_names:
    province = convert_name_to_province(reservoir_name)
    name_province[reservoir_name] = province

name_province

In [None]:
# Manual province overrides for reservoir names the API lookup got wrong.
# Values should use the spellings listed in PROVINCE_TO_SUBREGION; anything
# else (such as "UnKnown") matches no subregion and is left out of the
# regional totals.
correct = {
    "盐锅峡": "甘肃省",
    "巴家嘴水库": "甘肃省",
    "陆浑水库": "河南省",
    "雪野水库": "山东省",
    # NOTE(review): 羊毛湾水库 may belong to 陕西省 rather than 山西省 -
    # confirm. Both provinces map to the same subregion (MR), so the
    # regional totals are unaffected either way.
    "羊毛湾水库": "山西省",
    "天桥水电站": "山西省",
    "冯家山水库": "陕西省",
    "八盘峡水电站": "甘肃省",
    "东大滩水库": "青海省",
    "王瑶水库": "陕西省",
    "李家峡水电站": "青海省",
    "克孜尔水库": "UnKnown",
    "万家寨水利枢纽工程": "山西省",
    "济南鹊山引黄调蓄水库": "山东省",
    "黄河源水电站": "青海省",
    "尼那水电站": "青海省",
    "苏只水电站": "青海省",
    "乌金峡水电站": "甘肃省",
    "黄河龙口水利枢纽": "山西省",
    "沁河河口村水库": "山西省",
    "大峡水电站": "甘肃省",
    "黄丰水电站": "青海省",
    "卧虎山水库": "山东省",
    # Was "内蒙古", which is not one of the spellings in the subregion
    # table, so this reservoir was silently dropped from the UR totals.
    "三盛公枢纽": "内蒙古自治区",
    "文峪河水库": "山西省",
    "黑泉水利枢纽工程": "青海省",
    "直岗拉卡水电站": "青海省",
    "康扬水电站": "青海省",
    "寺沟峡水电站": "甘肃省",
    "拉西瓦水电站": "青海省",
    "班多水电站": "青海省",
    "小峡水电站": "甘肃省",
}

# Overwrite the provinces that the API lookup judged incorrectly.
# Only names that actually occur in the data are corrected, so the report
# below counts those rather than every entry of the override table.
applied = [k for k in correct if k in name_province]
for k in applied:
    name_province[k] = correct[k]

total = len(reservoir_names)
# Guard the ratio so an empty name list reports 0% instead of raising.
share = len(applied) / total if total else 0.0

print("There are total {} reservoires.".format(total))
print(
    "There are {} ({:.2%}) reservoires' name need correction.".format(
        len(applied), share
    )
)

In [None]:
# Flatten the four capacity columns into two parallel lists: one
# (year, capacity) pair for every positive capacity entry.
volume_columns = ["亿立方米", "亿立方米.1", "亿立方米.2", "亿立方米.3"]
volumes_by_year = (
    reservoir[volume_columns].fillna(0.0).set_index(reservoir["年"])
)

year_list = []
data_list = []
for record in volumes_by_year.itertuples(index=True):
    # The first field of each tuple is the index (the year); the remaining
    # fields are the capacity columns.
    yr, volumes = record[0], record[1:]
    for volume in volumes:
        # Missing entries were filled with 0.0 above, so they are skipped too.
        if volume > 0:
            year_list.append(yr)
            data_list.append(volume)

# Histogram of the collected years using 20 bins.
fig, ax = plt.subplots(figsize=(4, 3))
plt.hist(year_list, 20)

In [None]:
# Scatter the base-10 logarithm of every capacity entry against its year.
plt.scatter(x=year_list, y=np.log10(np.array(data_list)))

In [None]:
# Full province names; the slices below rely on this order.
PROVINCE_LIST = [
    "青海省",
    "甘肃省",
    "宁夏回族自治区",
    "内蒙古自治区",
    "陕西省",
    "山西省",
    "河南省",
    "河北省",
    "山东省",
]

# Subregion code -> every accepted spelling (full name, short name, pinyin)
# of the provinces assigned to it.
PROVINCE_TO_SUBREGION = {
    "SR": [PROVINCE_LIST[0]] + ["青海", "Qinghai"],
    # "内蒙古" is listed next to "内蒙" because the manual correction table
    # uses that spelling; without it such reservoirs match no subregion.
    "UR": PROVINCE_LIST[1:4]
    + ["甘肃", "宁夏", "内蒙", "内蒙古", "Gansu", "Ningxia", "Neimeng"],
    "MR": PROVINCE_LIST[4:6] + ["陕西", "山西", "Shanxi", "Shaanxi"],
    "DR": PROVINCE_LIST[6:] + ["河南", "河北", "天津", "山东", "Henan", "Shandong"],
}


def judge_province(row, how="capacity"):
    """Aggregate the reservoirs of one table row by subregion.

    Args:
        row: Mapping-like row (for example a pandas Series) providing the
            four name columns ("名称", "名称.1", ...) and the four capacity
            columns ("亿立方米", "亿立方米.1", ...).
        how: "capacity" to sum the capacity values per subregion, "amount"
            to count the reservoirs per subregion.

    Returns:
        A dict keyed by "SR", "UR", "MR" and "DR" holding the summed
        capacities or the reservoir counts.

    Raises:
        ValueError: If ``how`` is neither "capacity" nor "amount".
    """
    # Fail loudly on a typo instead of silently returning None.
    if how not in ("capacity", "amount"):
        raise ValueError(
            "how must be 'capacity' or 'amount', got {!r}".format(how)
        )

    regions = ("SR", "UR", "MR", "DR")
    capacity = dict.fromkeys(regions, 0)
    amount = dict.fromkeys(regions, 0)

    for suffix in ("", ".1", ".2", ".3"):
        name = row["名称" + suffix]
        rc = row["亿立方米" + suffix]
        # pd.isna catches every missing value; an identity test against
        # np.nan misses NaNs that are not that exact object, and a NaN
        # capacity would then poison the running sum.
        if pd.isna(name) or pd.isna(rc):
            continue
        province = name_province.get(name)
        for region, aliases in PROVINCE_TO_SUBREGION.items():
            if province in aliases:
                capacity[region] += rc
                amount[region] += 1

    return capacity if how == "capacity" else amount


# Capacity added per subregion for every table row, indexed by the row's year.
capacity_by_region = reservoir.apply(judge_province, axis=1)
capacity_row_years = reservoir["年"].tolist()
new_reservoirs = pd.DataFrame(
    capacity_by_region.values.tolist(), index=capacity_row_years
)
new_reservoirs.head()

In [None]:
# Number of reservoirs per subregion for every table row, indexed by the
# row's year.
amount_by_region = reservoir.apply(judge_province, axis=1, how="amount")
amount_row_years = reservoir["年"].tolist()
num_reservoirs = pd.DataFrame(
    amount_by_region.values.tolist(), index=amount_row_years
)
num_reservoirs.head()

In [None]:
# Plot, for each row (year), the capacity summed over the four subregions.
new_reservoirs.sum(axis=1).plot()
plt.show();

In [None]:
# Display the full per-subregion capacity table.
new_reservoirs

In [None]:
# Running total of the capacity per subregion, accumulated down the rows.
reservoirs_capacity_cumulating = new_reservoirs.cumsum()
# One line per subregion column.
reservoirs_capacity_cumulating.plot()
plt.title("Accumulating reservoirs' capacity of each region")
plt.ylabel("Reservoir capacities (10^8 m3)")
plt.show();

# 最终作图

In [None]:
# Global plotting settings
import matplotlib.colors as col

# One color per period histogram (indexed 0-2 by the final figure).
period_colors = ["#0889A6", "#F1801F", "#006C43"]

# Four colors backing the colormap below.
# presumably one per subregion (SR, UR, MR, DR) - confirm against the plots
region_colors = ["#0077b6", "#e07a5f", "#f2cc8f", "#81b29a"]
# NOTE(review): index_colors is not referenced in the visible cells.
index_colors = ["#7D9DB5", "#B8B08D", "#F2D492"]
# Discrete colormap built from region_colors, named "indexed".
regional_colormap = col.ListedColormap(region_colors, "indexed")

# presumably the histogram bin count; the visible hist calls pass 7 literally
bins = 7

In [None]:
# Split the collected years into three periods with boolean masks.
num = np.array(year_list)
# p0 = num[num<1965]
# p1 = num[(num<1978)&(num>=1965)]
p1 = num[num < 1978]  # years before 1978
p2 = num[(num < 2002) & (num >= 1978)]  # 1978-2001
p3 = num[(num >= 2002) & (num <= 2013)]  # 2002-2013

In [None]:
# Number of entries in each period. A notebook cell only displays its last
# bare expression, so the three counts are shown together as one tuple.
len(p1), len(p2), len(p3)

In [None]:
# %config InlineBackend.figure_format ='retina'
import seaborn as sns
from matplotlib.gridspec import GridSpec

# Set up the canvas: a single panel.
fig, ax1 = plt.subplots(figsize=(3, 2.6), constrained_layout=True)

# Plot 1: one filled step histogram of the years per period, using the
# shared ``bins`` setting instead of a repeated literal.
# ax1.hist(p0, bins=7, histtype="stepfilled", alpha=0.6, color='lightgray', label='Others')
period_series = [
    (p1, "Before 1977"),
    (p2, "P2: 1978-2001"),
    (p3, "P3: 2002-2013"),
]
for (period_years, period_label), period_color in zip(
    period_series, period_colors
):
    ax1.hist(
        period_years,
        bins=bins,
        histtype="stepfilled",
        alpha=0.6,
        color=period_color,
        label=period_label,
    )

# Plot 2: density estimate of all years on a twin y-axis.
# NOTE(review): recent seaborn releases deprecate ``shade`` in favour of
# ``fill`` - confirm against the installed version before switching.
ax2 = ax1.twinx()
sns.kdeplot(year_list, shade=True, ax=ax2, alpha=0.05)

# The former "plot 3" drew reservoirs_capacity_cumulating on ``ax3``, an axes
# that is never created for this single-panel figure; the resulting NameError
# stopped the cell before the figure was decorated or saved. The cumulative
# capacity curves are plotted in their own cell earlier in the notebook.

# Decorate plot 1.
ax1.set_yticks(np.arange(0, 7.9, 2))
ax1.set_xticks(np.arange(1955, 2016, 15))
ax1.legend(loc=2)
ax1.yaxis.grid(color="white", linestyle="-.", linewidth=0.5)
ax1.set_ylabel("Number of new reservoirs")
# ax1.text(2014, 6.9, 'a.', ha='center', va='center', weight='bold', size='large')

# Decorate plot 2: align the x-range and hide the density axis.
ax2.set_xlim(1955, 2015)
ax2.axes.get_yaxis().set_visible(False)

# Dotted vertical guides at 1978 and 2001.
ax1.axvline(1978, color="gray", ls=":", lw=1.5)
ax1.axvline(2001, color="gray", ls=":", lw=1.5)
ax1.set_xlabel("Year")

# Keep only the bottom spine on both axes.
for ax in [ax1, ax2]:
    ax.spines["top"].set_visible(False)
    ax.spines["bottom"].set_visible(True)
    ax.spines["left"].set_visible(False)
    ax.spines["right"].set_visible(False)

plt.savefig("../figures/sup/reservoirs.jpg", dpi=300)
plt.savefig("../figures/sup/reservoirs.pdf", dpi=300)
plt.show();