# 加载Zhou数据并分区

In [None]:
import numpy as np
import pandas as pd
import os
import sys

sys.path.append("..")
from tools.processing import dbf_data_list, pd_read_dbf

from matplotlib import pyplot as plt

%matplotlib inline

## PNAS数据

In [None]:
# Load sheet "D1" of the Zhou et al. (2020) PNAS dataset.
pns = pd.read_excel(
    io=r"../data/Zhou et al_2020_PNAS_dataset.xlsx", sheet_name="D1"
)

# The sheet carries a two-row header: after loading, row 0 of the frame holds
# the sub-headers.  Collapse both rows into "<group>: <sub-header>" names.
change_name_dic = {}
last_item = "None"
for col in pns:
    second_row = pns.loc[0, col]
    # A blank sub-header cell is read as NaN (a float); concatenating it to a
    # string would raise TypeError, so only string sub-headers are used.
    has_sub_header = isinstance(second_row, str)
    if "Unnamed" in col:
        # Placeholder name for a blank top-level header cell: the column
        # belongs to the most recent named group.
        if has_sub_header:
            change_name_dic[col] = last_item + ": " + second_row
    else:
        if has_sub_header:
            change_name_dic[col] = col + ": " + second_row
        last_item = col

pns.rename(change_name_dic, axis=1, inplace=True)
pns = pns.drop(0)  # the sub-header row is now part of the column names

# Strip surrounding whitespace from the column names.
pns.rename({col: col.strip() for col in pns}, axis=1, inplace=True)

# Fix the dtypes.
pns = pns.astype(float, errors="ignore")
# Every column from position 2 onwards must be float.  The frame is rebuilt
# instead of assigning through ``pns.iloc[:, 2:] = ...`` because that form
# writes into the existing columns in place on pandas >= 2.0, which leaves
# object-dtype columns as object.
pns = pd.concat([pns.iloc[:, :2], pns.iloc[:, 2:].astype(float)], axis=1)
# Cast last so "Year" ends up integer regardless of its column position.
pns["Year"] = pns["Year"].astype(int)

pns.head()

## 判断各市在黄河流域中所属的分区

- 分别加载源区、上游、中游、下游
- 对每一个与黄河流域相交的市，判断其与四个区域中的哪个相交
- 如果相交，则提取相交部分的面积
- 对单一相交的，进行区域标记，记录面积为其相交面积
- 对重复相交的，判断哪个相交部分面积更大，进行区域标记，以该面积为相交面积
- 对标记好的区域，按照相交部分面积，计算修正比例系数

In [None]:
# Region codes; each one has its own "<code>_Intersect.dbf" table.
# NOTE(review): presumably source region, upper, middle and lower reaches,
# in that order - confirm against the shapefiles.
regions = ["SR", "UR", "MR", "DR"]

# Build region -> {prefecture id: area of its intersection with the region}.
intersect_area = {}
for region in regions:
    file_path = f"../data/perfectures/yr_regions/{region}_Intersect.dbf"
    df = pd_read_dbf(file_path)
    # If a prefecture id occurs on several rows, the last row wins.
    intersect_area[region] = dict(zip(df["Perfecture"], df["Area_calcu"]))

# One row per prefecture, one column per region; a prefecture missing from a
# region's table gets an area of 0 there.
perfectures = pd.DataFrame(intersect_area).fillna(0.0)
perfectures.head()

In [None]:
# For every prefecture, compare its intersect area across the regions and
# keep the region with the largest one.
fn = r"../data/perfectures/yr/perfectures_YR.dbf"
yr = pd_read_dbf(fn).set_index("Perfecture")

# Row-wise winner: column label of the largest area (first one on ties) and
# the area itself.
dominant_region = perfectures.idxmax(axis=1)
dominant_area = perfectures.max(axis=1)

# Written one prefecture at a time through .loc so the first write creates
# the "Region" / "Intersect_area" columns.
for city_id, region, max_area in zip(
    perfectures.index, dominant_region, dominant_area
):
    yr.loc[city_id, "Region"] = region
    yr.loc[city_id, "Intersect_area"] = max_area

# Share of the prefecture's own area covered by its dominant region.
yr["Ratio"] = yr["Intersect_area"].div(yr["Area_calcu"])
yr.head()

In [None]:
# Select the PNAS records of prefectures above a given area-ratio threshold.
def get_data_with_threshold(threshold):
    """Return PNAS rows for prefectures whose ``Ratio`` exceeds *threshold*.

    Reads the module-level frames ``yr`` (indexed by "Perfecture") and
    ``pns`` (with a "City_ID" column); neither is modified.

    Args:
        threshold: Exclusive lower bound on ``yr["Ratio"]``.

    Returns:
        A new DataFrame: the rows of ``pns`` whose "City_ID" matches a
        retained prefecture, joined with that prefecture's columns from
        ``yr`` except "Province_n".
    """
    filtered_yr = (
        yr[yr["Ratio"] > threshold]
        .drop("Province_n", axis=1)
        .reset_index()  # expose the "Perfecture" index as a join column
    )
    # Join against the filtered table, not the full ``yr``: the inner join
    # both restricts ``pns`` to the retained prefectures and keeps the
    # dropped "Province_n" column out of the result.
    return pd.merge(
        left=pns,
        right=filtered_yr,
        how="inner",
        left_on="City_ID",
        right_on="Perfecture",
    )


# Dataset for a threshold of 0.05, previewed below.
shresh_05_data = get_data_with_threshold(threshold=0.05)
shresh_05_data.head()

In [None]:
# shresh_05_data.to_csv(r"../data/perfectures/yr/perfectures_in_YR_with_threshold_0.05.csv")