## 将json文件转换为csv文件
通过 pandas 把 JSON 数据读取为 DataFrame

然后重命名列

然后存储到csv文件中

In [None]:
import pandas as pd

# Read the JSON file; orient='index' makes every top-level JSON key a row label.
# convert_axes=False keeps those labels as the original strings, so the codes
# are not coerced to numbers (which would drop any leading zeros).
data = pd.read_json(
    "data/gb2260.json",
    orient='index',
    encoding="utf-8",
    convert_axes=False,
)

# Move the row labels out of the index into an ordinary column.
df = data.reset_index()

# Name the two resulting columns (former index, then the value column).
df.columns = ["code", "name"]

# Write the CSV; utf-8-sig adds a BOM so spreadsheet tools detect the encoding.
df.to_csv("data/gb2260.csv", index=False, encoding="utf-8-sig")

print("转换完成，已保存为gb2260.csv")

## 将数据拆分为省、市、县三级

In [None]:
import pandas as pd

# Load the CSV produced by the previous step.
df = pd.read_csv("data/gb2260.csv")

# Codes are handled as text, left-padded with zeros to six characters.
df['code'] = df['code'].astype(str).str.zfill(6)

# Derive one code column per administrative level from the six-character code:
#   prov_code   - first two characters followed by '0000'
#   city_code   - first four characters followed by '00'
#   county_code - the full code, unchanged
df = df.assign(
    prov_code=df['code'].str.slice(stop=2) + '0000',
    city_code=df['code'].str.slice(stop=4) + '00',
    county_code=df['code'],
)

# 创建一个函数来获取每个级别的名称
def get_level_names(df):
    """Add province-, city- and county-level name columns to ``df``.

    ``df`` must already contain the columns ``code``, ``name``, ``prov_code``
    and ``city_code``. The frame is modified in place (columns ``prov_name``,
    ``city_name`` and ``county_name`` are added) and the same object is
    returned.

    Names are looked up by code: rows whose code ends in ``'0000'`` supply the
    province names, rows whose code ends in ``'00'`` supply the city names.
    A code with no matching row yields a missing value.
    """
    code_col = df['code']
    is_province = code_col.str[2:] == '0000'
    ends_in_00 = code_col.str[4:] == '00'

    # code -> name lookup tables for the two upper levels.
    province_names = df.loc[is_province].set_index('code')['name'].to_dict()
    city_names = df.loc[ends_in_00].set_index('code')['name'].to_dict()

    df['prov_name'] = df['prov_code'].map(province_names)
    df['city_name'] = df['city_code'].map(city_names)

    # The county-level name defaults to the row's own name.
    df['county_name'] = df['name']

    # Province rows carry their own name at the city and county levels too.
    for target in ('city_name', 'county_name'):
        df.loc[is_province, target] = df.loc[is_province, 'name']

    # City rows (ending in '00' but not a province) carry their own name at
    # the county level.
    is_city_only = ends_in_00 & ~is_province
    df.loc[is_city_only, 'county_name'] = df.loc[is_city_only, 'name']

    return df

# Attach the per-level name columns.
df = get_level_names(df)

# Classify each row by the trailing characters of its code.
tail4 = df['code'].str[2:]
tail2 = df['code'].str[4:]
level_masks = {
    '省级': tail4 == '0000',
    '市级': (tail2 == '00') & (tail4 != '0000'),
    '县级': tail2 != '00',
}

df['level'] = None
for level_label, level_mask in level_masks.items():
    df.loc[level_mask, 'level'] = level_label

# 0/1 indicator columns, one per level.
for indicator, level_label in (
    ('is_prov', '省级'),
    ('is_city', '市级'),
    ('is_county', '县级'),
):
    df[indicator] = (df['level'] == level_label).astype(int)

# Persist the structured table.
save_path = "data/gb2260_structured.csv"
df.to_csv(save_path, index=False, encoding="utf-8-sig")

print(f"处理完成，结果已保存到 {save_path}")

In [None]:
import pandas as pd

# Re-encode the GitHub geo CSV from GBK to UTF-8.
# dtype=str together with keep_default_na=False passes every field through as
# the text found in the file, so the round trip changes only the encoding:
# without them pandas infers column types, which can rewrite values (e.g. an
# integer column containing blanks is written back as "110000.0", and tokens
# such as "NA" are turned into empty cells).
pd.read_csv(
    "data/city_geo.geo@github.gbk.csv",
    encoding='gbk',
    dtype=str,
    keep_default_na=False,
).to_csv("data/city_geo.geo@github.csv", encoding='utf-8', index=False)

# 将GitHub开源的地理数据整合到gb2260文件里

In [None]:
import pandas as pd

# Load the structured administrative-division table and the coordinate table.
# The join keys are read as text: if pandas inferred a float column (which
# happens as soon as one cell is blank), converting to str afterwards would
# give values like "110000.0" and the merge below would silently match nothing.
df_admin = pd.read_csv("data/gb2260_structured.csv", dtype={'code': str})
df_geo = pd.read_csv("data/city_geo.geo@github.csv", dtype={'行政代码': str})

# Bring both key columns to the same six-character, zero-padded form.
df_admin['code'] = df_admin['code'].str.zfill(6)
# Also drop a trailing ".0" in case the file was written from a float column.
df_geo['行政代码'] = (
    df_geo['行政代码']
    .str.strip()
    .str.replace(r'\.0$', '', regex=True)
    .str.zfill(6)
)

# Left join on the administrative code so every administrative record is
# kept, even when it has no coordinates.
merged_df = pd.merge(
    df_admin,
    df_geo[['行政代码', '东经', '北纬']],
    left_on='code',
    right_on='行政代码',
    how='left',
)

# The right-hand key duplicates 'code' after the merge; drop it.
merged_df = merged_df.drop(columns='行政代码', errors='ignore')

# Use generic English names for the coordinate columns.
merged_df = merged_df.rename(columns={'东经': 'longitude', '北纬': 'latitude'})

# Save the merged table.
merged_df.to_csv("data/gb2260_with_geo.geo@github.csv", index=False, encoding="utf-8-sig")

print("处理完成，结果已保存到 data/gb2260_with_geo.geo@github.csv")
print(f"总记录数: {len(merged_df)}")
print(f"有地理坐标的记录数: {merged_df['longitude'].notna().sum()}")

# 将阿里提供的[地理数据](data/city_geo.geo@ali.json)整合到gb2260文件
**还没写完，后续慢慢写**

In [None]:
import pandas as pd

# TODO: not implemented yet.
# Load the Ali JSON file and keep only its "features" column.
geo = pd.read_json("data/city_geo.geo@ali.json")["features"]


In [None]:
# Inspect `geo`. In a notebook only the value of the last expression in a cell
# is displayed, so the result of type(geo) is computed but not shown.
type(geo)
geo

In [60]:
# Take the coordinates stored under the first feature's "geometry" entry.
gdata = geo[0]["geometry"]["coordinates"]

In [61]:
# Print each top-level element of the coordinate list on its own line.
# NOTE(review): the output below suggests each element is a nested list of
# [longitude, latitude] pairs - confirm against the data file.
for d in gdata:
    print(d)

[[[117.348611, 40.581141], [117.389879, 40.561593], [117.429915, 40.576141], [117.412669, 40.605226], [117.467487, 40.649738], [117.467487, 40.649738], [117.501364, 40.636569], [117.514914, 40.660181], [117.493973, 40.675161], [117.408973, 40.686961], [117.342451, 40.673799], [117.319662, 40.657911], [117.278394, 40.664267], [117.208177, 40.694675], [117.117018, 40.70012], [117.11209, 40.707379], [117.012308, 40.693767], [116.964881, 40.709647], [116.926692, 40.745022], [116.924229, 40.773581], [116.848468, 40.839264], [116.81336, 40.848319], [116.759773, 40.889954], [116.713577, 40.909858], [116.722201, 40.927495], [116.677853, 40.970888], [116.698795, 41.021477], [116.688324, 41.044501], [116.647672, 41.059394], [116.615643, 41.053076], [116.623034, 41.021026], [116.598397, 40.974503], [116.5676, 40.992574], [116.519557, 40.98128], [116.519557, 40.98128], [116.455499, 40.980828], [116.447492, 40.953715], [116.477057, 40.899907], [116.398216, 40.90624], [116.370499, 40.94377], [116.33