<a href="https://colab.research.google.com/github/Van-Wu1/cycle/blob/main/scr%20/py/s1_roadcleanV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!ls '/content/drive/MyDrive/CASA0004_Cycling/data'

Mounted at /content/drive
BoroughShp  GreatLondonShp  s1	s2_Env	s3


In [2]:
import os, glob, math
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import LineString
import networkx as nx

In [3]:
# ========== 路径 ==========
IN_DIR  = "/content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/OTcleaned"   # 输入：你的“可骑行路网”目录
OUT_DIR = "/content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/OTcleanedV2"   # 输出：清洗后

In [4]:
os.makedirs(OUT_DIR, exist_ok=True)

# ========== 参数（米制）==========
TARGET_CRS        = "EPSG:27700"  # 统一投影
ASSUME_SRC_CRS    = "EPSG:4326"   # 若输入缺失CRS，按WGS84处理
SNAP_TOL          = 2.0           # 端点吸附容差（米），0=不吸附
MIN_COMP_LENGTH   = 100.0         # 小连通分量总长度阈值（米）
DEADEND_MAX_LEN   = 20.0          # 短死端单边长度阈值（米）
DEADEND_PASSES    = 3             # 死端迭代次数

In [5]:
# ========== 工具 ==========
def to_target_crs(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    if gdf.crs is None:
        # 你说“现在是84的”，缺省就按 WGS84
        gdf = gdf.set_crs(ASSUME_SRC_CRS, allow_override=True)
    # 投到目标CRS
    if str(gdf.crs).lower() != TARGET_CRS.lower():
        gdf = gdf.to_crs(TARGET_CRS)
    return gdf

def keep_only_lines(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    # 仅保留线要素（LineString/MultiLineString），其余全部丢弃
    ok = gdf.geometry.geom_type.isin(["LineString", "MultiLineString"])
    gdf = gdf.loc[ok].copy()
    if gdf.empty:
        return gdf
    # 展开多线
    gdf = gdf.explode(index_parts=False, ignore_index=True)
    # 去掉空/无效
    gdf = gdf[~gdf.geometry.is_empty & gdf.geometry.notna()].reset_index(drop=True)
    return gdf

def snap_endpoints(gdf: gpd.GeoDataFrame, tol=0.0) -> gpd.GeoDataFrame:
    if tol <= 0 or gdf.empty:
        return gdf
    def snap_coord(x):
        return float(round(x / tol) * tol)
    def snap_line(geom):
        if geom.is_empty or geom.geom_type != "LineString":
            return geom
        coords = [(snap_coord(x), snap_coord(y)) for x, y in geom.coords]
        # 去重合点
        uniq = [coords[0]]
        for c in coords[1:]:
            if c != uniq[-1]:
                uniq.append(c)
        if len(uniq) < 2:
            return None
        return LineString(uniq)
    gdf = gdf.copy()
    gdf["geometry"] = gdf.geometry.apply(snap_line)
    gdf = gdf[~gdf.geometry.isna()].reset_index(drop=True)
    return gdf

def build_graph(gdf: gpd.GeoDataFrame):
    G = nx.Graph()
    lengths = gdf.geometry.length.values
    for idx, geom in enumerate(gdf.geometry):
        if geom is None or geom.is_empty or geom.geom_type != "LineString":
            continue
        x1, y1 = geom.coords[0]
        x2, y2 = geom.coords[-1]
        u = (round(x1, 6), round(y1, 6))
        v = (round(x2, 6), round(y2, 6))
        G.add_edge(u, v, idx=idx, length=float(lengths[idx]))
    return G

def remove_small_components(gdf: gpd.GeoDataFrame, min_total_len: float) -> gpd.GeoDataFrame:
    if gdf.empty:
        return gdf
    G = build_graph(gdf)
    keep_idx = []
    for comp in nx.connected_components(G):
        sub_edges = G.subgraph(comp).edges(data=True)
        total_len = sum(e[2].get("length", 0.0) for e in sub_edges)
        if total_len >= min_total_len:
            keep_idx.extend([e[2]["idx"] for e in sub_edges])
    keep_idx = sorted(set(keep_idx))
    return gdf.iloc[keep_idx].reset_index(drop=True)

def prune_deadends(gdf: gpd.GeoDataFrame, max_len: float, passes=1) -> gpd.GeoDataFrame:
    gdf = gdf.copy()
    for _ in range(max(1, passes)):
        if gdf.empty:
            break
        G = build_graph(gdf)
        deg = dict(G.degree())
        drop_edge_idx = set()
        for u, v, data in G.edges(data=True):
            if (deg.get(u, 0) == 1 or deg.get(v, 0) == 1) and data.get("length", 0.0) < max_len:
                drop_edge_idx.add(data["idx"])
        if not drop_edge_idx:
            break
        gdf = gdf.drop(index=list(drop_edge_idx)).reset_index(drop=True)
    return gdf

In [6]:
# ========== 主流程 ==========
files = sorted(glob.glob(os.path.join(IN_DIR, "*.geojson")))
print(f"发现 {len(files)} 个输入文件")
for fp in files:
    name = os.path.basename(fp)
    print("\n" + "="*70)
    print(f"处理：{name}")
    gdf = gpd.read_file(fp)

    n0 = len(gdf)
    # 0) 统一投影
    gdf = to_target_crs(gdf)

    # 1) 仅保留线；点/面/集合体全部丢弃
    gdf = keep_only_lines(gdf)
    n_lines = len(gdf)
    print(f"去点/面后：{n_lines}/{n0}")

    # 2) 端点吸附
    gdf = snap_endpoints(gdf, tol=SNAP_TOL)

    # 3) 删小连通分量
    gdf = remove_small_components(gdf, MIN_COMP_LENGTH)

    # 4) 剪短死端
    gdf = prune_deadends(gdf, DEADEND_MAX_LEN, passes=DEADEND_PASSES)

    # 5) 输出
    out_path = os.path.join(OUT_DIR, name)
    gdf.to_file(out_path, driver="GeoJSON")
    print(f"清理完成：{len(gdf)} 条，保存 {out_path}")

print("\n✅ 全部完成。参数：",
      f"CRS→{TARGET_CRS}；SNAP_TOL={SNAP_TOL} m；MIN_COMP_LENGTH={MIN_COMP_LENGTH} m；DEADEND_MAX_LEN={DEADEND_MAX_LEN} m ×{DEADEND_PASSES}")

发现 9 个输入文件

处理：export (1).geojson
去点/面后：26037/26082
清理完成：15864 条，保存 /content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/CQI_cleanedV2/export (1).geojson

处理：export (2).geojson
去点/面后：28071/28155
清理完成：17423 条，保存 /content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/CQI_cleanedV2/export (2).geojson

处理：export (3).geojson
去点/面后：10481/10505
清理完成：6401 条，保存 /content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/CQI_cleanedV2/export (3).geojson

处理：export (4).geojson
去点/面后：42791/42929
清理完成：27247 条，保存 /content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/CQI_cleanedV2/export (4).geojson

处理：export (5).geojson
去点/面后：85830/86972
清理完成：58731 条，保存 /content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/CQI_cleanedV2/export (5).geojson

处理：export (6).geojson
去点/面后：34913/35024
清理完成：22529 条，保存 /content/drive/MyDrive/CASA0004_Cycling/data/s1/Roads_OT/CQI_cleanedV2/export (6).geojson

处理：export (7).geojson
去点/面后：26950/26985
清理完成：16677 条，保存 /content/drive/MyDrive/CASA0004_Cycling/data/s1/Roa