In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.ops import unary_union
from shapely.validation import make_valid
from tqdm import tqdm
import time
import logging
from collections import defaultdict
from shapely.strtree import STRtree
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def process_dlmc_group(dlmc_value, group_data, threshold, spatial_index, all_geometries):
    """处理单个DLMC组的图斑，使用更高效的方式"""
    try:
        merge_pairs = []
        processed = set()
        
        # 使用空间索引快速查找邻居
        for idx, row in group_data.iterrows():
            if idx in processed:
                continue
                
            area = row['area']
            if area >= threshold:
                continue
                
            geom = row.geometry
            possible_neighbors = spatial_index.query(geom)
            
            # 直接在循环中找到最佳邻居
            best_neighbor = None
            max_area = 0
            
            for neighbor_idx in possible_neighbors:
                if (neighbor_idx != idx and 
                    neighbor_idx not in processed and 
                    geom.touches(all_geometries[neighbor_idx])):
                    
                    neighbor_area = group_data.loc[neighbor_idx, 'area'] if neighbor_idx in group_data.index else 0
                    if neighbor_area > max_area:
                        max_area = neighbor_area
                        best_neighbor = neighbor_idx
            
            if best_neighbor is not None:
                merge_pairs.append((idx, best_neighbor))
                processed.update([idx, best_neighbor])
        
        return merge_pairs
        
    except Exception as e:
        logging.error(f"处理DLMC组 {dlmc_value} 时出错: {str(e)}")
        return []

def execute_merges(gdf, merge_operations):
    """执行合并操作，使用更高效的方式"""
    try:
        merged_count = 0
        skipped_count = 0
        
        # 使用布尔索引进行批量操作
        gdf['to_delete'] = False
        
        for small_idx, large_idx in tqdm(merge_operations, desc="执行合并操作"):
            try:
                if small_idx not in gdf.index or large_idx not in gdf.index:
                    skipped_count += 1
                    continue
                
                small_geom = gdf.loc[small_idx, 'geometry']
                large_geom = gdf.loc[large_idx, 'geometry']
                
                new_geometry = safe_union(small_geom, large_geom)
                if new_geometry is None:
                    skipped_count += 1
                    continue
                
                gdf.loc[large_idx, 'geometry'] = new_geometry
                gdf.loc[large_idx, 'area'] = new_geometry.area
                gdf.loc[small_idx, 'to_delete'] = True
                
                merged_count += 1
                
            except Exception as e:
                logging.error(f"合并图斑 {small_idx} -> {large_idx} 时出错: {str(e)}")
                skipped_count += 1
        
        # 批量删除已合并的图斑
        gdf = gdf[~gdf['to_delete']]
        gdf = gdf.drop(columns=['to_delete'])
        
        logging.info(f"合并了 {merged_count} 个图斑，跳过了 {skipped_count} 个")
        return gdf, merged_count
        
    except Exception as e:
        logging.error(f"执行合并操作时出错: {str(e)}")
        return gdf, 0

def merge_small_parcels(input_shp, output_base, dldm_field, dlmc_field, thresholds, default_threshold=50, max_iterations=10):
    """主函数，结合两个脚本的优点"""
    start_time = time.time()
    logging.info(f"开始处理。输入Shapefile: {input_shp}")
    
    try:
        # 读取数据
        gdf = gpd.read_file(input_shp)
        original_count = len(gdf)
        original_area = gdf.geometry.area.sum()
        logging.info(f"读取了 {original_count} 个图斑，总面积: {original_area}")
        logging.info(f"原始坐标系统: {gdf.crs}")
        
        # 检查多部件要素
        logging.info("正在将多部件要素转换为单部件...")
        if any(geom.geom_type.startswith('Multi') for geom in gdf.geometry):
            gdf = gdf.explode(index_parts=True)
        logging.info(f"转换后共有 {len(gdf)} 个图斑")
        
        # 重命名字段
        gdf = gdf.rename(columns={dldm_field: 'dldm', dlmc_field: 'dlmc'})
        
        iteration = 0
        total_merged = 0
        
        while iteration < max_iterations:
            iteration += 1
            logging.info(f"开始第 {iteration} 轮处理")
            
            # 计算面积
            gdf['area'] = gdf.geometry.area
            
            # 识别小图斑
            small_parcels = gdf[gdf.apply(lambda row: row['area'] < thresholds.get(row['dldm'], default_threshold), axis=1)]
            logging.info(f"小面积图斑数量: {len(small_parcels)}, 大面积图斑数量: {len(gdf) - len(small_parcels)}")
            
            if len(small_parcels) == 0:
                break
                
            # 记录小图斑的DLDM分布
            dldm_counts = small_parcels['dldm'].value_counts().to_dict()
            logging.info(f"小面积图斑DLDM分布: {dldm_counts}")
            
            # 构建空间索引
            spatial_index = STRtree(gdf.geometry.values)
            all_geometries = gdf.geometry.values
            
            merge_operations = []
            
            # 按DLMC分组处理
            for dlmc_value, group in small_parcels.groupby('dlmc'):
                threshold = thresholds.get(group.iloc[0]['dldm'], default_threshold)
                group_results = process_dlmc_group(
                    dlmc_value, group, threshold, spatial_index, all_geometries
                )
                merge_operations.extend(group_results)
            
            if not merge_operations:
                break
                
            gdf, merged_count = execute_merges(gdf, merge_operations)
            total_merged += merged_count
            
            current_area = gdf.geometry.area.sum()
            logging.info(f"第 {iteration} 轮处理完成。本轮合并 {merged_count} 个图斑。当前总面积: {current_area}")
            
            if merged_count == 0:
                break
        
        # 恢复原始字段名
        gdf = gdf.rename(columns={'dldm': dldm_field, 'dlmc': dlmc_field})
        
        # 保存结果
        output_shp = f"{output_base}.shp"
        result_truncated = gdf.rename(columns={col: col[:10] for col in gdf.columns if len(col) > 10})
        result_truncated.to_file(output_shp, encoding='utf-8')
        
        end_time = time.time()
        logging.info(f"总处理时间: {(end_time - start_time) / 60:.2f} 分钟")
        logging.info(f"总共合并: {total_merged} 个图斑")
        logging.info(f"最终图斑数量: {len(gdf)}")
        logging.info(f"减少的图斑数量: {original_count - len(gdf)}")
        logging.info(f"总面积变化: {gdf.geometry.area.sum() - original_area}")
        
    except Exception as e:
        logging.error(f"处理过程中发生错误: {str(e)}")

def safe_union(geom1, geom2):
    """安全地合并两个几何体"""
    try:
        # 检查输入几何体的有效性
        if geom1 is None or geom2 is None:
            logging.warning("输入几何体为空")
            return None
            
        # 修复无效几何体
        if not geom1.is_valid:
            geom1 = make_valid(geom1)
        if not geom2.is_valid:
            geom2 = make_valid(geom2)
            
        # 尝试合并
        union = unary_union([geom1, geom2])
        
        # 验证结果
        if not union.is_valid:
            union = make_valid(union)
        if union.is_empty:
            logging.warning("合并结果为空几何体")
            return None
            
        return union
        
    except Exception as e:
        logging.error(f"合并几何形状时出错: {str(e)}")
        return None

if __name__ == '__main__':
    input_shp = r"C:\Users\Runker\Desktop\ele_sb\sb_merge_data_single.shp"
    output_base = r"C:\Users\Runker\Desktop\ele_sb\sb_merge_data_single_result_fast_cursor_ipynb"
    dldm_field = "DLDM"
    dlmc_field = "DLMC"
    thresholds = {"01": 50, "02": 50, "03": 2000, "04": 2000}
    default_threshold = 50
    
    merge_small_parcels(input_shp, output_base, dldm_field, dlmc_field, thresholds, default_threshold)

2024-10-29 20:23:17,570 - INFO - 开始处理。输入Shapefile: C:\Users\Runker\Desktop\ele_sb\sb_merge_data_single.shp
2024-10-29 20:23:53,765 - INFO - 读取了 261981 个图斑，总面积: 1462965997.409759
2024-10-29 20:23:53,766 - INFO - 原始坐标系统: EPSG:4545
2024-10-29 20:23:53,767 - INFO - 正在将多部件要素转换为单部件...
2024-10-29 20:23:54,670 - INFO - 转换后共有 261981 个图斑
2024-10-29 20:23:54,713 - INFO - 开始第 1 轮处理
2024-10-29 20:23:55,837 - INFO - 小面积图斑数量: 109010, 大面积图斑数量: 152971
2024-10-29 20:23:55,860 - INFO - 小面积图斑DLDM分布: {'03': 80851, '01': 24358, '04': 2502, '02': 917, '12': 382}
执行合并操作: 100%|██████████| 22633/22633 [00:09<00:00, 2455.40it/s]
2024-10-29 20:25:03,986 - INFO - 合并了 22633 个图斑，跳过了 0 个
2024-10-29 20:25:04,102 - INFO - 第 1 轮处理完成。本轮合并 22633 个图斑。当前总面积: 1462965997.4097588
2024-10-29 20:25:04,102 - INFO - 开始第 2 轮处理
2024-10-29 20:25:05,169 - INFO - 小面积图斑数量: 83882, 大面积图斑数量: 155466
2024-10-29 20:25:05,170 - INFO - 小面积图斑DLDM分布: {'03': 60665, '01': 20158, '04': 1901, '02': 802, '12': 356}
执行合并操作: 100%|██████████| 13312/13312

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.ops import unary_union
from shapely.validation import make_valid
from tqdm import tqdm
import time
import logging
from collections import defaultdict
from shapely.strtree import STRtree
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def process_dlmc_group(dlmc_value, group_data, threshold, spatial_index, all_geometries):
    """处理单个DLMC组的图斑，优化性能"""
    try:
        merge_pairs = []
        processed = set()
        
        # 预先计算所有可能的邻居关系
        neighbor_dict = defaultdict(set)
        group_indices = set(group_data.index)
        
        # 批量处理空间索引查询
        for idx, row in group_data.iterrows():
            if row['area'] >= threshold:
                continue
                
            geom = row.geometry
            possible_neighbors = spatial_index.query(geom)
            
            # 过滤出有效的邻居
            for n_idx in possible_neighbors:
                if n_idx != idx and n_idx in group_indices and geom.touches(all_geometries[n_idx]):
                    neighbor_dict[idx].add(n_idx)
        
        # 使用预计算的邻居关系进行合并
        for idx in group_indices:
            if idx in processed or idx not in neighbor_dict:
                continue
                
            neighbors = neighbor_dict[idx]
            valid_neighbors = {n for n in neighbors if n not in processed}
            
            if valid_neighbors:
                # 选择面积最大的邻居
                best_neighbor = max(valid_neighbors,
                                  key=lambda x: group_data.loc[x, 'area'])
                merge_pairs.append((idx, best_neighbor))
                processed.update([idx, best_neighbor])
        
        return merge_pairs
        
    except Exception as e:
        logging.error(f"处理DLMC组 {dlmc_value} 时出错: {str(e)}")
        return []

def execute_merges(gdf, merge_operations):
    """执行合并操作，使用更高效的方式"""
    try:
        merged_count = 0
        skipped_count = 0
        
        # 使用布尔索引进行批量操作
        gdf['to_delete'] = False
        
        for small_idx, large_idx in tqdm(merge_operations, desc="执行合并操作"):
            try:
                if small_idx not in gdf.index or large_idx not in gdf.index:
                    skipped_count += 1
                    continue
                
                small_geom = gdf.loc[small_idx, 'geometry']
                large_geom = gdf.loc[large_idx, 'geometry']
                
                new_geometry = safe_union(small_geom, large_geom)
                if new_geometry is None:
                    skipped_count += 1
                    continue
                
                gdf.loc[large_idx, 'geometry'] = new_geometry
                gdf.loc[large_idx, 'area'] = new_geometry.area
                gdf.loc[small_idx, 'to_delete'] = True
                
                merged_count += 1
                
            except Exception as e:
                logging.error(f"合并图斑 {small_idx} -> {large_idx} 时出错: {str(e)}")
                skipped_count += 1
        
        # 批量删除已合并的图斑
        gdf = gdf[~gdf['to_delete']]
        gdf = gdf.drop(columns=['to_delete'])
        
        logging.info(f"合并了 {merged_count} 个图斑，跳过了 {skipped_count} 个")
        return gdf, merged_count
        
    except Exception as e:
        logging.error(f"执行合并操作时出错: {str(e)}")
        return gdf, 0

def merge_small_parcels(input_shp, output_base, dldm_field, dlmc_field, thresholds, default_threshold=50, max_iterations=10):
    """主函数，优化性能"""
    start_time = time.time()
    logging.info(f"开始处理。输入Shapefile: {input_shp}")
    
    try:
        # 读取数据
        gdf = gpd.read_file(input_shp)
        original_count = len(gdf)
        original_area = gdf.geometry.area.sum()
        
        # 初始化处理
        gdf = gdf.rename(columns={dldm_field: 'dldm', dlmc_field: 'dlmc'})
        if any(geom.geom_type.startswith('Multi') for geom in gdf.geometry):
            gdf = gdf.explode(index_parts=True)
        
        # 预先构建空间索引
        spatial_index = STRtree(gdf.geometry.values)
        all_geometries = gdf.geometry.values
        
        iteration = 0
        total_merged = 0
        
        while iteration < max_iterations:
            iteration += 1
            logging.info(f"开始第 {iteration} 轮处理")
            
            # 计算面积和识别小图斑
            gdf['area'] = gdf.geometry.area
            small_parcels = gdf[gdf.apply(lambda row: row['area'] < thresholds.get(row['dldm'], default_threshold), axis=1)]
            
            if len(small_parcels) == 0:
                break
                
            logging.info(f"小面积图斑数量: {len(small_parcels)}, 大面积图斑数量: {len(gdf) - len(small_parcels)}")
            logging.info(f"小面积图斑DLDM分布: {small_parcels['dldm'].value_counts().to_dict()}")
            
            merge_operations = []
            
            # 按DLMC分组批量处理
            for dlmc_value, group in small_parcels.groupby('dlmc'):
                threshold = thresholds.get(group.iloc[0]['dldm'], default_threshold)
                group_results = process_dlmc_group(
                    dlmc_value, group, threshold, spatial_index, all_geometries
                )
                merge_operations.extend(group_results)
            
            if not merge_operations:
                break
                
            # 执行合并操作
            gdf, merged_count = execute_merges(gdf, merge_operations)
            total_merged += merged_count
            
            if merged_count == 0:
                break
                
            # 更新空间索引和几何数组
            spatial_index = STRtree(gdf.geometry.values)
            all_geometries = gdf.geometry.values
        
        # 保存结果
        gdf = gdf.rename(columns={'dldm': dldm_field, 'dlmc': dlmc_field})
        result_truncated = gdf.rename(columns={col: col[:10] for col in gdf.columns if len(col) > 10})
        result_truncated.to_file(f"{output_base}.shp", encoding='utf-8')
        
        # 输出统计信息
        end_time = time.time()
        logging.info(f"总处理时间: {(end_time - start_time) / 60:.2f} 分钟")
        logging.info(f"总共合并: {total_merged} 个图斑")
        logging.info(f"最终图斑数量: {len(gdf)}")
        logging.info(f"减少的图斑数量: {original_count - len(gdf)}")
        logging.info(f"总面积变化: {gdf.geometry.area.sum() - original_area}")
        
    except Exception as e:
        logging.error(f"处理过程中发生错误: {str(e)}")

def safe_union(geom1, geom2):
    """安全地合并两个几何体"""
    try:
        # 检查输入几何体的有效性
        if geom1 is None or geom2 is None:
            logging.warning("输入几何体为空")
            return None
            
        # 修复无效几何体
        if not geom1.is_valid:
            geom1 = make_valid(geom1)
        if not geom2.is_valid:
            geom2 = make_valid(geom2)
            
        # 尝试合并
        union = unary_union([geom1, geom2])
        
        # 验证结果
        if not union.is_valid:
            union = make_valid(union)
        if union.is_empty:
            logging.warning("合并结果为空几何体")
            return None
            
        return union
        
    except Exception as e:
        logging.error(f"合并几何形状时出错: {str(e)}")
        return None

if __name__ == '__main__':
    input_shp = r"C:\Users\Runker\Desktop\ele_sb\sb_merge_data_single.shp"
    output_base = r"C:\Users\Runker\Desktop\ele_sb\sb_merge_data_single_result_fast_cursor_ipynb"
    dldm_field = "DLDM"
    dlmc_field = "DLMC"
    thresholds = {"01": 50, "02": 50, "03": 2000, "04": 2000}
    default_threshold = 50
    
    merge_small_parcels(input_shp, output_base, dldm_field, dlmc_field, thresholds, default_threshold)