<a href="https://colab.research.google.com/github/TARTAR4600/CASA0004/blob/main/JSONmatching2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import google.colab
from google.colab import drive
# Mount Google Drive at /content/drive; the paths configured later in this
# notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import json
import re
from tqdm import tqdm

# ===== Path configuration =====
# Directory scanned for the per-image *.json files that get enriched
# (presumably Mask2Former segmentation output, judging by the folder name).
gdrive_input_dir = "/content/drive/MyDrive/Dissertation2/2016north_Mask2Former_output_json"
# Single JSON file loaded in full; each entry is read for data.image,
# data.longitude, data.latitude and data.heading.
big_json_path = "/content/drive/MyDrive/Dissertation2/2016north_image_final/label_studio_config.json"
# Root output directory; enriched files are written into batch_NNN subfolders.
gdrive_output_dir = "/content/drive/MyDrive/Dissertation2/2016north_Mask2Former_with_geo_batched"

batch_size = 500   # number of files processed per batch
os.makedirs(gdrive_output_dir, exist_ok=True)

# ===== Extract point_id =====
def extract_point_id(filename):
    """Return the first ``point_<digits>`` token found in *filename*.

    Args:
        filename: String to search (a file name or path).

    Returns:
        The matched token (e.g. ``"point_123"``), or ``None`` when the string
        contains no such token.
    """
    found = re.search(r"(point_\d+)", filename)
    if found is None:
        return None
    return found.group(1)

# ===== Load the big JSON and build the lookup table =====
print("读取大 JSON 数据...")
with open(big_json_path, "r", encoding="utf-8") as fh:
    big_data = json.load(fh)

# Map point_id -> {"longitude", "latitude", "heading"}.
# Entries whose image file name contains no point_<digits> token are skipped;
# if several entries share a point_id, the last one in the file wins.
geo_lookup = {}
for record in big_data:
    payload = record["data"]
    key = extract_point_id(os.path.basename(payload["image"]))
    if not key:
        continue
    # .get() leaves a field as None when the entry does not provide it.
    geo_lookup[key] = {
        field: payload.get(field)
        for field in ("longitude", "latitude", "heading")
    }

# ===== Batch processing =====
# Copies every *.json file from gdrive_input_dir into
# gdrive_output_dir/batch_NNN/, adding "longitude", "latitude" and "heading"
# looked up in geo_lookup via the point_id found in the file's "image_name".
# Files with no usable point_id, or no lookup entry, get None for all three.
#
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, while the batch a file lands in (and therefore the output path used
# by the resume check below) depends on its position in this list.
# NOTE(review): outputs produced by earlier runs with the unsorted listing may
# sit in different batch folders; those files would be written again.
files = sorted(f for f in os.listdir(gdrive_input_dir) if f.endswith(".json"))
print(f"总文件数: {len(files)}，批次大小: {batch_size}")

# Loop-invariant, so compute it once instead of once per batch.
total_batches = (len(files) - 1) // batch_size + 1

for start in range(0, len(files), batch_size):
    batch = files[start:start+batch_size]
    batch_id = start // batch_size
    out_subdir = os.path.join(gdrive_output_dir, f"batch_{batch_id:03d}")
    os.makedirs(out_subdir, exist_ok=True)

    print(f"\n处理批次 {batch_id + 1} / {total_batches} ...")

    for file in tqdm(batch):
        input_path = os.path.join(gdrive_input_dir, file)
        output_path = os.path.join(out_subdir, file)
        # Staging name: does not end in ".json", so it is never mistaken for
        # a finished output.
        tmp_path = output_path + ".tmp"

        # Skip files that already have an output (lets an interrupted run resume).
        if os.path.exists(output_path):
            continue

        try:
            with open(input_path, "r", encoding="utf-8") as f:
                mask_data = json.load(f)

            point_id = extract_point_id(mask_data["image_name"])
            if point_id and point_id in geo_lookup:
                mask_data.update(geo_lookup[point_id])
            else:
                mask_data.update({"longitude": None, "latitude": None, "heading": None})

            # Write to the staging file and rename it into place, so that
            # output_path only ever exists as a complete JSON document. Writing
            # straight to output_path would let an interrupted run leave a
            # truncated file that the resume check above skips forever.
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(mask_data, f, ensure_ascii=False)
            os.replace(tmp_path, output_path)

        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"处理失败: {file} - {e}")
            # Drop any partially written staging file; a failure here is not
            # worth aborting the run for.
            try:
                os.remove(tmp_path)
            except OSError:
                pass


读取大 JSON 数据...
总文件数: 50765，批次大小: 500

处理批次 1 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4053.27it/s]



处理批次 2 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 3496.27it/s]



处理批次 3 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 541.46it/s]



处理批次 4 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 1584.86it/s]



处理批次 5 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 3364.37it/s]



处理批次 6 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4699.72it/s]



处理批次 7 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 2566.15it/s]



处理批次 8 / 102 ...


100%|██████████| 500/500 [00:06<00:00, 82.05it/s]



处理批次 9 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4148.09it/s]



处理批次 10 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 3892.28it/s]



处理批次 11 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4093.55it/s]



处理批次 12 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 998.10it/s] 



处理批次 13 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 817.37it/s]



处理批次 14 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 3149.30it/s]



处理批次 15 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4571.02it/s]



处理批次 16 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4190.40it/s]



处理批次 17 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4093.20it/s]



处理批次 18 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4046.40it/s]



处理批次 19 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4252.01it/s]



处理批次 20 / 102 ...


100%|██████████| 500/500 [00:20<00:00, 24.49it/s]



处理批次 21 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 2297.96it/s]



处理批次 22 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 1039.08it/s]



处理批次 23 / 102 ...


100%|██████████| 500/500 [00:01<00:00, 283.82it/s]



处理批次 24 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 2497.46it/s]



处理批次 25 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 2894.16it/s]



处理批次 26 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 545.91it/s]



处理批次 27 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 2822.35it/s]



处理批次 28 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 3863.04it/s]



处理批次 29 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 3204.11it/s]



处理批次 30 / 102 ...


100%|██████████| 500/500 [00:16<00:00, 30.01it/s]



处理批次 31 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4177.71it/s]



处理批次 32 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4334.31it/s]



处理批次 33 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4258.80it/s]



处理批次 34 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 2667.60it/s]



处理批次 35 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 997.44it/s] 



处理批次 36 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4439.07it/s]



处理批次 37 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 3878.34it/s]



处理批次 38 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4663.59it/s]



处理批次 39 / 102 ...


100%|██████████| 500/500 [00:07<00:00, 68.76it/s]



处理批次 40 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4878.43it/s]



处理批次 41 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 5011.93it/s]



处理批次 42 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 2896.21it/s]



处理批次 43 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 1894.38it/s]



处理批次 44 / 102 ...


100%|██████████| 500/500 [00:12<00:00, 40.14it/s]



处理批次 45 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 5019.27it/s]



处理批次 46 / 102 ...


100%|██████████| 500/500 [00:00<00:00, 4770.29it/s]



处理批次 47 / 102 ...


100%|██████████| 500/500 [04:25<00:00,  1.88it/s]



处理批次 48 / 102 ...


100%|██████████| 500/500 [03:42<00:00,  2.24it/s]



处理批次 49 / 102 ...


100%|██████████| 500/500 [03:38<00:00,  2.28it/s]



处理批次 50 / 102 ...


100%|██████████| 500/500 [03:43<00:00,  2.24it/s]



处理批次 51 / 102 ...


100%|██████████| 500/500 [03:31<00:00,  2.36it/s]



处理批次 52 / 102 ...


100%|██████████| 500/500 [03:57<00:00,  2.11it/s]



处理批次 53 / 102 ...


100%|██████████| 500/500 [03:41<00:00,  2.26it/s]



处理批次 54 / 102 ...


100%|██████████| 500/500 [03:44<00:00,  2.22it/s]



处理批次 55 / 102 ...


100%|██████████| 500/500 [03:40<00:00,  2.27it/s]



处理批次 56 / 102 ...


100%|██████████| 500/500 [03:33<00:00,  2.34it/s]



处理批次 57 / 102 ...


100%|██████████| 500/500 [09:07<00:00,  1.10s/it]



处理批次 58 / 102 ...


100%|██████████| 500/500 [03:54<00:00,  2.13it/s]



处理批次 59 / 102 ...


100%|██████████| 500/500 [03:36<00:00,  2.31it/s]



处理批次 60 / 102 ...


100%|██████████| 500/500 [04:05<00:00,  2.03it/s]



处理批次 61 / 102 ...


100%|██████████| 500/500 [03:35<00:00,  2.32it/s]



处理批次 62 / 102 ...


100%|██████████| 500/500 [03:35<00:00,  2.33it/s]



处理批次 63 / 102 ...


100%|██████████| 500/500 [03:32<00:00,  2.36it/s]



处理批次 64 / 102 ...


100%|██████████| 500/500 [03:38<00:00,  2.29it/s]



处理批次 65 / 102 ...


100%|██████████| 500/500 [03:38<00:00,  2.29it/s]



处理批次 66 / 102 ...


100%|██████████| 500/500 [03:47<00:00,  2.20it/s]



处理批次 67 / 102 ...


100%|██████████| 500/500 [03:47<00:00,  2.20it/s]



处理批次 68 / 102 ...


100%|██████████| 500/500 [03:38<00:00,  2.28it/s]



处理批次 69 / 102 ...


100%|██████████| 500/500 [03:36<00:00,  2.31it/s]



处理批次 70 / 102 ...


100%|██████████| 500/500 [03:50<00:00,  2.17it/s]



处理批次 71 / 102 ...


100%|██████████| 500/500 [03:34<00:00,  2.33it/s]



处理批次 72 / 102 ...


100%|██████████| 500/500 [03:45<00:00,  2.22it/s]



处理批次 73 / 102 ...


100%|██████████| 500/500 [03:54<00:00,  2.13it/s]



处理批次 74 / 102 ...


100%|██████████| 500/500 [04:01<00:00,  2.07it/s]



处理批次 75 / 102 ...


100%|██████████| 500/500 [03:58<00:00,  2.10it/s]



处理批次 76 / 102 ...


100%|██████████| 500/500 [03:47<00:00,  2.20it/s]



处理批次 77 / 102 ...


100%|██████████| 500/500 [03:44<00:00,  2.23it/s]



处理批次 78 / 102 ...


100%|██████████| 500/500 [03:35<00:00,  2.32it/s]



处理批次 79 / 102 ...


100%|██████████| 500/500 [03:54<00:00,  2.13it/s]



处理批次 80 / 102 ...


100%|██████████| 500/500 [03:56<00:00,  2.12it/s]



处理批次 81 / 102 ...


100%|██████████| 500/500 [03:33<00:00,  2.35it/s]



处理批次 82 / 102 ...


100%|██████████| 500/500 [03:44<00:00,  2.23it/s]



处理批次 83 / 102 ...


100%|██████████| 500/500 [03:49<00:00,  2.18it/s]



处理批次 84 / 102 ...


100%|██████████| 500/500 [03:45<00:00,  2.22it/s]



处理批次 85 / 102 ...


100%|██████████| 500/500 [03:39<00:00,  2.28it/s]



处理批次 86 / 102 ...


100%|██████████| 500/500 [03:30<00:00,  2.37it/s]



处理批次 87 / 102 ...


100%|██████████| 500/500 [03:35<00:00,  2.32it/s]



处理批次 88 / 102 ...


100%|██████████| 500/500 [03:38<00:00,  2.28it/s]



处理批次 89 / 102 ...


100%|██████████| 500/500 [04:07<00:00,  2.02it/s]



处理批次 90 / 102 ...


100%|██████████| 500/500 [03:39<00:00,  2.27it/s]



处理批次 91 / 102 ...


100%|██████████| 500/500 [04:08<00:00,  2.01it/s]



处理批次 92 / 102 ...


100%|██████████| 500/500 [03:28<00:00,  2.40it/s]



处理批次 93 / 102 ...


100%|██████████| 500/500 [03:31<00:00,  2.36it/s]



处理批次 94 / 102 ...


100%|██████████| 500/500 [03:32<00:00,  2.35it/s]



处理批次 95 / 102 ...


100%|██████████| 500/500 [03:32<00:00,  2.36it/s]



处理批次 96 / 102 ...


100%|██████████| 500/500 [03:33<00:00,  2.34it/s]



处理批次 97 / 102 ...


100%|██████████| 500/500 [03:27<00:00,  2.41it/s]



处理批次 98 / 102 ...


100%|██████████| 500/500 [03:22<00:00,  2.47it/s]



处理批次 99 / 102 ...


100%|██████████| 500/500 [03:28<00:00,  2.40it/s]



处理批次 100 / 102 ...


100%|██████████| 500/500 [03:42<00:00,  2.25it/s]



处理批次 101 / 102 ...


100%|██████████| 500/500 [03:27<00:00,  2.40it/s]



处理批次 102 / 102 ...


100%|██████████| 265/265 [02:03<00:00,  2.14it/s]
