In [104]:
#——————————————————————————————98——>97
import os

def get_all_filenames(directory):
    """Return the names of all entries directly inside ``directory`` as a set.

    The names come from ``os.listdir``, so sub-directory names are included
    alongside regular file names.
    """
    entry_names = os.listdir(directory)
    return set(entry_names)

def get_all_subdirs(directory):
    """Return the names of the immediate sub-directories of ``directory``."""
    subdir_names = []
    for entry in os.listdir(directory):
        if os.path.isdir(os.path.join(directory, entry)):
            subdir_names.append(entry)
    return subdir_names

def main(parent_directory, expected_count=4):
    """Keep only the files whose names occur in every sub-directory.

    The immediate sub-directories of ``parent_directory`` are discovered
    automatically.  The set of entry names common to all of them is computed,
    and every regular file whose name is not in that set is deleted.

    Args:
        parent_directory: Folder whose sub-directories are synchronised.
        expected_count: Number of sub-directories that must be present;
            when the count differs a warning is printed and nothing is
            deleted.  Defaults to 4.
    """
    subdirs = get_all_subdirs(parent_directory)

    if len(subdirs) != expected_count:
        print(f"Warning: Found {len(subdirs)} subdirectories, expected {expected_count}.")
        return

    subdir_paths = [os.path.join(parent_directory, name) for name in subdirs]
    if not subdir_paths:
        return

    # Start from the first folder's names and narrow down to the intersection.
    common_filenames = get_all_filenames(subdir_paths[0])
    for subdir_path in subdir_paths[1:]:
        common_filenames.intersection_update(get_all_filenames(subdir_path))

    # Delete everything that is not present in all folders.
    for subdir_path in subdir_paths:
        for filename in os.listdir(subdir_path):
            if filename in common_filenames:
                continue
            target = os.path.join(subdir_path, filename)
            # os.remove cannot delete directories; skipping them keeps one
            # nested folder from aborting the clean-up half-way through.
            if os.path.isfile(target):
                os.remove(target)

# Call the main function
parent_directory = 'F:/all/驾驶证/////////'  # replace with the path of your parent folder
main(parent_directory)


In [105]:
#——————————————————————处理label，按照points合并value
import os
import json
from collections import defaultdict

def process_data(data):
    """Merge label items of the same category into one item per category.

    Each element of ``data`` is read as a dict with the keys ``points`` (a
    sequence of points whose index 1 is used as the y coordinate),
    ``category`` and ``value``.

    Within a category the items are ordered by the mean y of their points
    and their ``value`` strings are concatenated in that order.  The merged
    item keeps the ``points`` of the first (top-most) item.  The merged items
    are finally sorted by the mean y of their ``points``.

    Args:
        data: Iterable of label dicts as described above.

    Returns:
        A list of dicts with the keys ``points``, ``category`` and ``value``.
    """
    def mean_y(points):
        # An item without points has no position; treat it as y == 0 rather
        # than dividing by zero.
        if not points:
            return 0.0
        return sum(point[1] for point in points) / len(points)

    # Group the items by category.
    grouped = defaultdict(list)
    for item in data:
        grouped[item['category']].append(item)

    # Merge the value fields of each category, top to bottom.
    merged_data = []
    for category, items in grouped.items():
        items.sort(key=lambda item: mean_y(item['points']))
        merged_data.append({
            "points": items[0]['points'],  # the first item's points stand for the group
            "category": category,
            "value": ''.join(item['value'] for item in items),
        })

    # Order the merged items by their vertical position.
    merged_data.sort(key=lambda item: mean_y(item['points']))
    return merged_data

def process_directory(directory):
    """Rewrite every ``.json`` file in ``directory`` with its merged label data.

    Each file is loaded, passed through ``process_data`` and overwritten in
    place (UTF-8, non-ASCII characters kept, 4-space indent).
    """
    json_names = [name for name in os.listdir(directory) if name.endswith('.json')]
    for name in json_names:
        path = os.path.join(directory, name)

        with open(path, 'r', encoding='utf-8') as src:
            loaded = json.load(src)

        merged = process_data(loaded)

        with open(path, 'w', encoding='utf-8') as dst:
            json.dump(merged, dst, ensure_ascii=False, indent=4)

def main():
    """Run ``process_directory`` on the hard-coded ``Labels`` folder."""
    labels_directory = 'F:/all/驾驶证/Labels/'
    process_directory(labels_directory)

# Run the script
main()

In [106]:
#————————————————————删除labels文件夹下json文件的points
import os
import json

def remove_points_from_json_file(filepath):
    """Delete the ``points`` key from every object in a JSON file, in place.

    The file is read as UTF-8 and written back as UTF-8 with non-ASCII
    characters kept and a 4-space indent.
    """
    with open(filepath, 'r', encoding='utf-8') as src:
        records = json.load(src)

    for record in records:
        if 'points' not in record:
            continue
        del record['points']

    with open(filepath, 'w', encoding='utf-8') as dst:
        json.dump(records, dst, ensure_ascii=False, indent=4)  # keep non-ASCII text readable

def main(directory):
    """Apply ``remove_points_from_json_file`` to each ``.json`` file in ``directory``."""
    for name in os.listdir(directory):
        # Only JSON files are touched.
        if not name.endswith('.json'):
            continue
        remove_points_from_json_file(os.path.join(directory, name))

# Replace with the path of your folder
directory = 'F:/all/驾驶证/Labels/'
main(directory)


In [108]:
#————————————————————————————————处理dataelem——————————————>label
import os
import json

def convert_format(data):
    """Turn a ``{category: values}`` mapping into a list of label entries.

    Every key becomes one ``{"category": key, "value": ...}`` dict whose
    value is the concatenation of ``values``, or ``""`` when ``values`` is
    empty or otherwise falsy.
    """
    return [
        {"category": key, "value": ''.join(values) if values else ""}
        for key, values in data.items()
    ]

def process_json_file(filepath):
    """Convert one JSON file with ``convert_format`` and overwrite it in place."""
    with open(filepath, 'r', encoding='utf-8') as src:
        original = json.load(src)

    converted = convert_format(original)

    # The converted content replaces the original file.
    with open(filepath, 'w', encoding='utf-8') as dst:
        json.dump(converted, dst, ensure_ascii=False, indent=4)

def main(directory):
    """Run ``process_json_file`` on each ``.json`` file directly inside ``directory``."""
    for name in os.listdir(directory):
        # Only JSON files are touched.
        if not name.endswith('.json'):
            continue
        process_json_file(os.path.join(directory, name))

# Replace with the path of your folder
directory = 'F:/all/驾驶证/dataelem/'
main(directory)


In [67]:
#————————————————————————————————处理duguang——————————————>label
import os
import json

def process_first_format(filepath):
    """Reduce a JSON file to the content of its top-level ``data`` key.

    When the key is missing an empty object is written instead.  The file is
    overwritten in place.
    """
    with open(filepath, 'r', encoding='utf-8') as src:
        document = json.load(src)

    payload = document.get('data', {})

    # The extracted payload replaces the original file.
    with open(filepath, 'w', encoding='utf-8') as dst:
        json.dump(payload, dst, ensure_ascii=False, indent=4)

def convert_to_new_format(data):
    """Turn a ``{key: value}`` mapping into a list of ``category``/``value`` dicts."""
    return [{"category": key, "value": value} for key, value in data.items()]

def process_second_format(filepath):
    """Convert one JSON file with ``convert_to_new_format`` and overwrite it in place."""
    with open(filepath, 'r', encoding='utf-8') as src:
        original = json.load(src)

    converted = convert_to_new_format(original)

    # The converted content replaces the original file.
    with open(filepath, 'w', encoding='utf-8') as dst:
        json.dump(converted, dst, ensure_ascii=False, indent=4)

def main(directory):
    """Run both conversion steps on each ``.json`` file directly inside ``directory``."""
    for name in os.listdir(directory):
        # Only JSON files are touched.
        if not name.endswith('.json'):
            continue
        path = os.path.join(directory, name)

        # Step 1 keeps only the ``data`` part; step 2 reshapes it into a list.
        process_first_format(path)
        process_second_format(path)

# Replace with the path of your folder
directory = 'F:/all/房产证/duguang/'
main(directory)


In [109]:
#————————————————————————————————处理textin——————————————>label————————————驾驶证√房产证√
import os
import json

def convert_to_labels_format(data):
    """Convert a textin-style result document into the Labels list format.

    The items are read from ``data["result"]["item_list"]`` (both levels
    default to empty).  Each item whose ``value`` is not blank becomes
    ``{"category": item["description"], "value": item["value"]}``.
    """
    result = data.get('result', {})
    labels = []
    for entry in result.get('item_list', []):
        # Items with a whitespace-only value are dropped.
        if not entry["value"].strip():
            continue
        labels.append({"category": entry["description"], "value": entry["value"]})
    return labels

def process_json_file(filepath):
    """Convert one JSON file with ``convert_to_labels_format`` and overwrite it in place."""
    with open(filepath, 'r', encoding='utf-8') as src:
        original = json.load(src)

    converted = convert_to_labels_format(original)

    # The converted content replaces the original file.
    with open(filepath, 'w', encoding='utf-8') as dst:
        json.dump(converted, dst, ensure_ascii=False, indent=4)

def main(directory):
    """Run ``process_json_file`` on each ``.json`` file directly inside ``directory``."""
    json_names = [name for name in os.listdir(directory) if name.endswith('.json')]
    for name in json_names:
        process_json_file(os.path.join(directory, name))

# Replace with the path of your folder
directory = 'F:/all/驾驶证/textin/'
main(directory)


In [110]:
#————————————————————————————————处理baidu——————————————>label——————————————驾驶证√
import os
import json

def convert_format_B(data):
    """Convert a document with a ``words_result`` mapping into label entries.

    Every ``key: value`` pair of ``data["words_result"]`` (default: empty)
    becomes ``{"category": key, "value": value["words"]}``; a missing
    ``words`` entry yields an empty string.
    """
    words_result = data.get("words_result", {})
    return [
        {"category": key, "value": value.get("words", "")}
        for key, value in words_result.items()
    ]

# Folder "B": every JSON file in it is converted in place.
folder_path = 'F:/all/驾驶证/baidu/'

for filename in os.listdir(folder_path):
    # Anything that is not a JSON file is left alone.
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(folder_path, filename)

    # Load the original document.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Reshape it into the label list format.
    converted_data = convert_format_B(data)

    # Overwrite the file with the converted content.
    with open(filepath, 'w', encoding='utf-8') as file:
        json.dump(converted_data, file, ensure_ascii=False, indent=4)

print("Conversion for folder B complete!")


Conversion for folder B complete!


In [70]:
#__________________________处理duguang，删除空值
import os
import json

def remove_empty_values(data):
    """Return the items of ``data`` whose ``value`` is not blank after stripping."""
    kept = []
    for item in data:
        if item['value'].strip() != "":
            kept.append(item)
    return kept

def process_json_file(filepath):
    """Filter one JSON file with ``remove_empty_values`` and overwrite it in place."""
    with open(filepath, 'r', encoding='utf-8') as src:
        original = json.load(src)

    filtered = remove_empty_values(original)

    # The filtered content replaces the original file.
    with open(filepath, 'w', encoding='utf-8') as dst:
        json.dump(filtered, dst, ensure_ascii=False, indent=4)

def main():
    """Run ``process_json_file`` on every ``.json`` entry below the hard-coded root."""
    root_directory = "F:/all/房产证/"

    # Visit the root and every nested folder.
    for current_dir, _dirs, _files in os.walk(root_directory):
        json_names = [name for name in os.listdir(current_dir) if name.endswith('.json')]
        for name in json_names:
            process_json_file(os.path.join(current_dir, name))

# Run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [42]:
#——————————————————————————————————————————————删除空格和换行
import os
import json

def clean_value(item):
    """Remove spaces and newline characters from ``item['value']`` in place.

    The same ``item`` object is returned after being modified.
    """
    without_spaces = item['value'].replace(' ', '')
    item['value'] = without_spaces.replace('\n', '')
    return item

def remove_empty_values(data):
    """Clean every item with ``clean_value`` and drop those left with a blank value."""
    # All items are cleaned first; filtering happens afterwards.
    cleaned = list(map(clean_value, data))
    kept = []
    for item in cleaned:
        if item['value'].strip() != "":
            kept.append(item)
    return kept

def process_json_file(filepath):
    """Pass one JSON file through ``remove_empty_values`` and overwrite it in place."""
    with open(filepath, 'r', encoding='utf-8') as src:
        original = json.load(src)

    cleaned = remove_empty_values(original)

    # The cleaned content replaces the original file.
    with open(filepath, 'w', encoding='utf-8') as dst:
        json.dump(cleaned, dst, ensure_ascii=False, indent=4)

def main():
    """Run ``process_json_file`` on every ``.json`` entry below the hard-coded root."""
    root_directory = "F:/all/电子承兑汇票/"

    # Visit the root and every nested folder.
    for current_dir, _dirs, _files in os.walk(root_directory):
        for name in os.listdir(current_dir):
            if not name.endswith('.json'):
                continue
            process_json_file(os.path.join(current_dir, name))

# Run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [112]:
import os
import json

def get_categories_from_file(filepath):
    """Return the set of ``category`` values found in one JSON file.

    Items without a ``category`` key contribute the empty string.
    """
    with open(filepath, 'r', encoding='utf-8') as src:
        records = json.load(src)

    found = set()
    for record in records:
        found.add(record.get('category', ''))
    return found

def process_directory(directory):
    """Collect the ``category`` values of all ``.json`` files in ``directory``.

    Returns the distinct values as a sorted list so the output is stable.
    """
    collected = set()
    for name in os.listdir(directory):
        # Only JSON files are inspected.
        if not name.endswith('.json'):
            continue
        collected.update(get_categories_from_file(os.path.join(directory, name)))
    return sorted(collected)

def main(base_directory):
    """Print a numbered list of categories for each folder next to ``base_directory``.

    The folders are the sub-directories of ``os.path.dirname(base_directory)``.
    NOTE: when ``base_directory`` ends with a path separator, ``dirname``
    only strips that separator, so the folders listed are the
    sub-directories of ``base_directory`` itself.
    """
    parent_directory = os.path.dirname(base_directory)
    folders = []
    for entry in os.listdir(parent_directory):
        candidate = os.path.join(parent_directory, entry)
        if os.path.isdir(candidate):
            folders.append(candidate)

    for outer_index, folder in enumerate(folders, start=1):
        categories = process_directory(folder)
        # Show forward slashes regardless of the platform separator.
        formatted_folder = folder.replace("\\", "/")
        print(f"{outer_index}. Categories for folder '{formatted_folder}':")

        for inner_index, category in enumerate(categories, start=1):
            print(f"{inner_index}. {category}")

        print('-' * 40)  # separator line between folders

# Replace with the path of your folder
base_directory = 'F:/all/驾驶证/'
main(base_directory)


1. Categories for folder 'F:/all/驾驶证/baidu':
1. 住址
2. 准驾车型
3. 出生日期
4. 初次领证日期
5. 发证单位
6. 国籍
7. 姓名
8. 当前时间
9. 性别
10. 有效期限
11. 条形码下编号
12. 档案编号
13. 状态
14. 生成时间
15. 累积记分
16. 至
17. 证号
----------------------------------------
2. Categories for folder 'F:/all/驾驶证/dataelem':
1. 住址
2. 准驾车型
3. 出生日期
4. 初次领证日期
5. 国籍
6. 姓名
7. 性别
8. 有效期限
9. 标题
10. 档案编号
11. 记录
12. 证号
----------------------------------------
3. Categories for folder 'F:/all/驾驶证/Labels':
1. 住址
2. 准驾车型
3. 出生日期
4. 初次领证日期
5. 国籍
6. 姓名
7. 性别
8. 有效期限
9. 标题
10. 档案编号
11. 记录
12. 证号
----------------------------------------
4. Categories for folder 'F:/all/驾驶证/textin':
1. 住址
2. 准驾车型
3. 出生日期
4. 初次领证日期
5. 发证机关
6. 国籍
7. 姓名
8. 当前时间
9. 性别
10. 有限期始(至)
11. 档案编号
12. 状态
13. 生成时间
14. 类型
15. 累积记分
16. 记录
17. 驾驶证证号
----------------------------------------


In [113]:
import os
import json

# Per-directory category renaming tables: the outer key is a directory path,
# the inner dict maps a category name as found in that directory's JSON files
# to the category name it should be stored under.
MAPPINGS = {
    "F:/all/驾驶证/baidu": {
        "出生日期": "出生日期",
        "有效期限": "有效期限",
        "姓名": "姓名",
        "初次领证日期": "初次领证日期",
        "国籍": "国籍",
        "证号": "证号",
        "档案编号": "档案编号",
        "准驾车型": "准驾车型",
        "性别": "性别",
        "住址": "住址"
    },
    "F:/all/驾驶证/dataelem": {
        "出生日期": "出生日期",
        "有效期限": "有效期限",
        "姓名": "姓名",
        "初次领证日期": "初次领证日期",
        "国籍": "国籍",
        "证号": "证号",
        "档案编号": "档案编号",
        "准驾车型": "准驾车型",
        "性别": "性别",
        "住址": "住址"
    },
    "F:/all/驾驶证/textin": {
        "出生日期": "出生日期",
        "姓名": "姓名",
        "初次领证日期": "初次领证日期",
        "准驾车型": "准驾车型",
        "国籍": "国籍",
        "性别": "性别",
        "住址": "住址",
        "驾驶证证号":"证号"
    }
}
# MAPPINGS = {
#     "F:/all/房产证/dataelem": {
#         "共有情况": "共有情况",
#         "合计建筑面积": "合计建筑面积",
#         "土地使用权取得方式": "土地使用权取得方式",
#         "套内建筑面积": "套内建筑面积",
#         "建筑面积": "建筑面积",
#         "总层数": "总层数",
#         "房屋坐落": "房屋坐落",
#         "房屋性质": "房屋性质",
#         "房屋所有权人": "房屋所有权人",
#         "权证号": "权证号",
#         "登记时间": "登记时间",
#         "规划用途": "规划用途"
#     },
#     "F:/all/房产证/duguang": {
#         "mutualOwnershipState": "共有情况",
#         "buildingArea": "合计建筑面积",
#         "location": "房屋坐落",
#         "rightProperty": "房屋性质",
#         "obligee": "房屋所有权人",
#         "certificateNumber": "权证号",
#         "usage": "规划用途"
#     },
#     "F:/all/房产证/textin": {
#         "共有情况": "共有情况",
#         "合计建筑面积": "合计建筑面积",
#         "使用权取得方式": "土地使用权取得方式",
#         "套内建筑面积": "套内建筑面积",
#         "建筑面积": "建筑面积",
#         "总层数": "总层数",
#         "房地坐落": "房屋坐落",
#         "所有权性质": "房屋性质",
#         "房屋所有权人": "房屋所有权人",
#         "权属编号": "权证号",
#         "登记日期": "登记时间",
#         "用途": "规划用途"
#     }
# }






def process_directory(directory):
    """Rename and filter the categories of every ``.json`` file in ``directory``.

    The renaming table is looked up in ``MAPPINGS`` under the exact
    ``directory`` string; when there is none (or it is empty) a message is
    printed and nothing is changed.  Items whose category has no mapping are
    dropped; the others get the mapped category name.  Files are overwritten
    in place.
    """
    directory_mapping = MAPPINGS.get(directory)
    if not directory_mapping:
        print(f"No mappings found for directory: {directory}")
        return

    for name in os.listdir(directory):
        # Only JSON files are touched.
        if not name.endswith('.json'):
            continue

        path = os.path.join(directory, name)
        with open(path, 'r', encoding='utf-8') as src:
            records = json.load(src)

        # Keep only the items whose category is mapped.
        kept = []
        for record in records:
            new_category = directory_mapping.get(record['category'])
            if not new_category:
                continue
            record['category'] = new_category
            kept.append(record)

        with open(path, 'w', encoding='utf-8') as dst:
            json.dump(kept, dst, ensure_ascii=False, indent=4)

def main():
    """Run ``process_directory`` on each of the hard-coded folders."""
    for directory in (
        "F:/all/驾驶证/baidu",
        "F:/all/驾驶证/dataelem",
        "F:/all/驾驶证/textin",
    ):
        process_directory(directory)

# Run the script (only when executed directly, not when imported)
if __name__ == '__main__':
    main()


In [11]:
import os
import json
import re

def process_json_file(filepath):
    """Reduce digit-containing values in a JSON file to their digits only.

    For every item whose ``value`` contains at least one digit, all
    non-digit characters are removed; values without any digit are left
    unchanged.  The file is overwritten in place with a 2-space indent.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        records = json.load(f)

    for record in records:
        text = record["value"]
        # Values without a single digit stay as they are.
        if re.search(r'\d', text) is None:
            continue
        record["value"] = re.sub(r'\D', '', text)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

def main():
    """Run ``process_json_file`` on every ``.json`` file below the hard-coded root."""
    root_folder = "F:/test17"

    for current_dir, _dirs, names in os.walk(root_folder):
        for name in names:
            if not name.endswith('.json'):
                continue
            process_json_file(os.path.join(current_dir, name))

# Run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [114]:
import os
import json
import re

def process_json_file(filepath):
    """Strip punctuation and whitespace from every ``value`` in a JSON file.

    Every character that is neither matched by ``\\w`` nor in the range
    U+4E00..U+9FA5 is removed.  Note that ``\\w`` also matches the
    underscore, so underscores are kept.  The file is overwritten in place
    with a 2-space indent.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        records = json.load(f)

    for record in records:
        record["value"] = re.sub(r'[^\w\u4e00-\u9fa5]', '', record["value"])

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

def main():
    """Run ``process_json_file`` on every ``.json`` file below the hard-coded root."""
    root_folder = "F:/all/驾驶证/"

    for current_dir, _dirs, names in os.walk(root_folder):
        json_names = [name for name in names if name.endswith('.json')]
        for name in json_names:
            process_json_file(os.path.join(current_dir, name))

# Run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [16]:
#for ________________________________________电子承兑汇票
import json
import os
import pandas as pd

def get_filenames(path):
    """Return the full paths of all regular files directly inside ``path``."""
    file_paths = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isfile(full_path):
            file_paths.append(full_path)
    return file_paths

def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly: the other steps of this
    pipeline write their JSON as UTF-8 with non-ASCII characters unescaped,
    and the default encoding of ``open`` depends on the platform locale.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(file_path, data):
    """Write ``data`` to ``file_path`` as indented JSON.

    Non-ASCII characters are written unescaped, so the file is encoded as
    UTF-8 explicitly instead of relying on the platform's default encoding.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def get_intersection(dataelem, duguang, textin):
    """Return the items that occur in all three lists of flat dicts.

    Two items are equal when they have the same key/value pairs, regardless
    of key order.  The result follows the order of ``dataelem`` and holds
    each distinct item once.  All values must be hashable.
    """
    def item_key(item):
        # frozenset makes the comparison independent of key order.
        return frozenset(item.items())

    duguang_keys = {item_key(item) for item in duguang}
    textin_keys = {item_key(item) for item in textin}

    seen = set()
    common = []
    for item in dataelem:
        key = item_key(item)
        if key in seen or key not in duguang_keys or key not in textin_keys:
            continue
        seen.add(key)
        common.append(dict(item))
    return common

def calculate_accuracy(labels, new):
    """Return the fraction of items in ``new`` that also occur in ``labels``.

    Returns 0 when ``new`` is empty, matching the per-category calculation,
    instead of dividing by zero.
    """
    if not new:
        return 0
    correct_count = sum(1 for item in new if item in labels)
    return correct_count / len(new)

def calculate_category_accuracy(labels, new):
    """Compute the accuracy of ``new`` per category, plus the wrong items.

    Only categories that occur in ``labels`` are evaluated.  For each one,
    the accuracy is the share of the ``new`` items of that category that
    also occur among the ``labels`` items of that category; it is 0 when
    ``new`` has no item of that category.

    Returns:
        ``(category_accuracy, wrong_examples)``: a dict of accuracies keyed
        by category, and a dict of the non-matching ``new`` items keyed by
        category (categories without wrong items are omitted).
    """
    category_accuracy = {}
    wrong_examples = {}

    for category in {label['category'] for label in labels}:
        expected = [label for label in labels if label['category'] == category]
        predicted = [item for item in new if item['category'] == category]

        # Every predicted item is either a hit or a miss.
        misses = [item for item in predicted if item not in expected]
        hits = len(predicted) - len(misses)

        category_accuracy[category] = hits / len(predicted) if predicted else 0

        if misses:
            wrong_examples[category] = misses

    return category_accuracy, wrong_examples

def main():
    """Intersect the three OCR results per file and report per-category accuracy.

    For every file in ``<base_path>/Labels`` the same-named files from the
    ``dataelem``, ``duguang`` and ``textin`` folders are intersected, the
    intersection is written to ``<base_path>/new`` and compared with the
    labels.  Wrong items are printed, and the accuracies are saved to
    ``<base_path>/accuracy_per_category.xlsx``.
    """
    base_path = "F:/test17"
    labels_path = os.path.join(base_path, "Labels")
    new_path = os.path.join(base_path, "new")
    os.makedirs(new_path, exist_ok=True)

    all_category_accuracy = {}
    all_wrong_examples = {}

    for filename in get_filenames(labels_path):
        name = os.path.basename(filename)

        # Build sibling paths from the base folder; a textual replace of
        # "Labels" would also rewrite that word inside a file name.
        labels = read_json(filename)
        dataelem = read_json(os.path.join(base_path, "dataelem", name))
        duguang = read_json(os.path.join(base_path, "duguang", name))
        textin = read_json(os.path.join(base_path, "textin", name))

        intersection = get_intersection(dataelem, duguang, textin)
        write_json(os.path.join(new_path, name), intersection)

        category_accuracy, wrong_examples = calculate_category_accuracy(labels, intersection)
        all_category_accuracy[name] = category_accuracy
        all_wrong_examples[name] = wrong_examples

        # Print the wrong examples.
        for category, items in wrong_examples.items():
            print(f"Wrong examples for {name}, category: {category}: {items}")

    # Save the per-category accuracy next to the data, under base_path
    # (previously a relative "test17" folder that may not exist).
    df = pd.DataFrame(all_category_accuracy)
    df.to_excel(os.path.join(base_path, "accuracy_per_category.xlsx"))

# Run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Wrong examples for 790410c86fa2bbc9412c136717a21aea.json, category: 金额小写: [{'category': '金额小写', 'value': '400000000'}]
Wrong examples for a93ac81c3594f5bd8ea0c6d9c684bdf4.json, category: 收款人账号: [{'category': '收款人账号', 'value': '8110301014100004463'}]
Wrong examples for fd33edfe2af529c5246fc2d3ad126a6e.json, category: 金额小写: [{'category': '金额小写', 'value': '100000000'}]
Wrong examples for 合并PDF_04.json, category: 金额大写: [{'category': '金额大写', 'value': '伍拾万圆整'}]
Wrong examples for 合并PDF_05.json, category: 收款人账号: [{'category': '收款人账号', 'value': '201000005384984'}]
Wrong examples for 合并PDF_07.json, category: 金额小写: [{'category': '金额小写', 'value': '50000000'}]
Wrong examples for 合并PDF_07.json, category: 收款人账号: [{'category': '收款人账号', 'value': '201000005384984'}]
Wrong examples for 图片 4.json, category: 收款人全称: [{'category': '收款人全称', 'value': '广东易建建筑劳务有限公司'}]
Wrong examples for 图片 4.json, category: 金额小写: [{'category': '金额小写', 'value': '92166866'}]


In [115]:
#for ________________________________________驾驶证
import json
import os
import pandas as pd

def get_filenames(path):
    """Return the full paths of all regular files directly inside ``path``."""
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly: the other steps of this
    pipeline write their JSON as UTF-8 with non-ASCII characters unescaped,
    and the default encoding of ``open`` depends on the platform locale.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(file_path, data):
    """Write ``data`` to ``file_path`` as indented JSON.

    Non-ASCII characters are written unescaped, so the file is encoded as
    UTF-8 explicitly instead of relying on the platform's default encoding.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def get_intersection(dataelem, baidu, textin):
    """Return the items that occur in all three lists of flat dicts.

    Two items are equal when they have the same key/value pairs, regardless
    of key order.  The result follows the order of ``dataelem`` and holds
    each distinct item once.  All values must be hashable.
    """
    def item_key(item):
        # frozenset makes the comparison independent of key order.
        return frozenset(item.items())

    baidu_keys = {item_key(item) for item in baidu}
    textin_keys = {item_key(item) for item in textin}

    seen = set()
    common = []
    for item in dataelem:
        key = item_key(item)
        if key in seen or key not in baidu_keys or key not in textin_keys:
            continue
        seen.add(key)
        common.append(dict(item))
    return common

def calculate_accuracy(labels, intersection):
    """Return the fraction of items in ``intersection`` that also occur in ``labels``.

    Returns 0 when ``intersection`` is empty, matching the per-category
    calculation, instead of dividing by zero.
    """
    if not intersection:
        return 0
    correct_count = sum(1 for item in intersection if item in labels)
    return correct_count / len(intersection)

def calculate_category_accuracy(labels, intersection):
    """Compute the accuracy of ``intersection`` per category, plus the wrong items.

    Only categories that occur in ``labels`` are evaluated.  For each one,
    the accuracy is the share of the ``intersection`` items of that category
    that also occur among the ``labels`` items of that category; it is 0
    when ``intersection`` has no item of that category.

    Returns:
        ``(category_accuracy, wrong_examples)``: a dict of accuracies keyed
        by category, and a dict of the non-matching items keyed by category
        (categories without wrong items are omitted).
    """
    category_accuracy = {}
    wrong_examples = {}

    for category in {label['category'] for label in labels}:
        expected = [label for label in labels if label['category'] == category]
        candidates = [item for item in intersection if item['category'] == category]

        # Every candidate is either a hit or a miss.
        misses = [item for item in candidates if item not in expected]
        hits = len(candidates) - len(misses)

        if candidates:
            category_accuracy[category] = hits / len(candidates)
        else:
            category_accuracy[category] = 0

        if misses:
            wrong_examples[category] = misses

    return category_accuracy, wrong_examples

def main():
    """Intersect the three OCR results per file and report per-category accuracy.

    For every file in ``<base_path>/Labels`` the same-named files from the
    ``dataelem``, ``baidu`` and ``textin`` folders are intersected, the
    intersection is written to ``<base_path>/intersection`` and compared
    with the labels.  Wrong items are printed, and the accuracies are saved
    to ``accuracy_per_category.xlsx`` under ``base_path``.
    """
    base_path = "F:/all/驾驶证/"
    labels_path = os.path.join(base_path, "Labels")
    intersection_path = os.path.join(base_path, "intersection")
    os.makedirs(intersection_path, exist_ok=True)

    all_category_accuracy = {}
    all_wrong_examples = {}

    for filename in get_filenames(labels_path):
        name = os.path.basename(filename)

        # Build sibling paths from the base folder; a textual replace of
        # "Labels" would also rewrite that word inside a file name.
        labels = read_json(filename)
        dataelem = read_json(os.path.join(base_path, "dataelem", name))
        baidu = read_json(os.path.join(base_path, "baidu", name))
        textin = read_json(os.path.join(base_path, "textin", name))

        intersection = get_intersection(dataelem, baidu, textin)
        write_json(os.path.join(intersection_path, name), intersection)

        category_accuracy, wrong_examples = calculate_category_accuracy(labels, intersection)
        all_category_accuracy[name] = category_accuracy
        all_wrong_examples[name] = wrong_examples

        # Print the wrong examples.
        for category, items in wrong_examples.items():
            print(f"Wrong examples for {name}, category: {category}: {items}")

    # Save the per-category accuracy to Excel.
    df = pd.DataFrame(all_category_accuracy)
    df.to_excel(os.path.join(base_path, "accuracy_per_category.xlsx"))

# Run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [5]:
#for ________________________________________房产证
import json
import os
import pandas as pd

def get_filenames(path):
    """Return the full paths of all regular files directly inside ``path``."""
    result = []
    for entry in os.listdir(path):
        joined = os.path.join(path, entry)
        if not os.path.isfile(joined):
            continue
        result.append(joined)
    return result

def read_json(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly: the other steps of this
    pipeline write their JSON as UTF-8 with non-ASCII characters unescaped,
    and the default encoding of ``open`` depends on the platform locale.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_json(file_path, data):
    """Write ``data`` to ``file_path`` as indented JSON.

    Non-ASCII characters are written unescaped, so the file is encoded as
    UTF-8 explicitly instead of relying on the platform's default encoding.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def get_intersection(dataelem, duguang, textin):
    """Return the items that occur in all three lists of flat dicts.

    Two items are equal when they have the same key/value pairs, regardless
    of key order.  The result follows the order of ``dataelem`` and holds
    each distinct item once.  All values must be hashable.
    """
    def item_key(item):
        # frozenset makes the comparison independent of key order.
        return frozenset(item.items())

    duguang_keys = {item_key(item) for item in duguang}
    textin_keys = {item_key(item) for item in textin}

    seen = set()
    common = []
    for item in dataelem:
        key = item_key(item)
        if key in seen or key not in duguang_keys or key not in textin_keys:
            continue
        seen.add(key)
        common.append(dict(item))
    return common

def calculate_accuracy(labels, intersection):
    """Return the fraction of items in ``intersection`` that also occur in ``labels``.

    Returns 0 when ``intersection`` is empty, matching the per-category
    calculation, instead of dividing by zero.
    """
    if not intersection:
        return 0
    correct_count = sum(1 for item in intersection if item in labels)
    return correct_count / len(intersection)

def calculate_category_accuracy(labels, intersection):
    """Compute the accuracy of ``intersection`` per category, plus the wrong items.

    Only categories that occur in ``labels`` are evaluated.  For each one,
    the accuracy is the share of the ``intersection`` items of that category
    that also occur among the ``labels`` items of that category; it is 0
    when ``intersection`` has no item of that category.

    Returns:
        ``(category_accuracy, wrong_examples)``: a dict of accuracies keyed
        by category, and a dict of the non-matching items keyed by category
        (categories without wrong items are omitted).
    """
    category_accuracy = {}
    wrong_examples = {}

    label_categories = {label['category'] for label in labels}
    for category in label_categories:
        reference = [label for label in labels if label['category'] == category]
        found = [item for item in intersection if item['category'] == category]

        # Split the found items into matches and mismatches.
        wrong = [item for item in found if item not in reference]
        right_count = len(found) - len(wrong)

        category_accuracy[category] = right_count / len(found) if found else 0

        if wrong:
            wrong_examples[category] = wrong

    return category_accuracy, wrong_examples

def main():
    """Intersect the three OCR results per file and report per-category accuracy.

    For every file in ``<base_path>/Labels`` the same-named files from the
    ``dataelem``, ``duguang`` and ``textin`` folders are intersected, the
    intersection is written to ``<base_path>/intersection`` and compared
    with the labels.  Wrong items are printed, and the accuracies are saved
    to ``accuracy_per_category1010.xlsx`` in the current working directory.
    """
    base_path = "F:/all/房产证/"
    labels_path = os.path.join(base_path, "Labels")
    intersection_path = os.path.join(base_path, "intersection")
    os.makedirs(intersection_path, exist_ok=True)

    all_category_accuracy = {}
    all_wrong_examples = {}

    for filename in get_filenames(labels_path):
        name = os.path.basename(filename)

        # Build sibling paths from the base folder; a textual replace of
        # "Labels" would also rewrite that word inside a file name.
        labels = read_json(filename)
        dataelem = read_json(os.path.join(base_path, "dataelem", name))
        duguang = read_json(os.path.join(base_path, "duguang", name))
        textin = read_json(os.path.join(base_path, "textin", name))

        intersection = get_intersection(dataelem, duguang, textin)
        write_json(os.path.join(intersection_path, name), intersection)

        category_accuracy, wrong_examples = calculate_category_accuracy(labels, intersection)
        all_category_accuracy[name] = category_accuracy
        all_wrong_examples[name] = wrong_examples

        for category, items in wrong_examples.items():
            print(f"Wrong examples for {name}, category: {category}: {items}")

    df = pd.DataFrame(all_category_accuracy)
    df.to_excel("accuracy_per_category1010.xlsx")

# Run main() only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


In [26]:
#————————————————————————————————————————俩俩交集————————————————房产证，电子承兑汇票（duguang）
import os
import json

base_path = "F:/all/电子承兑汇票/"
folders = ["dataelem", "duguang", "textin"]

# Create the target folders: one for the three-way intersection and one for
# each pair of sources.
os.makedirs(os.path.join(base_path, "intersection"), exist_ok=True)
os.makedirs(os.path.join(base_path, "dataelem_duguang_intersection"), exist_ok=True)
os.makedirs(os.path.join(base_path, "dataelem_textin_intersection"), exist_ok=True)
os.makedirs(os.path.join(base_path, "duguang_textin_intersection"), exist_ok=True)

# The file list is taken from the first source folder.
file_list = os.listdir(os.path.join(base_path, "dataelem"))

for file in file_list:
    # Read the same-named JSON file from every source folder.  The files are
    # opened in a ``with`` block so each handle is closed, and decoded as
    # UTF-8 instead of the locale default.
    json_contents = []
    for folder in folders:
        with open(os.path.join(base_path, folder, file), "r", encoding="utf-8") as f:
            json_contents.append(json.load(f))

    # Compare items through their JSON text; sort_keys makes the text
    # independent of the key order inside each item.
    json_sets = [
        {json.dumps(entry, sort_keys=True) for entry in content}
        for content in json_contents
    ]

    intersection = json_sets[0].intersection(*json_sets[1:])
    dataelem_duguang_intersection = json_sets[0].intersection(json_sets[1])
    dataelem_textin_intersection = json_sets[0].intersection(json_sets[2])
    duguang_textin_intersection = json_sets[1].intersection(json_sets[2])

    # Turn the JSON text back into dicts; sorting gives a stable output order.
    intersection = [json.loads(text) for text in sorted(intersection)]
    dataelem_duguang_intersection = [json.loads(text) for text in sorted(dataelem_duguang_intersection)]
    dataelem_textin_intersection = [json.loads(text) for text in sorted(dataelem_textin_intersection)]
    duguang_textin_intersection = [json.loads(text) for text in sorted(duguang_textin_intersection)]

    # Save each result under the same file name in its target folder.
    for target_folder, result in (
        ("intersection", intersection),
        ("dataelem_duguang_intersection", dataelem_duguang_intersection),
        ("dataelem_textin_intersection", dataelem_textin_intersection),
        ("duguang_textin_intersection", duguang_textin_intersection),
    ):
        with open(os.path.join(base_path, target_folder, file), "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)

In [29]:
#————————————————————————————————————————俩俩交集————————————————驾驶证（baidu）
import os
import json

base_path = "F:/all/驾驶证/"
folders = ["dataelem", "baidu", "textin"]

# Create the target folders: one for the three-way intersection and one for
# each pair of sources.
os.makedirs(os.path.join(base_path, "intersection"), exist_ok=True)
os.makedirs(os.path.join(base_path, "dataelem_baidu_intersection"), exist_ok=True)
os.makedirs(os.path.join(base_path, "dataelem_textin_intersection"), exist_ok=True)
os.makedirs(os.path.join(base_path, "baidu_textin_intersection"), exist_ok=True)

# The file list is taken from the first source folder.
file_list = os.listdir(os.path.join(base_path, "dataelem"))

for file in file_list:
    # Read the same-named JSON file from every source folder.  The files are
    # opened in a ``with`` block so each handle is closed, and decoded as
    # UTF-8 instead of the locale default.
    json_contents = []
    for folder in folders:
        with open(os.path.join(base_path, folder, file), "r", encoding="utf-8") as f:
            json_contents.append(json.load(f))

    # Compare items through their JSON text; sort_keys makes the text
    # independent of the key order inside each item.
    json_sets = [
        {json.dumps(entry, sort_keys=True) for entry in content}
        for content in json_contents
    ]

    intersection = json_sets[0].intersection(*json_sets[1:])
    dataelem_baidu_intersection = json_sets[0].intersection(json_sets[1])
    dataelem_textin_intersection = json_sets[0].intersection(json_sets[2])
    baidu_textin_intersection = json_sets[1].intersection(json_sets[2])

    # Turn the JSON text back into dicts; sorting gives a stable output order.
    intersection = [json.loads(text) for text in sorted(intersection)]
    dataelem_baidu_intersection = [json.loads(text) for text in sorted(dataelem_baidu_intersection)]
    dataelem_textin_intersection = [json.loads(text) for text in sorted(dataelem_textin_intersection)]
    baidu_textin_intersection = [json.loads(text) for text in sorted(baidu_textin_intersection)]

    # Save each result under the same file name in its target folder.
    for target_folder, result in (
        ("intersection", intersection),
        ("dataelem_baidu_intersection", dataelem_baidu_intersection),
        ("dataelem_textin_intersection", dataelem_textin_intersection),
        ("baidu_textin_intersection", baidu_textin_intersection),
    ):
        with open(os.path.join(base_path, target_folder, file), "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)


In [39]:
import os
import json

# Maps every supported bracket variant to the matching ASCII parenthesis.
# Add more entries here as needed.
_BRACKET_TABLE = str.maketrans({
    '（': '(', '）': ')',
    '[': '(', ']': ')',
    '{': '(', '}': ')',
    '〈': '(', '〉': ')',
    '《': '(', '》': ')',
})


def replace_brackets_in_value(data):
    """Replace all types of brackets in the value field with regular brackets.

    Args:
        data: Iterable of dict-like items. Items are modified in place.

    Returns:
        The same ``data`` object that was passed in.

    Items without a "value" key are left alone. A "value" that is not a string
    (for example JSON ``null``) is also left unchanged instead of raising
    ``AttributeError``, so one odd entry does not abort a whole batch.
    """
    for item in data:
        if "value" not in item:
            continue
        value = item["value"]
        if isinstance(value, str):
            # A single translate pass replaces the chain of ten .replace() calls.
            item["value"] = value.translate(_BRACKET_TABLE)
    return data

def process_json_file(filepath):
    """Rewrite one JSON file in place with the brackets in its value fields normalised.

    The file at ``filepath`` is parsed as UTF-8 JSON, passed through
    ``replace_brackets_in_value`` and then written back to the same path
    (UTF-8, non-ASCII characters kept as-is, 4-space indentation).
    """
    with open(filepath, 'r', encoding='utf-8') as source:
        loaded = json.load(source)

    # Transform before reopening for writing, so the file is only truncated
    # once the replacement content is ready.
    normalised = replace_brackets_in_value(loaded)

    with open(filepath, 'w', encoding='utf-8') as target:
        json.dump(normalised, target, ensure_ascii=False, indent=4)

def main(directory):
    """Run ``process_json_file`` on every ``*.json`` file found under ``directory``.

    The tree is traversed with ``os.walk``, so files in nested sub-directories
    are included. Files with any other extension are skipped.
    """
    for current_dir, _subdirs, names in os.walk(directory):
        json_names = (name for name in names if name.endswith('.json'))
        for name in json_names:
            process_json_file(os.path.join(current_dir, name))

# Root directory where the subdirectories with JSON files are located.
# Every *.json file found beneath it is overwritten in place by main().
directory = 'F:/all/电子承兑汇票/'
main(directory)


In [45]:
#需要将文件格式全部转化完成________________here________________________
import json
import os
import pandas as pd

# Root folder of the data set scored by this cell. The reference JSON files are
# read from BASE_PATH/Labels and the files to check from BASE_PATH/<folder>
# for each name listed in `folders`.
BASE_PATH = "F:/all/驾驶证"  # <-- Adjust this path when needed

def calculate_counts(standard_data, data_to_check):
    """Compare one file's entries with its reference entries, category by category.

    Args:
        standard_data: Iterable of dicts with 'category' and 'value' keys (the
            reference). When a category occurs more than once, only the value
            of its first entry is used for comparison.
        data_to_check: Iterable of dicts with 'category' and 'value' keys.
            Entries whose category does not occur in ``standard_data`` are
            ignored.

    Returns:
        A tuple ``(correct_counts, total_counts, errors)``:
        ``correct_counts`` maps each reference category to the number of
        checked entries whose value equals the reference value;
        ``total_counts`` maps each reference category to the number of checked
        entries carrying that category; ``errors`` is a list of
        ``{"category", "expected", "found"}`` dicts, one per mismatch, in the
        order the mismatches were encountered.
    """
    # Index the reference once, keeping the first entry per category. The
    # original rescanned the whole reference list for every checked entry.
    first_standard = {}
    for item in standard_data:
        first_standard.setdefault(item['category'], item)

    correct_counts = dict.fromkeys(first_standard, 0)
    total_counts = dict.fromkeys(first_standard, 0)
    errors = []

    for entry in data_to_check:
        name = entry['category']
        if name not in first_standard:
            continue
        total_counts[name] += 1
        found = entry['value']
        expected = first_standard[name]['value']
        if found == expected:
            correct_counts[name] += 1
        else:
            errors.append({
                "category": name,
                "expected": expected,
                "found": found
            })

    return correct_counts, total_counts, errors

# Sub-folders of BASE_PATH whose JSON files are scored against the labels.
folders = [
    'dataelem',
    'baidu',
    'textin',
    'intersection',
    'dataelem_baidu_intersection',
    'dataelem_textin_intersection',
    'baidu_textin_intersection',
]
results = {}
category_counts_in_labels = {}  # category -> number of label entries carrying it

labels_dir = os.path.join(BASE_PATH, "Labels")
labels_files = [name for name in os.listdir(labels_dir) if name.endswith('.json')]

# Count categories in Labels
for json_file in labels_files:
    with open(os.path.join(labels_dir, json_file), 'r', encoding="utf-8") as standard_file:
        standard_data = json.load(standard_file)
    for category in standard_data:
        cat = category['category']
        category_counts_in_labels.setdefault(cat, 0)
        category_counts_in_labels[cat] += 1

# Create a list to collect error records
errors_list = []

# Parse every label file once up front. The original re-opened and re-parsed
# each label file again for every folder.
standard_by_file = {}
for json_file in labels_files:
    with open(os.path.join(BASE_PATH, "Labels", json_file), 'r', encoding="utf-8") as standard_file:
        standard_by_file[json_file] = json.load(standard_file)

for folder in folders:
    # Per-category totals accumulated over all files of this folder.
    folder_correct_counts = {}
    folder_total_counts = {}
    for json_file in labels_files:
        candidate_path = os.path.join(BASE_PATH, folder, json_file)
        if not os.path.exists(candidate_path):
            # This folder has no file for this label; it contributes nothing.
            continue

        standard_data = standard_by_file[json_file]
        with open(candidate_path, 'r', encoding="utf-8") as file:
            data = json.load(file)
        correct_counts, total_counts, file_errors = calculate_counts(standard_data, data)

        # Print errors for the file
        for error in file_errors:
            print(f"In folder '{folder}', file '{json_file}': Expected '{error['category']}' to be '{error['expected']}', but found '{error['found']}'.")

            # Append the error record to errors_list
            errors_list.append({
                "Folder": folder,
                "File": json_file,
                "Category": error["category"],
                "Expected": error["expected"],
                "Found": error["found"]
            })

        for cat in correct_counts:
            folder_correct_counts[cat] = folder_correct_counts.get(cat, 0) + correct_counts[cat]
            folder_total_counts[cat] = folder_total_counts.get(cat, 0) + total_counts[cat]

    results[folder] = (folder_correct_counts, folder_total_counts)

# Convert errors_list to a DataFrame
errors_df = pd.DataFrame(errors_list)

with pd.ExcelWriter('accuracy_results_驾驶证1023.xlsx') as writer:
    # Union of the categories seen in any folder, in first-seen order.
    # The original took the categories of the first folder only, which dropped
    # categories seen only elsewhere and raised StopIteration on empty results.
    all_categories = list(dict.fromkeys(
        category
        for per_folder_correct, _ in results.values()
        for category in per_folder_correct
    ))
    data = []
    for category in all_categories:
        row = [category, category_counts_in_labels.get(category, 0)]
        for folder in folders:
            correct_counts, total_counts = results[folder]
            # A folder that never saw this category counts as 0/0 rather than
            # raising KeyError.
            correct = correct_counts.get(category, 0)
            total = total_counts.get(category, 0)
            accuracy = correct / total if total else 0
            row.extend([correct, total, accuracy])
        data.append(row)

    # Three columns per folder, in the same order as the values appended above.
    headers = ['Category', 'Counts in Labels']
    for folder in folders:
        headers.extend([f"{folder} Correct Count", f"{folder} Total Count", f"{folder} Accuracy"])

    df = pd.DataFrame(data, columns=headers)
    df.to_excel(writer, sheet_name='Results', index=False)

    # Save the errors DataFrame as another sheet
    errors_df.to_excel(writer, sheet_name='Errors', index=False)


In folder 'dataelem', file '4206assurance2.json': Expected '住址' to be '广东省龙川县老隆镇居民新村居委会东风西路46号', but found '西路46号广东省龙川县老隆镇居民新村居委会东风'.
In folder 'baidu', file '2562assurance2.json': Expected '有效期限' to be '20151210至20251210', but found '20151210'.
In folder 'baidu', file '2724assurance2.json': Expected '住址' to be '广东省深圳市福田区彩虹新都彩虹大厦15B', but found '广东省深圳市福田区彩虹新都彩虹六厦15B'.
In folder 'baidu', file '2724assurance2.json': Expected '有效期限' to be '20150811至20250811', but found '20150811'.
In folder 'baidu', file '3359assurance2.json': Expected '住址' to be '浙江省平阳县昆阳镇前宕', but found '浙江省平阳县昆阳镇前罗'.
In folder 'baidu', file '3359assurance2.json': Expected '有效期限' to be '20111208至20211208', but found '20111208'.
In folder 'baidu', file '3630assurance2.json': Expected '有效期限' to be '20170906至20230906', but found '20170906'.
In folder 'baidu', file '4206assurance2.json': Expected '有效期限' to be '20151210至20251210', but found '20151210'.
In folder 'baidu', file '下载 (1).json': Expected '有效期限' to be '20151231至202

In [46]:
#需要将文件格式全部转化完成________________here________________________
import json
import os
import pandas as pd

# Root folder of the data set scored by this cell. The reference JSON files are
# read from BASE_PATH/Labels and the files to check from BASE_PATH/<folder>
# for each name listed in `folders`.
BASE_PATH = "F:/all/房产证"  # <-- Adjust this path when needed

def calculate_counts(standard_data, data_to_check):
    """Compare one file's entries with its reference entries, category by category.

    Args:
        standard_data: Iterable of dicts with 'category' and 'value' keys (the
            reference). When a category occurs more than once, only the value
            of its first entry is used for comparison.
        data_to_check: Iterable of dicts with 'category' and 'value' keys.
            Entries whose category does not occur in ``standard_data`` are
            ignored.

    Returns:
        A tuple ``(correct_counts, total_counts, errors)``:
        ``correct_counts`` maps each reference category to the number of
        checked entries whose value equals the reference value;
        ``total_counts`` maps each reference category to the number of checked
        entries carrying that category; ``errors`` is a list of
        ``{"category", "expected", "found"}`` dicts, one per mismatch, in the
        order the mismatches were encountered.
    """
    # Index the reference once, keeping the first entry per category. The
    # original rescanned the whole reference list for every checked entry.
    first_standard = {}
    for item in standard_data:
        first_standard.setdefault(item['category'], item)

    correct_counts = dict.fromkeys(first_standard, 0)
    total_counts = dict.fromkeys(first_standard, 0)
    errors = []

    for entry in data_to_check:
        name = entry['category']
        if name not in first_standard:
            continue
        total_counts[name] += 1
        found = entry['value']
        expected = first_standard[name]['value']
        if found == expected:
            correct_counts[name] += 1
        else:
            errors.append({
                "category": name,
                "expected": expected,
                "found": found
            })

    return correct_counts, total_counts, errors

# Sub-folders of BASE_PATH whose JSON files are scored against the labels.
folders = [
    'dataelem',
    'duguang',
    'textin',
    'intersection',
    'dataelem_duguang_intersection',
    'dataelem_textin_intersection',
    'duguang_textin_intersection',
]
results = {}
category_counts_in_labels = {}  # category -> number of label entries carrying it

labels_dir = os.path.join(BASE_PATH, "Labels")
labels_files = [name for name in os.listdir(labels_dir) if name.endswith('.json')]

# Count categories in Labels
for json_file in labels_files:
    with open(os.path.join(labels_dir, json_file), 'r', encoding="utf-8") as standard_file:
        standard_data = json.load(standard_file)
    for category in standard_data:
        cat = category['category']
        category_counts_in_labels.setdefault(cat, 0)
        category_counts_in_labels[cat] += 1

# Create a list to collect error records
errors_list = []

# Parse every label file once up front. The original re-opened and re-parsed
# each label file again for every folder.
standard_by_file = {}
for json_file in labels_files:
    with open(os.path.join(BASE_PATH, "Labels", json_file), 'r', encoding="utf-8") as standard_file:
        standard_by_file[json_file] = json.load(standard_file)

for folder in folders:
    # Per-category totals accumulated over all files of this folder.
    folder_correct_counts = {}
    folder_total_counts = {}
    for json_file in labels_files:
        candidate_path = os.path.join(BASE_PATH, folder, json_file)
        if not os.path.exists(candidate_path):
            # This folder has no file for this label; it contributes nothing.
            continue

        standard_data = standard_by_file[json_file]
        with open(candidate_path, 'r', encoding="utf-8") as file:
            data = json.load(file)
        correct_counts, total_counts, file_errors = calculate_counts(standard_data, data)

        # Print errors for the file
        for error in file_errors:
            print(f"In folder '{folder}', file '{json_file}': Expected '{error['category']}' to be '{error['expected']}', but found '{error['found']}'.")

            # Append the error record to errors_list
            errors_list.append({
                "Folder": folder,
                "File": json_file,
                "Category": error["category"],
                "Expected": error["expected"],
                "Found": error["found"]
            })

        for cat in correct_counts:
            folder_correct_counts[cat] = folder_correct_counts.get(cat, 0) + correct_counts[cat]
            folder_total_counts[cat] = folder_total_counts.get(cat, 0) + total_counts[cat]

    results[folder] = (folder_correct_counts, folder_total_counts)

# Convert errors_list to a DataFrame
errors_df = pd.DataFrame(errors_list)

with pd.ExcelWriter('accuracy_results_房产证1023.xlsx') as writer:
    # Union of the categories seen in any folder, in first-seen order.
    # The original took the categories of the first folder only, which dropped
    # categories seen only elsewhere and raised StopIteration on empty results.
    all_categories = list(dict.fromkeys(
        category
        for per_folder_correct, _ in results.values()
        for category in per_folder_correct
    ))
    data = []
    for category in all_categories:
        row = [category, category_counts_in_labels.get(category, 0)]
        for folder in folders:
            correct_counts, total_counts = results[folder]
            # A folder that never saw this category counts as 0/0 rather than
            # raising KeyError.
            correct = correct_counts.get(category, 0)
            total = total_counts.get(category, 0)
            accuracy = correct / total if total else 0
            row.extend([correct, total, accuracy])
        data.append(row)

    # Three columns per folder, in the same order as the values appended above.
    headers = ['Category', 'Counts in Labels']
    for folder in folders:
        headers.extend([f"{folder} Correct Count", f"{folder} Total Count", f"{folder} Accuracy"])

    df = pd.DataFrame(data, columns=headers)
    df.to_excel(writer, sheet_name='Results', index=False)

    # Save the errors DataFrame as another sheet
    errors_df.to_excel(writer, sheet_name='Errors', index=False)

In folder 'dataelem', file 'CgqJJ2FnmB-AcgQKAAJpLfGoDTg748.json': Expected '房屋坐落' to be '石景山区五里坨西小街7号院1号楼6层1单元603', but found '603石景山区五里坨西小街7号院1号楼6层1单元'.
In folder 'dataelem', file 'CgqJJWFMHQWAJe_wAARx4RmnrsI980.json': Expected '房屋坐落' to be '丰台区马家堡东路189号院16号楼14层8单元1402', but found '1402丰台区马家堡东路189号院16号楼14层8单元'.
In folder 'dataelem', file 'CgqJJWFoEBmAfNG0AAx7yjf4xks746.json': Expected '合计建筑面积' to be '70.28', but found '70/28'.
In folder 'dataelem', file 'other (9).json': Expected '房屋坐落' to be '西城区广安门车站西街15号院6号楼5层园②-5-502', but found '502西城区广安门车站西街15号院6号楼5层园②-5-'.
In folder 'dataelem', file '房本2429.json': Expected '房屋坐落' to be '昌平区北七家镇名佳花园四区15号楼5层3单元352', but found '352昌平区北七家镇名佳花园四区15号楼5层3单元'.
In folder 'duguang', file '1 (276).json': Expected '权证号' to be 'X京房权证西字第160936号', but found '京房权证西字第160936号'.
In folder 'duguang', file 'CgqJJ2E-BSeAFqeuAAJb3hHGR4Y625.json': Expected '权证号' to be 'X京房权证东字第086876号', but found '京房权证东字第086876号'.
In folder 'duguang', file 'CgqJJ2F0wV2AU-ukAAJnLrZPQqs