In [28]:
import pandas as pd

In [29]:
import numpy as np
import json


# Load 'data.csv' into a pandas DataFrame.
# NOTE(review): `data` is not referenced by the code shown below — confirm it is still needed.
data = pd.read_csv('data.csv')


In [51]:
import csv
import json

def compare_and_get(row, key_suffix):
    """
    Compare the two annotators' values for one field of a row.

    The values are read from the columns ``标注员1-<key_suffix>`` and
    ``标注员2-<key_suffix>`` and compared after stripping surrounding
    whitespace.

    Args:
        row: Mapping of column name to cell value (e.g. a csv.DictReader row).
        key_suffix: Shared column-name suffix, e.g. ``"person1-年龄"``.

    Returns:
        str: The common value when both annotators agree (possibly ``''``),
        otherwise ``"标注员1: <v1> ## 标注员2: <v2>"`` where an empty side is
        shown as ``EMPTY``.
    """
    def _cell(column):
        value = row.get(column)
        # A missing column and a None cell both count as empty; csv.DictReader
        # yields None for cells that a short row does not provide.
        return '' if value is None else str(value).strip()

    val1 = _cell(f'标注员1-{key_suffix}')
    val2 = _cell(f'标注员2-{key_suffix}')

    if val1 == val2:
        return val1

    # Make a one-sided blank explicit in the disagreement report.
    return f"标注员1: {val1 or 'EMPTY'} ## 标注员2: {val2 or 'EMPTY'}"

def _collect_numbered_entries(row, prefix, id_key, max_count):
    """Build the compared kind/location entries for columns ``<prefix>1`` .. ``<prefix>N``."""
    entries = []
    for i in range(1, max_count + 1):
        kind_suffix = f'{prefix}{i}-种类'
        # Keep the slot only if at least one annotator filled in its kind.
        if not (row.get(f'标注员1-{kind_suffix}') or row.get(f'标注员2-{kind_suffix}')):
            continue
        entries.append({
            id_key: i,
            # The "物品" key is used for pets as well, matching the existing output format.
            "物品": compare_and_get(row, kind_suffix),
            "位置": compare_and_get(row, f'{prefix}{i}-位置'),
        })
    return entries


def process_csv_to_json(input_file='data.csv', output_file='comparison_output.json',
                        *, max_persons=6, max_objects=7, max_pets=2):
    """
    Read a double-annotated CSV, compare both annotators' answers, and write JSON.

    Each CSV row becomes one record holding the row's identifying columns, the
    compared general fields, and lists of persons, objects and pets. Every
    compared field is produced by ``compare_and_get``.

    Args:
        input_file: Path of the CSV to read (UTF-8, with or without BOM).
        output_file: Path of the JSON file to write.
        max_persons: Highest ``person<i>`` column index to look for.
        max_objects: Highest ``good<i>`` column index to look for.
        max_pets: Highest ``pet<i>`` column index to look for.

    Returns:
        None. The records are written to ``output_file`` and a completion
        message is printed.
    """
    results = []

    # utf-8-sig drops a leading BOM so the first header is read cleanly;
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(input_file, mode='r', encoding='utf-8-sig', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # Identifying columns are copied through without comparison.
            record = {
                "id": row.get('id', ''),
                "url": row.get('url', ''),
                "工单ID": row.get('工单ID(多盲工单)', ''),
                "一致性": row.get('一致性', '')
            }

            record['general_info'] = {
                field: compare_and_get(row, field)
                for field in ('车内人数', '车内物品数', '车内宠物数', '晚上')
            }

            persons = []
            for i in range(1, max_persons + 1):
                # A person slot counts as present when either annotator gave an age.
                if not (row.get(f'标注员1-person{i}-年龄') or row.get(f'标注员2-person{i}-年龄')):
                    continue
                persons.append({
                    "person_id": i,
                    "年龄": compare_and_get(row, f'person{i}-年龄'),
                    "性别": compare_and_get(row, f'person{i}-性别'),
                    "位置": compare_and_get(row, f'person{i}-位置'),
                    "行为": compare_and_get(row, f'person{i}-行为'),
                    "衣裤": {
                        "着装1类型": compare_and_get(row, f'person{i}-衣裤-着装1类型'),
                        "着装1颜色": compare_and_get(row, f'person{i}-衣裤-着装1颜色'),
                        "着装2类型": compare_and_get(row, f'person{i}-衣裤-着装2类型'),
                        "着装2颜色": compare_and_get(row, f'person{i}-衣裤-着装2颜色')
                    }
                })
            record['persons'] = persons

            # Objects live in good<i>-* columns and pets in pet<i>-* columns.
            record['objects'] = _collect_numbered_entries(row, 'good', 'object_id', max_objects)
            record['pets'] = _collect_numbered_entries(row, 'pet', 'pet_id', max_pets)

            results.append(record)

    # ensure_ascii=False keeps the Chinese text readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as jsonfile:
        json.dump(results, jsonfile, ensure_ascii=False, indent=4)

    print(f"处理完成！结果已保存到 {output_file}")

# --- Run the script ---
# Arguments are the input CSV path and the output JSON path; change them to match your files.
process_csv_to_json('data3.csv', 'comparison_output3.json')

处理完成！结果已保存到 comparison_output3.json


In [50]:
import csv
import json

def process_data(file_path: str) -> list:
    """
    Convert an annotation CSV into a list of structured, JSON-ready records.

    Rows whose ``是否抛弃`` column equals ``抛弃`` (discard) are skipped. Every
    other row becomes a dict with its id/url, general info, a list of persons
    and a list of objects.

    Args:
        file_path (str): Path to the input CSV file (UTF-8, with or without BOM).

    Returns:
        list: One dict per kept row with the keys ``id``, ``url``, ``工单ID``,
        ``一致性``, ``general_info``, ``persons``, ``objects`` and ``pets``.
    """
    # Columns that describe the row itself rather than an object in the car.
    non_object_columns = {'id', 'url', '是否抛弃', '车内人数', '车内物品数', '车内宠物数', '晚上'}
    json_data = []

    # utf-8-sig strips a leading BOM so the first header is exactly 'id';
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as file:
        reader = csv.DictReader(file)

        # NOTE(review): every remaining column is treated as "object name ->
        # location". This is a heuristic; confirm it against the real CSV layout.
        # Computed once from the header; dict.fromkeys drops duplicate names
        # while keeping column order.
        object_columns = [
            col for col in dict.fromkeys(reader.fieldnames or [])
            if col not in non_object_columns and 'person' not in col and 'pet' not in col
        ]

        for row in reader:
            # Skip rows marked as '抛弃' (discard); a missing column means "keep".
            if row.get('是否抛弃') == '抛弃':
                continue

            general_info = {
                "车内人数": row.get('车内人数', ''),
                "车内物品数": row.get('车内物品数', ''),
                "车内宠物数": row.get('车内宠物数', ''),
                "晚上": row.get('晚上', '')
            }

            persons = []
            for i in range(1, 7):  # up to 6 possible persons
                # A person slot is present when it has an age or a position.
                if not (row.get(f'person{i}-年龄') or row.get(f'person{i}-位置')):
                    continue
                persons.append({
                    "person_id": i,
                    "年龄": row.get(f'person{i}-年龄'),
                    "性别": row.get(f'person{i}-性别'),
                    "位置": row.get(f'person{i}-位置'),
                    "行为": row.get(f'person{i}-行为'),
                    "衣裤": {
                        "着装1类型": row.get(f'person{i}-衣裤-着装1类型', ''),
                        "着装1颜色": row.get(f'person{i}-衣裤-着装1颜色', ''),
                        "着装2类型": row.get(f'person{i}-衣裤-着装2类型', ''),
                        "着装2颜色": row.get(f'person{i}-衣裤-着装2颜色', '')
                    }
                })

            objects = []
            for col in object_columns:
                # Cells can be None (short row) or whitespace-only; both mean
                # the object is absent.
                location = (row.get(col) or '').strip()
                if location:
                    objects.append({
                        "object_id": len(objects) + 1,
                        "物品": col,
                        "位置": location
                    })

            json_data.append({
                "id": row.get('id', ''),
                "url": row.get('url', ''),
                "工单ID": "...",  # placeholder: not taken from the CSV
                "一致性": "...",  # placeholder: not taken from the CSV
                "general_info": general_info,
                "persons": persons,
                "objects": objects,
                "pets": []  # no per-pet details are extracted here
            })

    return json_data

# Input CSV for this run.
file_path = 'data2.csv'
processed_json = process_data(file_path)

# Optional spot check (disabled): print only the entry whose id is '26'.
# for item in processed_json:
#     if item.get('id') == '26':
#         print(json.dumps(item, indent=4, ensure_ascii=False))
#         break
# Write every entry to Com_output.json; ensure_ascii=False keeps non-ASCII text unescaped.
with open('Com_output.json', 'w', encoding='utf-8') as f:
    json.dump(processed_json, f, ensure_ascii=False, indent=4)