In [2]:
import os
import json

def validate_label_format(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            data = json.load(file)
        
        # 검수 기준 1: JSON 기본 구조 확인
        if not isinstance(data, dict):
            return f"Error: File {filepath} is not a valid JSON object."
        
        # 검수 기준 2: 필수 키 확인
        required_keys = ['text', 'is_sentence', 'shapes']
        for key in required_keys:
            if key not in data:
                return f"Error: Missing required key '{key}' in file {filepath}."
        
        # 검수 기준 3: text 필드 검증
        if not isinstance(data['text'], list):
            return f"Error: 'text' is not a list in file {filepath}."
        for idx, item in enumerate(data['text']):
            if not isinstance(item, str):
                return f"Error: Non-string item at index {idx} in 'text' in file {filepath}."
        
        # 추가 검수: text 리스트에서 컴마 누락 검증
        text_raw = open(filepath, 'r', encoding='utf-8').read()  # 원본 텍스트 읽기
        if '"text": [' in text_raw:  # "text" 필드가 시작되는 부분 검사
            text_section = text_raw.split('"text": [')[1].split(']')[0]
            if '"' in text_section and ',' not in text_section:
                return f"Error: Missing commas between items in 'text' list in file {filepath}."
        
        # 검수 기준 4: shapes 항목 검증
        if not isinstance(data['shapes'], list):
            return f"Error: 'shapes' is not a list in file {filepath}."
        
        for shape in data['shapes']:
            if not isinstance(shape, dict):
                return f"Error: Shape entry is not a dictionary in file {filepath}."
            if 'label' not in shape or 'shape_type' not in shape or 'points' not in shape:
                return f"Error: Shape entry missing required keys in file {filepath}."
            if not isinstance(shape['points'], list):
                return f"Error: 'points' is not a list in file {filepath}."
            for point in shape['points']:
                if not isinstance(point, list) or len(point) != 2:
                    return f"Error: Invalid point format in file {filepath}."
                if not all(isinstance(coord, (int, float)) for coord in point):
                    return f"Error: Point coordinates must be int or float in file {filepath}."
        
        return f"File {filepath} passed validation."
    
    except json.JSONDecodeError:
        return f"Error: File {filepath} is not a valid JSON."
    except Exception as e:
        return f"Error: Unexpected error in file {filepath}: {str(e)}"

def validate_directory(directory_path):
    results = []
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.json'):
                filepath = os.path.join(root, file)
                result = validate_label_format(filepath)
                results.append(result)
    return results

# 검수 실행
directory = '/home/work/dataset/reason_seg/ReasonSeg/perishable_test'  # 검사할 디렉토리 경로
validation_results = validate_directory(directory)

# 결과 출력
for result in validation_results:
    print(result)


File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_99004109_D2S_augmented.json passed validation.
File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_99009443_D2S_augmented.json passed validation.
File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_99003909_D2S_augmented.json passed validation.
File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_035113_D2S_validation_wo_occlusion.json passed validation.
File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_99003192_D2S_augmented.json passed validation.
File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_99003823_D2S_augmented.json passed validation.
File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_99009406_D2S_augmented.json passed validation.
File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_99003391_D2S_augmented.json passed validation.
Error: File /home/work/dataset/reason_seg/ReasonSeg/perishable_test/D2S_039713_D2S_validatio

In [6]:
import os

# JSON 파일 여러 개 검증
directory = "/home/work/newdataset/reason_seg/ReasonSeg/val"

for filename in os.listdir(directory):
    if filename.endswith(".json"):
        with open(os.path.join(directory, filename), "r") as f:
            try:
                data = json.load(f)
                errors = validate_json(data)
                if errors:
                    print(f"Validation errors in {filename}:")
                    for error in errors:
                        print(f"- {error}")
                else:
                    print(f"{filename}: JSON data is valid!")
            except json.JSONDecodeError as e:
                print(f"Invalid JSON format in {filename}: {e}")


NameError: name 'validate_json' is not defined

CoT적용 Json 형태 파일 검수

In [2]:
import os
import json
from pathlib import Path

# 경로 설정
data_dir = Path("/home/work/dataset/reason_seg/ReasonSeg/val(FTcot)")

# 검수 결과 저장 리스트
valid_files = []
invalid_files = []

# 파일 순회
for json_file in data_dir.rglob("*.json"):
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # text 필드 존재 여부 및 타입 확인
        if "text" not in data:
            print(f"[MISSING] 'text' field not found in: {json_file}")
            invalid_files.append(json_file)
        elif not isinstance(data["text"], list):
            print(f"[INVALID] 'text' field is not a list in: {json_file}")
            invalid_files.append(json_file)
        else:
            # 각 문장이 str 타입인지도 확인
            for i, entry in enumerate(data["text"]):
                if not isinstance(entry, str):
                    print(f"[INVALID] Entry at index {i} is not string in: {json_file}")
                    invalid_files.append(json_file)
                    break
            else:
                valid_files.append(json_file)

    except json.JSONDecodeError as e:
        print(f"[JSON ERROR] {json_file}: {e}")
        invalid_files.append(json_file)
    except Exception as e:
        print(f"[ERROR] {json_file}: {e}")
        invalid_files.append(json_file)

# 요약 출력
print("\n--- 검수 결과 ---")
print(f"✅ Valid files: {len(valid_files)}")
print(f"❌ Invalid files: {len(invalid_files)}")



--- 검수 결과 ---
✅ Valid files: 60
❌ Invalid files: 0
