In [1]:
import json
from pathlib import Path

In [4]:
# Input and output paths (relative, so they resolve against the current working directory).
# input_path:  the doccano annotation export, one JSON record per line (JSONL).
# output_path: where the reformatted cases are written as a single JSON document.
input_path = Path("25_Annotated_Cases_2.jsonl")
output_path = Path("formatted_annotated_cases_2.json")

In [5]:
# Load the doccano JSONL export: one JSON object per line, collected into `data`.
data = []
with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line at the
        # end of the file); json.loads would raise JSONDecodeError on them.
        if not line.strip():
            continue
        data.append(json.loads(line))

In [6]:
def _format_entry(entry):
    """Convert one doccano export record into the nested case structure.

    Args:
        entry: Dict with a "text" string and an optional "label" list of
            ``[start, end, label_type]`` character-offset spans into that text.

    Returns:
        Dict with "case_id" and a "result" dict holding "fault_location",
        "fault_symptoms", "fault_reason", "fault_measures" and
        "resolution_status".

    Raises:
        KeyError: If the record has no "text" field.
    """
    text = entry["text"]
    # A missing or null "label" field is treated as "no annotations".
    labels = entry.get("label") or []

    # Extract case ID and machine (e.g., IBM3 and IBM4) from the first line.
    # Empty text yields no lines, so fall back to an empty header rather than
    # raising IndexError.
    lines = text.splitlines()
    first_line = lines[0] if lines else ""
    case_id = first_line.replace("Case-ID:", "").strip()
    machine = "IBM3" if "IBM3" in case_id else "IBM4" if "IBM4" in case_id else "unknown"

    # Initialize containers
    fault_location = {"name": "", "machine": machine}
    fault_symptoms = []
    fault_reason = []
    fault_measures = []
    resolution_status = "Unknown"

    # Walk the spans in order of start offset so the output lists follow the
    # order in which the annotations appear in the text.
    for start, end, label_type in sorted(labels, key=lambda x: x[0]):
        span_text = text[start:end].strip()
        if label_type == "FaultLocation":
            # Only one location is kept: a later span overwrites an earlier one.
            fault_location = {"name": span_text, "machine": machine}
        elif label_type == "FaultSymptom":
            fault_symptoms.append(span_text)
        elif label_type == "FaultReason":
            fault_reason.append({"name": span_text})
        elif label_type == "FaultMeasure":
            fault_measures.append({"description": span_text})
        # Any other label type is ignored.

    # Take the resolution status from the first line mentioning it.
    for line in lines:
        if "Resolution Status" in line:
            # Split on the first colon only, so a status that itself contains
            # a colon (e.g. "Resolved: part replaced") is kept whole.
            _, separator, value = line.partition(":")
            if separator:
                resolution_status = value.strip()
            break

    return {
        "case_id": case_id,
        "result": {
            "fault_location": fault_location,
            "fault_symptoms": fault_symptoms,
            "fault_reason": fault_reason,
            "fault_measures": fault_measures,
            "resolution_status": resolution_status
        }
    }


# One formatted record per annotated case, in input order.
formatted_data = [_format_entry(entry) for entry in data]
    

In [7]:
# Sanity check: report how many cases were reformatted.
print(f"✅ Reformatted {len(formatted_data)} cases.")

✅ Reformatted 25 cases.


In [8]:
# Write all formatted cases to output_path as one pretty-printed JSON document.
# ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaping them.
with output_path.open("w", encoding="utf-8") as out_file:
    json.dump(formatted_data, out_file, indent=2, ensure_ascii=False)

print(f"✅ Done! Saved formatted data to: {output_path}")

✅ Done! Saved formatted data to: formatted_annotated_cases_2.json
