In [7]:
import json
from pathlib import Path
from typing import Dict, List, Any

In [26]:
pip install fuzzywuzzy python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-macosx_11_0_arm64.whl (156 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.4/156.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading rapidfuzz-3.13.0-cp312-cp312-macosx_11_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m11.9 MB/s

In [27]:
from fuzzywuzzy import fuzz

In [8]:
def load_json(path: Path) -> List[Dict]:
    """Read the UTF-8 encoded JSON file at ``path`` and return its parsed content.

    The return annotation says "list of dictionaries", but the function does
    not validate the shape: whatever top-level value the JSON document holds
    is returned as-is.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

In [29]:
def evaluate_entities_detailed(
    human: List[Dict],
    extracted: List[Dict],
    fuzzy_threshold: int = 85,
    scorer: Any = None,
) -> Dict[str, Any]:
    """Compare human-annotated cases with extracted cases, entity type by entity type.

    Cases are paired on ``case_id``; human cases with no extracted counterpart
    are skipped. Each paired case must provide, under ``"result"``:
    ``fault_location`` (with ``name`` and ``machine``), ``fault_symptoms``
    (list of strings), ``fault_reason`` (list of dicts with ``name``),
    ``fault_measures`` (list of dicts with ``description``) and
    ``resolution_status``.

    Args:
        human: Reference cases.
        extracted: Cases to evaluate against the reference.
        fuzzy_threshold: Minimum similarity score for an unmatched human item
            and an unmatched extracted item to be paired as a "mismatch"
            (near match) instead of "missing" + "hallucination".
        scorer: Optional ``callable(a, b) -> number`` used for similarity.
            Defaults to ``fuzz.partial_ratio``.

    Returns:
        ``{case_id: {entity_type: {"exact_matches", "mismatches", "missing",
        "hallucinations"}}}``. All lists are sorted and the fuzzy pairing
        walks candidates in sorted order, so results are reproducible across
        runs (plain set iteration order varies with string hash seeding).
    """
    # Resolved lazily: `fuzz` is only looked up when no scorer is supplied.
    score_fn = fuzz.partial_ratio if scorer is None else scorer

    def normalize(items) -> set:
        """Lower-case/strip every item; drop None and blank entries."""
        return {
            i.lower().strip()
            for i in (items or [])
            if i is not None and i.strip()
        }

    def compare_items(h_items, e_items):
        """Classify items into exact matches, near matches, missing and extra."""
        h_set = normalize(h_items)
        e_set = normalize(e_items)

        exact = h_set & e_set
        miss = h_set - e_set
        extra = e_set - h_set
        mismatches = []

        # Greedy pairing: each unmatched human item claims its best-scoring
        # unmatched extracted item. Sorted iteration makes ties deterministic.
        for m in sorted(miss):
            best_score, best_e = 0, None
            for e in sorted(extra):
                score = score_fn(m, e)
                if score > best_score:
                    best_score, best_e = score, e

            # best_e is None when there were no candidates (or all scored 0);
            # never record a pairing with a non-existent item.
            if best_e is not None and best_score >= fuzzy_threshold:
                mismatches.append((m, best_e))
                miss.discard(m)
                extra.discard(best_e)

        return {
            "exact_matches": sorted(exact),
            "mismatches": mismatches,
            "missing": sorted(miss),
            "hallucinations": sorted(extra),
        }

    # Index extracted cases once; setdefault keeps the first occurrence of a
    # duplicated case_id, matching a first-match linear search.
    extracted_by_id = {}
    for ec in extracted:
        extracted_by_id.setdefault(ec["case_id"], ec)

    detailed_results = {}
    for human_case in human:
        extracted_case = extracted_by_id.get(human_case["case_id"])
        if not extracted_case:
            continue

        h_res, e_res = human_case["result"], extracted_case["result"]

        detailed_results[human_case["case_id"]] = {
            # Location is compared as a single "<name>_<machine>" token.
            "fault_location": compare_items(
                [f"{h_res['fault_location']['name']}_{h_res['fault_location']['machine']}"],
                [f"{e_res['fault_location']['name']}_{e_res['fault_location']['machine']}"]
            ),
            "fault_symptoms": compare_items(
                h_res["fault_symptoms"],
                e_res["fault_symptoms"]
            ),
            "fault_reasons": compare_items(
                [r["name"] for r in h_res["fault_reason"]],
                [r["name"] for r in e_res["fault_reason"]]
            ),
            "fault_measures": compare_items(
                [m["description"] for m in h_res["fault_measures"]],
                [m["description"] for m in e_res["fault_measures"]]
            ),
            "resolution_status": compare_items(
                [h_res["resolution_status"]],
                [e_res["resolution_status"]]
            ),
        }

    return detailed_results


In [33]:
def main(
    human_annotations_path: Path = Path("/Users/wbm/Documents/BIT/Research Topics/Evaluation Knowledge Extraction/formatted_annotated_cases_2.json"),
    extracted_output_path: Path = Path("/Users/wbm/Documents/BIT/Research Topics/few-shot-prompting/baml_extracted_20_cases.json"),
    output_path: Path = Path("per_case_evaluation_details.json"),
):
    """Evaluate extracted cases against human annotations and report the outcome.

    Loads both JSON files, runs ``evaluate_entities_detailed``, prints the
    aggregate counts (with their share of all classified items) and writes
    the per-case breakdown to ``output_path`` as UTF-8 JSON.

    Args:
        human_annotations_path: JSON file with the reference cases.
        extracted_output_path: JSON file with the extracted cases.
        output_path: Destination for the per-case details.

    The defaults reproduce the original hard-coded locations; pass explicit
    paths to run the evaluation on another machine or dataset.
    """
    human_data = load_json(human_annotations_path)
    extracted_data = load_json(extracted_output_path)

    detailed_results = evaluate_entities_detailed(human_data, extracted_data)

    # Aggregate the four outcome categories over every case and entity type.
    total_exact = total_mismatch = total_missing = total_hallucinations = 0

    for case_id, metrics in detailed_results.items():
        for entity_type, entity_metrics in metrics.items():
            total_exact += len(entity_metrics["exact_matches"])
            total_mismatch += len(entity_metrics["mismatches"])
            total_missing += len(entity_metrics["missing"])
            total_hallucinations += len(entity_metrics["hallucinations"])

    total = total_exact + total_mismatch + total_missing + total_hallucinations

    def share(count: int) -> float:
        # No case ids in common (or only empty entities) gives total == 0;
        # report 0.0% instead of raising ZeroDivisionError.
        return count / total if total else 0.0

    print("\nEvaluation Summary")
    print("============================")
    print(f"Exact Matches: {total_exact} ({share(total_exact):.1%})")
    print(f"Mismatches: {total_mismatch} ({share(total_mismatch):.1%})")
    print(f"Missing: {total_missing} ({share(total_missing):.1%})")
    print(f"Potential Hallucinations: {total_hallucinations} ({share(total_hallucinations):.1%})")

    # Save detailed results for manual checking
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(detailed_results, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Per-case evaluation details saved as '{output_path}'")

# Run the evaluation: prints the summary and writes per_case_evaluation_details.json.
main()



Evaluation Summary
Exact Matches: 68 (51.5%)
Mismatches: 23 (17.4%)
Missing: 21 (15.9%)
Potential Hallucinations: 20 (15.2%)

✅ Per-case evaluation details saved as 'per_case_evaluation_details.json'


In [34]:
from pathlib import Path

def load_json(path: Path) -> list:
    """Parse the UTF-8 JSON file at ``path`` and return the decoded value.

    NOTE(review): this redefines (and shadows) the ``load_json`` declared
    earlier in the notebook; the two behave the same.
    """
    with open(path, "r", encoding="utf-8") as source:
        payload = json.load(source)
    return payload

# Read both inputs into memory: the annotated reference cases and the
# extracted cases they will be compared against.
human_data = load_json(
    Path("/Users/wbm/Documents/BIT/Research Topics/Evaluation Knowledge Extraction/formatted_annotated_cases_2.json")
)
extracted_data = load_json(
    Path("/Users/wbm/Documents/BIT/Research Topics/few-shot-prompting/baml_extracted_20_cases.json")
)

In [35]:
# Score every paired case, then persist the per-case breakdown so it can be
# checked by hand.
detailed_results = evaluate_entities_detailed(human_data, extracted_data)

with open("per_case_evaluation_details.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(detailed_results, indent=2, ensure_ascii=False))

print("✅ Per-case evaluation details saved!")


✅ Per-case evaluation details saved!
