In [16]:
import json
import re

def topic_levels(payload):
    """Return the ``(level_1, level_2)`` topic labels of a decoded JSON object.

    A missing ``topic`` key, or a missing level inside it, yields the empty
    string for that label. A payload (or ``topic`` value) that has no
    ``.get`` method raises ``AttributeError``; the caller treats that as an
    unparseable entry.
    """
    topic = payload.get('topic', {})
    return topic.get('level_1', ''), topic.get('level_2', '')


def evaluate_predictions(records):
    """Compare model topic predictions against ground truth.

    Each record is expected to provide ``'model_output'`` and
    ``'ground_truth'`` strings holding JSON; the ground truth may carry a
    trailing ``<|endoftext|>`` marker, which is removed before decoding.

    Returns a dict with:
      * ``'total'`` - number of records that decoded successfully,
      * ``'level1_correct'`` / ``'level2_correct'`` - exact-match counts,
      * ``'incorrect_entries'`` - one dict per record where either level
        differs (keys: index, model, ground_truth, l1_match, l2_match),
      * ``'errors'`` - ``(index, message)`` for every record that was skipped
        because it could not be decoded.

    Skipped records are NOT part of ``'total'``, so accuracies derived from
    these counts cover only the decodable subset.
    """
    level1_correct = 0
    level2_correct = 0
    total = 0
    incorrect_entries = []
    errors = []

    for i, item in enumerate(records):
        # Only the decode/extract step is guarded, so a bug in the comparison
        # code below cannot be misreported as a parse error.
        try:
            model_json = json.loads(item['model_output'].strip())
            gt_text = item['ground_truth'].replace('<|endoftext|>', '').strip()
            gt_json = json.loads(gt_text)
            model_level1, model_level2 = topic_levels(model_json)
            gt_level1, gt_level2 = topic_levels(gt_json)
        except (KeyError, TypeError, AttributeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; the other types cover
            # missing keys and values of an unexpected shape.
            errors.append((i, str(e)))
            continue

        l1_match = model_level1 == gt_level1
        l2_match = model_level2 == gt_level2
        level1_correct += l1_match
        level2_correct += l2_match

        if not (l1_match and l2_match):
            incorrect_entries.append({
                'index': i,
                'model': f"{model_level1} / {model_level2}",
                'ground_truth': f"{gt_level1} / {gt_level2}",
                'l1_match': l1_match,
                'l2_match': l2_match,
            })

        total += 1

    return {
        'total': total,
        'level1_correct': level1_correct,
        'level2_correct': level2_correct,
        'incorrect_entries': incorrect_entries,
        'errors': errors,
    }


def format_accuracy(label, correct, total):
    """Return the ``'<label> Accuracy: c/t = p%'`` summary line.

    When ``total`` is zero there is nothing to divide by, so the percentage
    is shown as ``n/a`` instead of raising ``ZeroDivisionError``.
    """
    if not total:
        return f'{label} Accuracy: {correct}/{total} = n/a'
    return f'{label} Accuracy: {correct}/{total} = {correct/total*100:.2f}%'


# ``__name__`` is '__main__' both inside a notebook cell and when the file is
# run as a script, so the report still executes there; the guard only keeps
# the file I/O from running when the helpers above are imported.
if __name__ == '__main__':
    with open(r'val_inference_results.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = evaluate_predictions(data)

    # Keep the same top-level names the original script exposed.
    level1_correct = results['level1_correct']
    level2_correct = results['level2_correct']
    total = results['total']
    incorrect_entries = results['incorrect_entries']

    for index, message in results['errors']:
        print(f'Error parsing entry {index}: {message}')

    print(f'Total samples: {total}')
    # Make the excluded records visible: the accuracies below are computed
    # over successfully parsed samples only.
    print(f"Skipped (unparseable) entries: {len(results['errors'])}")
    print(format_accuracy('Level 1', level1_correct, total))
    print(format_accuracy('Level 2', level2_correct, total))

    print(f'\n{"="*80}')
    print(f'Incorrect Predictions ({len(incorrect_entries)} entries):')
    print(f'{"="*80}')
    for entry in incorrect_entries:
        print(f"Entry {entry['index']}:")
        print(f"  Model:        {entry['model']}")
        print(f"  Ground Truth: {entry['ground_truth']}")
        print(f"  L1 Match: {entry['l1_match']}, L2 Match: {entry['l2_match']}")
        print()

Error parsing entry 20: Expecting value: line 1 column 1 (char 0)
Error parsing entry 26: Expecting value: line 1 column 1 (char 0)
Error parsing entry 46: Expecting value: line 1 column 1 (char 0)
Total samples: 46
Level 1 Accuracy: 43/46 = 93.48%
Level 2 Accuracy: 43/46 = 93.48%

Incorrect Predictions (3 entries):
Entry 22:
  Model:        Entertainment / Books
  Ground Truth: General / Chitchat
  L1 Match: False, L2 Match: False

Entry 31:
  Model:        Technology / Artificial Intelligence
  Ground Truth: General / Greetings
  L1 Match: False, L2 Match: False

Entry 44:
  Model:        Cybersecurity / Threats
  Ground Truth: Technology / Cybersecurity
  L1 Match: False, L2 Match: False

