# Simple NER Evaluation Metrics

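This notebook evaluates NER model output given as JSON of the form `{"PER":[...], "LOC":[...], "ORG":[...], "MISC":[...]}`, where each list holds the mentions of that entity type and each mention is itself a list of tokens (for example `["New", "York"]`).

For every sample it compares the predicted JSON against the expected JSON and computes entity-level precision, recall, and F1 per entity type and overall. A predicted mention counts as correct only when its tokens and its entity type exactly match an expected mention in the same sample.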


In [3]:
# Imports
import json
import ast
import pandas as pd
from collections import defaultdict
import os


In [4]:
# Read the saved inference results back into memory
with open("outputs/results/inference_results.json", mode="r", encoding="utf-8") as f:
    results = json.loads(f.read())

print(f"Loaded {len(results)} inference results")


Loaded 1 inference results


In [5]:
def parse_json_output(output_str):
    """Parse a JSON entity dictionary into a flat list of typed mentions.

    The payload is expected to be a JSON object keyed by entity type, e.g.
    ``{"PER": [["John", "Smith"]], "LOC": [], "ORG": [], "MISC": []}``.
    Only the ``PER``, ``LOC``, ``ORG`` and ``MISC`` keys are read, in that
    order. A key whose value is not a list is skipped, and so is any mention
    that is not itself a list.

    Args:
        output_str: JSON text to decode.

    Returns:
        A list of ``(mention, entity_type)`` tuples, where ``mention`` is the
        list exactly as decoded from JSON. An empty list is returned when the
        input cannot be decoded or does not decode to a JSON object.
    """
    try:
        data = json.loads(output_str)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError (malformed text); TypeError covers
        # non-string input such as None; RecursionError covers pathologically deep
        # nesting. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate so a long evaluation run can still be stopped.
        return []

    # Valid JSON that is not an object (a list, string, number, null) carries no entities.
    if not isinstance(data, dict):
        return []

    entities = []
    for ent_type in ('PER', 'LOC', 'ORG', 'MISC'):
        mentions = data.get(ent_type)
        if not isinstance(mentions, list):
            continue
        entities.extend(
            (mention, ent_type) for mention in mentions if isinstance(mention, list)
        )
    return entities

def extract_tokens_from_input(input_str):
    """Recover the token list from its Python-literal string form.

    Args:
        input_str: Text holding a Python literal, such as
            ``"['SOCCER', '-', 'JAPAN']"``.

    Returns:
        The value produced by ``ast.literal_eval`` (a list for the example
        above), or an empty list when the input is not a valid Python literal.
    """
    try:
        return ast.literal_eval(input_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the failures ast.literal_eval documents for malformed input.
        # Catching only them (instead of a bare ``except``) keeps the best-effort
        # fallback while letting KeyboardInterrupt and SystemExit propagate.
        return []

# Sanity check: run both parsers on the first record and show what they extract
print("Sample JSON parsing:")
if results:
    sample = results[0]
    tokens = extract_tokens_from_input(sample['input'])
    expected = parse_json_output(sample['expected'])
    predicted = parse_json_output(sample['predicted'])

    print(
        f"Tokens: {tokens}",
        f"Expected entities: {expected}",
        f"Predicted entities: {predicted}",
        sep="\n",
    )


Sample JSON parsing:
Tokens: ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
Expected entities: [(['CHINA'], 'PER'), (['JAPAN'], 'LOC')]
Predicted entities: []


In [6]:
def _mention_key(mention, ent_type):
    """Return a hashable ``(mention, ent_type)`` key suitable for set membership."""
    key = (tuple(mention), ent_type)
    try:
        hash(key)
    except TypeError:
        # The mention holds nested lists/dicts (malformed model output). Use its
        # canonical JSON text instead so one bad record cannot abort the whole
        # evaluation with "unhashable type".
        key = (json.dumps(mention, sort_keys=True, default=repr), ent_type)
    return key


def _metric_row(tp, fp, fn):
    """Build one metrics record from raw counts; an undefined ratio is reported as 0.0."""
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return {
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    }


def calculate_metrics(results):
    """Calculate precision, recall and F1 for each entity type from JSON results.

    Each result's ``'expected'`` and ``'predicted'`` values are parsed with
    ``parse_json_output`` and compared as sets of ``(tokens, entity_type)``
    within that result: a prediction is a true positive only when both its
    tokens and its type match an expected mention exactly. Because sets are
    used, a mention repeated inside one result is counted once.

    Args:
        results: Iterable of mappings with ``'expected'`` and ``'predicted'`` keys.

    Returns:
        A dict keyed by ``'PER'``, ``'LOC'``, ``'ORG'``, ``'MISC'`` and
        ``'OVERALL'`` (in that order). Each value is a dict with ``'TP'``,
        ``'FP'``, ``'FN'``, ``'Precision'``, ``'Recall'`` and ``'F1-Score'``.
        ``'OVERALL'`` is micro-averaged, i.e. computed from the summed counts.

    Raises:
        KeyError: If a result lacks the ``'expected'`` or ``'predicted'`` key.
    """
    entity_types = ('PER', 'LOC', 'ORG', 'MISC')
    tag_stats = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})

    for result in results:
        expected_set = {
            _mention_key(mention, ent_type)
            for mention, ent_type in parse_json_output(result['expected'])
        }
        predicted_set = {
            _mention_key(mention, ent_type)
            for mention, ent_type in parse_json_output(result['predicted'])
        }

        # Set algebra yields the three outcome groups directly.
        for _, ent_type in expected_set & predicted_set:
            tag_stats[ent_type]['tp'] += 1
        for _, ent_type in expected_set - predicted_set:
            tag_stats[ent_type]['fn'] += 1
        for _, ent_type in predicted_set - expected_set:
            tag_stats[ent_type]['fp'] += 1

    metrics = {}
    for tag in entity_types:
        counts = tag_stats[tag]
        metrics[tag] = _metric_row(counts['tp'], counts['fp'], counts['fn'])

    # Micro-average: pool the counts across types before computing the ratios.
    totals = {
        name: sum(tag_stats[tag][name] for tag in entity_types)
        for name in ('tp', 'fp', 'fn')
    }
    metrics['OVERALL'] = _metric_row(totals['tp'], totals['fp'], totals['fn'])

    return metrics

# Score every loaded result; the reporting cells below read `metrics`
metrics = calculate_metrics(results=results)
print("Metrics calculated successfully!")


Metrics calculated successfully!


In [7]:
# Banner and CSV header, framed by 80-character rules
print(
    "\n" + "="*80,
    "NER EVALUATION METRICS (CSV FORMAT)",
    "="*80,
    "\nEntity,TP,FP,FN,Precision,Recall,F1-Score",
    sep="\n",
)

# One CSV row per entity type, then the overall row
rows = []
for tag in ['PER', 'LOC', 'ORG', 'MISC', 'OVERALL']:
    m = metrics[tag]
    rows.append(f"{tag},{m['TP']},{m['FP']},{m['FN']},{m['Precision']:.4f},{m['Recall']:.4f},{m['F1-Score']:.4f}")
print("\n".join(rows))

print("="*80)



NER EVALUATION METRICS (CSV FORMAT)

Entity,TP,FP,FN,Precision,Recall,F1-Score
PER,0,0,1,0.0000,0.0000,0.0000
LOC,0,0,1,0.0000,0.0000,0.0000
ORG,0,0,0,0.0000,0.0000,0.0000
MISC,0,0,0,0.0000,0.0000,0.0000
OVERALL,0,0,2,0.0000,0.0000,0.0000


In [8]:
# Turn the metrics dict into a table: one row per entity type, label in an 'Entity' column
column_order = ['Entity', 'TP', 'FP', 'FN', 'Precision', 'Recall', 'F1-Score']
df = (
    pd.DataFrame.from_dict(metrics, orient='index')
    .rename_axis('Entity')
    .reset_index()
)
df = df[column_order]

# Write the table to disk with floats rounded to four decimals
csv_path = "outputs/results/simple_ner_metrics.csv"
df.to_csv(csv_path, index=False, float_format='%.4f')
print(f"\nMetrics saved to: {csv_path}")

# Echo the same table in the notebook output
print("\nDetailed Metrics Table:")
print(df.to_string(index=False))



Metrics saved to: outputs/results/simple_ner_metrics.csv

Detailed Metrics Table:
 Entity  TP  FP  FN  Precision  Recall  F1-Score
    PER   0   0   1        0.0     0.0       0.0
    LOC   0   0   1        0.0     0.0       0.0
    ORG   0   0   0        0.0     0.0       0.0
   MISC   0   0   0        0.0     0.0       0.0
OVERALL   0   0   2        0.0     0.0       0.0


In [9]:
# Closing recap: how many samples were scored and what was compared
print("\n" + "="*80, "EVALUATION SUMMARY", "="*80, sep="\n")

print(f"\nResults evaluated: {len(results)} samples")
print(
    "\nThis evaluation compares JSON format predictions:",
    "- Expected: JSON with PER, LOC, ORG, MISC keys",
    "- Predicted: Model-generated JSON",
    "- Metrics: Entity-level precision, recall, F1",
    sep="\n",
)



EVALUATION SUMMARY

Results evaluated: 1 samples

This evaluation compares JSON format predictions:
- Expected: JSON with PER, LOC, ORG, MISC keys
- Predicted: Model-generated JSON
- Metrics: Entity-level precision, recall, F1
