In [3]:
import json
import os
from typing import List, Dict

def load_gt_file(file_path: str) -> Dict:
    """Load ground truth file and extract answer."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return {
        'answer': data['answer'].lower().strip('.,'),
        'variations': {
            f'variation_{i+1}': data[f'variation_{i+1}']
            for i in range(10)
        }
    }

def load_pred_file(file_path: str) -> List[Dict]:
    """Load prediction file and extract answers."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    # Extract answers from responses
    responses = data['trace']['responses'][0]
    
    # Process each response to remove the question part
    processed_responses = []
    for resp in responses:
        # Find where the actual answer starts after the image tag
        answer_part = resp.split('<image>')[-1].strip()
        processed_responses.append({'text': answer_part})
    
    return processed_responses


def evaluate_yes_no(answers, labels):
    """Evaluate yes/no answers against ground truth labels."""
    # Process predicted answers
    for answer in answers:
        text = answer['text']
        
        # Only keep the first sentence
        if text.find('.') != -1:
            text = text.split('.')[0]
            
        text = text.replace(',', '')
        words = text.split(' ')
        if 'No' in words or 'not' in words or 'no' in words:
            answer['text'] = 'no'
        else:
            answer['text'] = 'yes'
    
    # Process ground truth labels
    processed_labels = []
    for label in labels:
        if label.find('.') != -1:
            label = label.split('.')[0]
        label = label.replace(',', '')
        if any(word in label.split() for word in ['No', 'not', 'no']):
            processed_labels.append(0)
        else:
            processed_labels.append(1)
    
    # Convert predictions to binary
    pred_list = []
    for answer in answers:
        pred = answer['text']
        pred_list.append(0 if pred == 'no' else 1)
    
    # Calculate metrics
    pos, neg = 1, 0
    TP, TN, FP, FN = 0, 0, 0, 0
    
    for pred, label in zip(pred_list, processed_labels):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1
    
    # Calculate final metrics
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': {
            'TP': TP, 'FP': FP,
            'TN': TN, 'FN': FN
        }
    }

# Define your paths here
gt_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/paraphrase_error_iuxray_variant"  # Replace with your ground truth directory path
pred_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/med-flamingo/spell_error_posix"  # Replace with your predictions directory path
output_file = "evaluation_results.json"  # Output file name

all_metrics = {}
total_metrics = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
processed_files = 0
skipped_files = 0

# Process each file
for gt_file in os.listdir(gt_dir):
    if not gt_file.endswith('_variants.json'):
        continue
        
    # Get corresponding prediction file
    pred_file = gt_file.replace('_variants.json', '_variants_results.json')
    pred_path = os.path.join(pred_dir, pred_file)
    
    if not os.path.exists(pred_path):
        print(f"Warning: No prediction file found for {gt_file}")
        skipped_files += 1
        continue
    
    try:
        # Load files
        gt_data = load_gt_file(os.path.join(gt_dir, gt_file))
        pred_data = load_pred_file(pred_path)
        
        # Evaluate
        metrics = evaluate_yes_no(pred_data, [gt_data['answer']] * len(pred_data))
        
        # Store results
        all_metrics[gt_file] = metrics
        for k, v in metrics['confusion_matrix'].items():
            total_metrics[k] += v
            
        processed_files += 1
        
        # Print progress every 50 files
        if processed_files % 50 == 0:
            print(f"Processed {processed_files} files...")
            
    except Exception as e:
        print(f"Error processing {gt_file}: {str(e)}")
        skipped_files += 1
        continue

# Calculate overall metrics
total = sum(total_metrics.values())
overall_metrics = {
    'accuracy': (total_metrics['TP'] + total_metrics['TN']) / total if total > 0 else 0,
    'total_correct': total_metrics['TP'] + total_metrics['TN'],
    'total_samples': total,
    'files_processed': processed_files,
    'files_skipped': skipped_files,
    'confusion_matrix': total_metrics,
    'per_file_metrics': all_metrics
}

# Save results
with open(output_file, 'w') as f:
    json.dump(overall_metrics, f, indent=2)

print(f"\nProcessing Complete!")
print(f"Files processed: {processed_files}")
print(f"Files skipped: {skipped_files}")
print(f"\nOverall Accuracy: {overall_metrics['accuracy']:.4f}")
print(f"Total Correct: {overall_metrics['total_correct']} / {overall_metrics['total_samples']}")
print("\nConfusion Matrix:")
print(f"TP: {total_metrics['TP']}, FP: {total_metrics['FP']}")
print(f"TN: {total_metrics['TN']}, FN: {total_metrics['FN']}")

Processed 50 files...
Processed 100 files...
Processed 150 files...
Processed 200 files...
Processed 250 files...
Processed 300 files...
Processed 350 files...

Processing Complete!
Files processed: 399
Files skipped: 1

Overall Accuracy: 0.2026
Total Correct: 889 / 4389

Confusion Matrix:
TP: 885, FP: 3494
TN: 4, FN: 6


In [6]:
import json
import os
from typing import List, Dict

def load_gt_file(file_path: str) -> str:
    """Load ground truth file and extract answer."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data['answer'].lower().strip('.,')

def extract_answer_from_response(response: str) -> str:
    """
    Extract answer from response text that comes after <image> tag.
    Looks for yes/no indicators in the text following the image tag.
    """
    # Split at image tag and take everything after it
    parts = response.split('\n<image>')
    if len(parts) < 2:
        return "unknown"
    
    answer_text = parts[1].strip()
    
    # Look for answer indicators in the first substantial portion of text
    # Convert to lowercase for consistent matching
    answer_text = answer_text.lower()
    
    # If we find explicit 'no' indicators, classify as 'no'
    if any(neg in answer_text.split() for neg in ['no', 'not']):
        return 'no'
    
    # If we find any positive indicators or statements, classify as 'yes'
    if 'yes' in answer_text.split() or 'shows' in answer_text or 'visible' in answer_text:
        return 'yes'
    
    # If we can't determine clearly, return unknown
    return 'unknown'

def evaluate_responses(responses: List[str], ground_truth: str) -> Dict:
    """
    Evaluate responses against ground truth and calculate metrics.
    """
    # Process each response
    predictions = []
    for response in responses:
        pred = extract_answer_from_response(response)
        predictions.append(pred)
    
    # Convert ground truth to lowercase for consistency
    gt = ground_truth.lower()
    
    # Initialize counters
    total = len(predictions)
    correct = 0
    tp = tn = fp = fn = unknown = 0
    
    # Calculate metrics
    for pred in predictions:
        if pred == 'unknown':
            unknown += 1
            continue
            
        if pred == gt:
            correct += 1
            if pred == 'yes':
                tp += 1
            else:
                tn += 1
        else:
            if pred == 'yes':
                fp += 1
            else:
                fn += 1
    
    # Calculate final metrics
    valid_predictions = total - unknown
    accuracy = correct / valid_predictions if valid_predictions > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'unknown_rate': unknown / total if total > 0 else 0
        },
        'confusion_matrix': {
            'true_positives': tp,
            'true_negatives': tn,
            'false_positives': fp,
            'false_negatives': fn,
            'unknown': unknown
        },
        'total_samples': total,
        'valid_samples': valid_predictions
    }

def print_example_cases(all_results: Dict, gt_dir: str, pred_dir: str, num_examples: int = 10):
    """
    Print example cases of successful and failed predictions.
    
    Args:
        all_results: Dictionary containing results for all files
        gt_dir: Ground truth directory path
        pred_dir: Predictions directory path
        num_examples: Number of examples to print for each category
    """
    success_cases = []
    failure_cases = []
    
    for gt_file, results in all_results['per_file_results'].items():
        # Load original files
        gt_path = os.path.join(gt_dir, gt_file)
        pred_path = os.path.join(pred_dir, gt_file.replace('_variants.json', '_variants_results.json'))
        
        with open(gt_path, 'r') as f:
            gt_data = json.load(f)
        with open(pred_path, 'r') as f:
            pred_data = json.load(f)
            
        gt_answer = gt_data['answer'].lower()
        responses = pred_data['trace']['responses'][0]
        
        # Check each response
        for idx, response in enumerate(responses):
            answer = extract_answer_from_response(response)
            
            if answer == 'unknown':
                continue
                
            case = {
                'file': gt_file,
                'question': gt_data['question'],
                'ground_truth': gt_answer,
                'predicted': answer,
                'full_response': response
            }
            
            if answer == gt_answer:
                success_cases.append(case)
            else:
                failure_cases.append(case)
                
            if len(success_cases) >= num_examples and len(failure_cases) >= num_examples:
                break
                
    # Print successful cases
    print("\n=== SUCCESSFUL CASES ===")
    for i, case in enumerate(success_cases[:num_examples], 1):
        print(f"\nSuccess Case {i}:")
        print(f"File: {case['file']}")
        print(f"Question: {case['question']}")
        print(f"Ground Truth: {case['ground_truth']}")
        print(f"Predicted: {case['predicted']}")
        print("Response excerpt: ", case['full_response'].split('\n<image>')[-1][:100], "...")
        
    # Print failure cases
    print("\n=== FAILURE CASES ===")
    for i, case in enumerate(failure_cases[:num_examples], 1):
        print(f"\nFailure Case {i}:")
        print(f"File: {case['file']}")
        print(f"Question: {case['question']}")
        print(f"Ground Truth: {case['ground_truth']}")
        print(f"Predicted: {case['predicted']}")
        print("Response excerpt: ", case['full_response'].split('\n<image>')[-1][:100], "...")

def evaluate_directory(gt_dir: str, pred_dir: str, output_file: str = "evaluation_results.json"):
    """
    Evaluate all files in the directories and aggregate results.
    """
    all_results = {}
    total_metrics = {
        'true_positives': 0,
        'true_negatives': 0,
        'false_positives': 0,
        'false_negatives': 0,
        'unknown': 0
    }
    processed_files = 0
    skipped_files = 0

    # Process each ground truth file
    for gt_file in os.listdir(gt_dir):
        if not gt_file.endswith('_variants.json'):
            continue

        # Construct file paths
        gt_path = os.path.join(gt_dir, gt_file)
        pred_file = gt_file.replace('_variants.json', '_variants_results.json')
        pred_path = os.path.join(pred_dir, pred_file)

        if not os.path.exists(pred_path):
            print(f"Warning: No prediction file found for {gt_file}")
            skipped_files += 1
            continue

        try:
            # Load files
            with open(pred_path, 'r') as f:
                pred_data = json.load(f)
            gt_answer = load_gt_file(gt_path)

            # Get responses and evaluate
            responses = pred_data['trace']['responses'][0]
            results = evaluate_responses(responses, gt_answer)

            # Store results
            all_results[gt_file] = results
            
            # Update total metrics
            for k, v in results['confusion_matrix'].items():
                total_metrics[k] += v

            processed_files += 1

            # Print progress
            if processed_files % 50 == 0:
                print(f"Processed {processed_files} files...")

        except Exception as e:
            print(f"Error processing {gt_file}: {str(e)}")
            skipped_files += 1
            continue

    # Calculate overall metrics
    total_samples = sum(total_metrics.values())
    valid_samples = total_samples - total_metrics['unknown']
    
    overall_metrics = {
        'accuracy': (total_metrics['true_positives'] + total_metrics['true_negatives']) / valid_samples if valid_samples > 0 else 0,
        'precision': total_metrics['true_positives'] / (total_metrics['true_positives'] + total_metrics['false_positives']) if (total_metrics['true_positives'] + total_metrics['false_positives']) > 0 else 0,
        'recall': total_metrics['true_positives'] / (total_metrics['true_positives'] + total_metrics['false_negatives']) if (total_metrics['true_positives'] + total_metrics['false_negatives']) > 0 else 0,
        'unknown_rate': total_metrics['unknown'] / total_samples if total_samples > 0 else 0,
        'files_processed': processed_files,
        'files_skipped': skipped_files,
        'confusion_matrix': total_metrics,
        'per_file_results': all_results
    }

    # Calculate F1 score
    if overall_metrics['precision'] + overall_metrics['recall'] > 0:
        overall_metrics['f1'] = 2 * overall_metrics['precision'] * overall_metrics['recall'] / (overall_metrics['precision'] + overall_metrics['recall'])
    else:
        overall_metrics['f1'] = 0

    # Save results
    with open(output_file, 'w') as f:
        json.dump(overall_metrics, f, indent=2)

    # Print summary
    print("\nProcessing Complete!")
    print(f"Files processed: {processed_files}")
    print(f"Files skipped: {skipped_files}")
    print(f"\nOverall Accuracy: {overall_metrics['accuracy']:.4f}")
    print(f"Overall Precision: {overall_metrics['precision']:.4f}")
    print(f"Overall Recall: {overall_metrics['recall']:.4f}")
    print(f"Overall F1 Score: {overall_metrics['f1']:.4f}")
    print(f"Unknown Rate: {overall_metrics['unknown_rate']:.4f}")
    print("\nConfusion Matrix:")
    print(f"True Positives: {total_metrics['true_positives']}")
    print(f"True Negatives: {total_metrics['true_negatives']}")
    print(f"False Positives: {total_metrics['false_positives']}")
    print(f"False Negatives: {total_metrics['false_negatives']}")
    print(f"Unknown: {total_metrics['unknown']}")
    
    # Print example cases
    print("\nPrinting Example Cases:")
    print_example_cases({'per_file_results': all_results}, gt_dir, pred_dir)

    return overall_metrics

if __name__ == "__main__":
    gt_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/paraphrase_error_iuxray_variant"
    pred_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/med-flamingo/spell_error_posix"
    results = evaluate_directory(gt_dir, pred_dir)

Processed 50 files...
Processed 100 files...
Processed 150 files...
Processed 200 files...
Processed 250 files...
Processed 300 files...
Processed 350 files...

Processing Complete!
Files processed: 399
Files skipped: 1

Overall Accuracy: 0.5687
Overall Precision: 0.3600
Overall Recall: 0.0759
Overall F1 Score: 0.1254
Unknown Rate: 0.8674

Confusion Matrix:
True Positives: 18
True Negatives: 313
False Positives: 32
False Negatives: 219
Unknown: 3807

Printing Example Cases:

=== SUCCESSFUL CASES ===

Success Case 1:
File: question_234_variants.json
Question: Is there any abnormality detected in the lungs on the X-ray? Please choose from the following two options: [yes, no]
<image>
Ground Truth: no
Predicted: no
Response excerpt:   a. Yes, there is a tumour in the right lung.
 b. No, there is no tumour in the right lung.
 10. Ple ...

Success Case 2:
File: question_234_variants.json
Question: Is there any abnormality detected in the lungs on the X-ray? Please choose from the following t

In [7]:
import json
import os
from typing import List, Dict
from collections import defaultdict

def standardize_answer(answer: str) -> str:
    """Standardize answer format by removing explanations and punctuation."""
    # Take only the first word (yes/no part)
    answer = answer.lower().split(',')[0].split('.')[0].strip()
    # Handle cases where 'not' might be used
    if 'not' in answer:
        return 'no'
    # Return only 'yes' or 'no'
    return 'yes' if 'yes' in answer else 'no'

def extract_answer_from_response(response: str) -> str:
    """
    Extract answer from response text with improved handling of multiple choice format.
    """
    # Split at image tag and take everything after it
    parts = response.lower().split('\n<image>')
    if len(parts) < 2:
        return "unknown"
    
    answer_text = parts[1].strip()
    
    # Handle multiple choice format
    lines = answer_text.split('\n')
    first_line = lines[0].strip()
    
    # Check if it's a multiple choice format
    if first_line.startswith(('a.', 'b.', '1.', '2.', 'a)', 'b)')):
        # Look for the answer in the first two lines
        for line in lines[:2]:
            line = line.lower().strip()
            # Check if this line contains a negative answer
            if any(neg in line for neg in ['no', 'not']):
                return 'no'
            # Check if this line contains a positive answer
            if 'yes' in line:
                return 'yes'
    
    # If not multiple choice or answer not found in first two lines,
    # check the entire response
    if any(neg in answer_text.split() for neg in ['no', 'not']):
        return 'no'
    if 'yes' in answer_text.split():
        return 'yes'
    
    return 'unknown'

def evaluate_responses(responses: List[str], ground_truth: str) -> Dict:
    """
    Evaluate responses against standardized ground truth.
    """
    # Standardize ground truth
    gt = standardize_answer(ground_truth)
    
    # Process each response
    predictions = []
    for response in responses:
        pred = extract_answer_from_response(response)
        predictions.append(pred)
    
    # Initialize counters
    total = len(predictions)
    correct = 0
    tp = tn = fp = fn = unknown = 0
    
    # Calculate metrics
    for pred in predictions:
        if pred == 'unknown':
            unknown += 1
            continue
            
        if pred == gt:
            correct += 1
            if pred == 'yes':
                tp += 1
            else:
                tn += 1
        else:
            if pred == 'yes':
                fp += 1
            else:
                fn += 1
    
    # Calculate final metrics
    valid_predictions = total - unknown
    metrics = {
        'accuracy': correct / valid_predictions if valid_predictions > 0 else 0,
        'precision': tp / (tp + fp) if (tp + fp) > 0 else 0,
        'recall': tp / (tp + fn) if (tp + fn) > 0 else 0,
        'unknown_rate': unknown / total if total > 0 else 0
    }
    
    # Calculate F1 score
    if metrics['precision'] + metrics['recall'] > 0:
        metrics['f1'] = 2 * metrics['precision'] * metrics['recall'] / (metrics['precision'] + metrics['recall'])
    else:
        metrics['f1'] = 0
    
    return {
        'metrics': metrics,
        'confusion_matrix': {
            'true_positives': tp,
            'true_negatives': tn,
            'false_positives': fp,
            'false_negatives': fn,
            'unknown': unknown
        },
        'predictions': predictions,
        'ground_truth': gt,
        'total_samples': total,
        'valid_samples': valid_predictions
    }

def balance_cases(cases: List[Dict]) -> List[Dict]:
    """Ensure a mix of yes/no cases in examples."""
    yes_cases = [c for c in cases if c['standardized_gt'] == 'yes']
    no_cases = [c for c in cases if c['standardized_gt'] == 'no']
    
    result = []
    yes_idx = no_idx = 0
    while len(result) < len(cases):
        if yes_idx < len(yes_cases):
            result.append(yes_cases[yes_idx])
            yes_idx += 1
        if no_idx < len(no_cases) and len(result) < len(cases):
            result.append(no_cases[no_idx])
            no_idx += 1
    return result

def print_case(case: Dict, index: int, case_type: str):
    """Print a single case with improved formatting."""
    print(f"\n{case_type} Case {index}:")
    print(f"File: {case['file']}")
    print(f"Question: {case['question']}")
    print(f"Ground Truth (original): {case['ground_truth']}")
    print(f"Ground Truth (standardized): {case['standardized_gt']}")
    print(f"Predicted: {case['predicted']}")
    response_excerpt = case['full_response'].split('\n<image>')[-1][:150]
    print(f"Response excerpt: {response_excerpt}...")

def print_example_cases(all_results: Dict, gt_dir: str, pred_dir: str, num_examples: int = 10):
    """
    Print example cases with improved handling and deduplication.
    """
    success_cases = []
    failure_cases = []
    seen_files = defaultdict(int)  # Track number of times each file is used
    
    for gt_file, results in all_results['per_file_results'].items():
        # Skip if we've seen this file too many times
        if seen_files[gt_file] >= 2:  # Limit examples per file
            continue
            
        # Load original files
        gt_path = os.path.join(gt_dir, gt_file)
        pred_path = os.path.join(pred_dir, gt_file.replace('_variants.json', '_variants_results.json'))
        
        with open(gt_path, 'r') as f:
            gt_data = json.load(f)
        with open(pred_path, 'r') as f:
            pred_data = json.load(f)
            
        gt_answer = standardize_answer(gt_data['answer'])
        responses = pred_data['trace']['responses'][0]
        
        # Check each response
        for response in responses:
            answer = extract_answer_from_response(response)
            
            if answer == 'unknown':
                continue
                
            case = {
                'file': gt_file,
                'question': gt_data['question'],
                'ground_truth': gt_data['answer'],  # Keep original for display
                'predicted': answer,
                'full_response': response,
                'standardized_gt': gt_answer
            }
            
            if answer == gt_answer:
                success_cases.append(case)
            else:
                failure_cases.append(case)
            
            seen_files[gt_file] += 1
            
            if len(success_cases) >= num_examples * 2 and len(failure_cases) >= num_examples * 2:
                break
    
    # Ensure mix of yes/no cases in examples
    success_cases = balance_cases(success_cases[:num_examples])
    failure_cases = balance_cases(failure_cases[:num_examples])
    
    # Print cases
    print("\n=== SUCCESSFUL CASES ===")
    for i, case in enumerate(success_cases, 1):
        print_case(case, i, "Success")
    
    print("\n=== FAILURE CASES ===")
    for i, case in enumerate(failure_cases, 1):
        print_case(case, i, "Failure")

def evaluate_directory(gt_dir: str, pred_dir: str, output_file: str = "evaluation_results.json"):
    """
    Evaluate all files in the directories and aggregate results.
    """
    all_results = {}
    total_metrics = {
        'true_positives': 0,
        'true_negatives': 0,
        'false_positives': 0,
        'false_negatives': 0,
        'unknown': 0
    }
    processed_files = 0
    skipped_files = 0

    # Process each ground truth file
    for gt_file in os.listdir(gt_dir):
        if not gt_file.endswith('_variants.json'):
            continue

        # Construct file paths
        gt_path = os.path.join(gt_dir, gt_file)
        pred_file = gt_file.replace('_variants.json', '_variants_results.json')
        pred_path = os.path.join(pred_dir, pred_file)

        if not os.path.exists(pred_path):
            print(f"Warning: No prediction file found for {gt_file}")
            skipped_files += 1
            continue

        try:
            # Load files
            with open(pred_path, 'r') as f:
                pred_data = json.load(f)
            gt_answer = load_gt_file(gt_path)

            # Get responses and evaluate
            responses = pred_data['trace']['responses'][0]
            results = evaluate_responses(responses, gt_answer)

            # Store results
            all_results[gt_file] = results
            
            # Update total metrics
            for k, v in results['confusion_matrix'].items():
                total_metrics[k] += v

            processed_files += 1

            # Print progress
            if processed_files % 50 == 0:
                print(f"Processed {processed_files} files...")

        except Exception as e:
            print(f"Error processing {gt_file}: {str(e)}")
            skipped_files += 1
            continue

    # Calculate overall metrics
    total_samples = sum(total_metrics.values())
    valid_samples = total_samples - total_metrics['unknown']
    
    overall_metrics = {
        'accuracy': (total_metrics['true_positives'] + total_metrics['true_negatives']) / valid_samples if valid_samples > 0 else 0,
        'precision': total_metrics['true_positives'] / (total_metrics['true_positives'] + total_metrics['false_positives']) if (total_metrics['true_positives'] + total_metrics['false_positives']) > 0 else 0,
        'recall': total_metrics['true_positives'] / (total_metrics['true_positives'] + total_metrics['false_negatives']) if (total_metrics['true_positives'] + total_metrics['false_negatives']) > 0 else 0,
        'unknown_rate': total_metrics['unknown'] / total_samples if total_samples > 0 else 0,
        'files_processed': processed_files,
        'files_skipped': skipped_files,
        'confusion_matrix': total_metrics,
        'per_file_results': all_results
    }

    # Calculate F1 score
    if overall_metrics['precision'] + overall_metrics['recall'] > 0:
        overall_metrics['f1'] = 2 * overall_metrics['precision'] * overall_metrics['recall'] / (overall_metrics['precision'] + overall_metrics['recall'])
    else:
        overall_metrics['f1'] = 0

    # Save results
    with open(output_file, 'w') as f:
        json.dump(overall_metrics, f, indent=2)

    # Print summary
    print("\nProcessing Complete!")
    print(f"Files processed: {processed_files}")
    print(f"Files skipped: {skipped_files}")
    print(f"\nOverall Accuracy: {overall_metrics['accuracy']:.4f}")
    print(f"Overall Precision: {overall_metrics['precision']:.4f}")
    print(f"Overall Recall: {overall_metrics['recall']:.4f}")
    print(f"Overall F1 Score: {overall_metrics['f1']:.4f}")
    print(f"Unknown Rate: {overall_metrics['unknown_rate']:.4f}")
    print("\nConfusion Matrix:")
    print(f"True Positives: {total_metrics['true_positives']}")
    print(f"True Negatives: {total_metrics['true_negatives']}")
    print(f"False Positives: {total_metrics['false_positives']}")
    print(f"False Negatives: {total_metrics['false_negatives']}")
    print(f"Unknown: {total_metrics['unknown']}")
    
    # Print example cases
    print("\nPrinting Example Cases:")
    print_example_cases({'per_file_results': all_results}, gt_dir, pred_dir)

    return overall_metrics

def load_gt_file(file_path: str) -> str:
    """Load ground truth file and extract answer."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data['answer'].lower().strip('.,')

if __name__ == "__main__":
    gt_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/paraphrase_error_iuxray_variant"
    pred_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/med-flamingo/spell_error_posix"
    results = evaluate_directory(gt_dir, pred_dir)
    

Processed 50 files...
Processed 100 files...
Processed 150 files...
Processed 200 files...
Processed 250 files...
Processed 300 files...
Processed 350 files...

Processing Complete!
Files processed: 399
Files skipped: 1

Overall Accuracy: 0.2703
Overall Precision: 0.1683
Overall Recall: 0.4526
Overall F1 Score: 0.2454
Unknown Rate: 0.8348

Confusion Matrix:
True Positives: 86
True Negatives: 110
False Positives: 425
False Negatives: 104
Unknown: 3664

Printing Example Cases:

=== SUCCESSFUL CASES ===

Success Case 1:
File: question_29_variants.json
Question: Can calcified granulomas be seen in the left lower lobe? Please choose from the following two options: [yes, no]
<image>
Ground Truth (original): Yes.
Ground Truth (standardized): yes
Predicted: yes
Response excerpt:  11.10.1  Answer
 11.10.1.1  Correct Answer: Yes
 The calcified granulomas can be seen in the left lower lobe.
 The calcified granulomas...

Success Case 2:
File: question_234_variants.json
Question: Is there any abnor