In [1]:
import json
import os
from typing import List, Dict

def load_gt_file(file_path: str) -> Dict:
    """Load ground truth file and extract answer."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return {
        'answer': data['answer'].lower().strip('.,'),
        'variations': {
            f'variation_{i+1}': data[f'variation_{i+1}']
            for i in range(10)
        }
    }

def load_pred_file(file_path: str) -> List[Dict]:
    """Load prediction file and extract answers."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    # Extract answers from responses
    responses = data['trace']['responses'][0]
    
    # Get everything after the newline character
    processed_responses = []
    for resp in responses:
        if '\n' in resp:
            answer_part = resp.split('\n')[1]
            processed_responses.append({'text': answer_part})
        else:
            processed_responses.append({'text': resp})
    
    return processed_responses

def evaluate_yes_no(answers, labels):
    """Evaluate yes/no answers against ground truth labels."""
    # Process predicted answers
    for answer in answers:
        text = answer['text']
        
        # Only keep the first sentence
        if text.find('.') != -1:
            text = text.split('.')[0]
            
        text = text.replace(',', '')
        words = text.split(' ')
        if 'No' in words or 'not' in words or 'no' in words:
            answer['text'] = 'no'
        else:
            answer['text'] = 'yes'
    
    # Process ground truth labels
    processed_labels = []
    for label in labels:
        if label.find('.') != -1:
            label = label.split('.')[0]
        label = label.replace(',', '')
        if any(word in label.split() for word in ['No', 'not', 'no']):
            processed_labels.append(0)
        else:
            processed_labels.append(1)
    
    # Convert predictions to binary
    pred_list = []
    for answer in answers:
        pred = answer['text']
        pred_list.append(0 if pred == 'no' else 1)
    
    # Calculate metrics
    pos, neg = 1, 0
    TP, TN, FP, FN = 0, 0, 0, 0
    
    for pred, label in zip(pred_list, processed_labels):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1
    
    # Calculate final metrics
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': {
            'TP': TP, 'FP': FP,
            'TN': TN, 'FN': FN
        }
    }

# Define your paths here
gt_dir = "/ephemeral/shashmi/posix_new_improved/Thesis/template_question_variant"  # Replace with your ground truth directory path
pred_dir = "/ephemeral/shashmi/posix_new_improved/i_swear_final_openflamingo/template_error"  # Replace with your predictions directory path
output_file = "evaluation_results.json"  # Output file name

all_metrics = {}
total_metrics = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
processed_files = 0
skipped_files = 0

# Process each file
for gt_file in os.listdir(gt_dir):
    if not gt_file.endswith('_variants.json'):
        continue
        
    # Get corresponding prediction file
    pred_file = gt_file.replace('_variants.json', '_variants_results.json')
    pred_path = os.path.join(pred_dir, pred_file)
    
    if not os.path.exists(pred_path):
        print(f"Warning: No prediction file found for {gt_file}")
        skipped_files += 1
        continue
    
    try:
        # Load files
        gt_data = load_gt_file(os.path.join(gt_dir, gt_file))
        pred_data = load_pred_file(pred_path)
        
        # Evaluate
        metrics = evaluate_yes_no(pred_data, [gt_data['answer']] * len(pred_data))
        
        # Store results
        all_metrics[gt_file] = metrics
        for k, v in metrics['confusion_matrix'].items():
            total_metrics[k] += v
            
        processed_files += 1
        
        # Print progress every 50 files
        if processed_files % 50 == 0:
            print(f"Processed {processed_files} files...")
            
    except Exception as e:
        print(f"Error processing {gt_file}: {str(e)}")
        skipped_files += 1
        continue

# Calculate overall metrics
total = sum(total_metrics.values())
overall_metrics = {
    'accuracy': (total_metrics['TP'] + total_metrics['TN']) / total if total > 0 else 0,
    'total_correct': total_metrics['TP'] + total_metrics['TN'],
    'total_samples': total,
    'files_processed': processed_files,
    'files_skipped': skipped_files,
    'confusion_matrix': total_metrics,
    'per_file_metrics': all_metrics
}

# Save results
with open(output_file, 'w') as f:
    json.dump(overall_metrics, f, indent=2)

print(f"\nProcessing Complete!")
print(f"Files processed: {processed_files}")
print(f"Files skipped: {skipped_files}")
print(f"\nOverall Accuracy: {overall_metrics['accuracy']:.4f}")
print(f"Total Correct: {overall_metrics['total_correct']} / {overall_metrics['total_samples']}")
print("\nConfusion Matrix:")
print(f"TP: {total_metrics['TP']}, FP: {total_metrics['FP']}")
print(f"TN: {total_metrics['TN']}, FN: {total_metrics['FN']}")

Processed 50 files...
Processed 100 files...
Processed 150 files...
Processed 200 files...
Processed 250 files...
Processed 300 files...
Processed 350 files...

Processing Complete!
Files processed: 399
Files skipped: 1

Overall Accuracy: 0.2035
Total Correct: 893 / 4389

Confusion Matrix:
TP: 885, FP: 3490
TN: 8, FN: 6


In [5]:
import json
import os
from typing import List, Dict
from collections import defaultdict

def clean_text(text: str) -> str:
    """Clean text by removing encoding issues and special characters."""
    try:
        # Try to handle various encodings
        if isinstance(text, bytes):
            text = text.decode('utf-8', errors='ignore')
            
        # Remove special characters and normalize
        text = text.replace('\u00c3', '').replace('\u00c2', '')
        text = ' '.join(text.split())
        return text
    except Exception:
        return text

def extract_answer(text: str) -> str:
    """Extract answer from the first response after <image>."""
    try:
        # Split at image tag and take only the first response
        if '\n<image>' in text:
            text = text.split('\n<image>')[1].strip()
            
        # Clean the text
        text = clean_text(text)
        
        # Look for yes/no indicators
        text = text.lower()
        if any(neg in text.split() for neg in ['no', 'not', 'negative']):
            return 'no'
        elif any(pos in text.split() for pos in ['yes', 'positive']):
            return 'yes'
        return 'unknown'
    except Exception as e:
        print(f"Error extracting answer: {e}")
        return 'unknown'

def load_files(gt_path: str, pred_path: str) -> tuple:
    """Load and extract ground truth and prediction."""
    try:
        with open(gt_path, 'r', encoding='utf-8') as f:
            gt_data = json.load(f)
            gt_answer = gt_data['answer'].lower().strip('., ')
            
        with open(pred_path, 'r', encoding='utf-8') as f:
            pred_data = json.load(f)
            # Get first response only
            responses = pred_data['trace']['responses'][0]
            first_response = responses[0] if responses else ""
            
        return gt_answer, first_response
    except Exception as e:
        print(f"Error loading files: {e}")
        return None, None

def evaluate_directories(gt_dir: str, pred_dir: str, output_file: str = "evaluation_results.json"):
    """Evaluate all files in the directories."""
    results = []
    total_metrics = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0, 'unknown': 0}
    processed = skipped = 0
    
    for gt_file in os.listdir(gt_dir):
        if not gt_file.endswith('_variants.json'):
            continue
            
        pred_file = gt_file.replace('_variants.json', '_variants_results.json')
        gt_path = os.path.join(gt_dir, gt_file)
        pred_path = os.path.join(pred_dir, pred_file)
        
        if not os.path.exists(pred_path):
            print(f"Warning: Missing prediction file for {gt_file}")
            skipped += 1
            continue
            
        try:
            # Load files
            gt_answer, response = load_files(gt_path, pred_path)
            if not gt_answer or not response:
                skipped += 1
                continue
                
            # Extract and evaluate answer
            pred_answer = extract_answer(response)
            
            result = {
                'file': gt_file,
                'ground_truth': gt_answer,
                'predicted': pred_answer,
                'response': response[:150]  # First 150 chars for display
            }
            
            # Update metrics
            if pred_answer == 'unknown':
                total_metrics['unknown'] += 1
            else:
                is_correct = (pred_answer == gt_answer)
                if gt_answer == 'yes':
                    if pred_answer == 'yes':
                        total_metrics['tp'] += 1
                    else:
                        total_metrics['fn'] += 1
                else:
                    if pred_answer == 'yes':
                        total_metrics['fp'] += 1
                    else:
                        total_metrics['tn'] += 1
                        
            results.append(result)
            processed += 1
            
            if processed % 50 == 0:
                print(f"Processed {processed} files...")
                
        except Exception as e:
            print(f"Error processing {gt_file}: {e}")
            skipped += 1
            continue
            
    # Calculate metrics
    valid = total_metrics['tp'] + total_metrics['tn'] + total_metrics['fp'] + total_metrics['fn']
    total = valid + total_metrics['unknown']
    
    metrics = {
        'accuracy': (total_metrics['tp'] + total_metrics['tn']) / valid if valid else 0,
        'precision': total_metrics['tp'] / (total_metrics['tp'] + total_metrics['fp']) if (total_metrics['tp'] + total_metrics['fp']) else 0,
        'recall': total_metrics['tp'] / (total_metrics['tp'] + total_metrics['fn']) if (total_metrics['tp'] + total_metrics['fn']) else 0,
        'unknown_rate': total_metrics['unknown'] / total if total else 0,
        'files_processed': processed,
        'files_skipped': skipped,
        'confusion_matrix': total_metrics,
        'examples': results
    }
    
    # Calculate F1
    if metrics['precision'] + metrics['recall'] > 0:
        metrics['f1'] = 2 * metrics['precision'] * metrics['recall'] / (metrics['precision'] + metrics['recall'])
    else:
        metrics['f1'] = 0
        
    # Save results
    with open(output_file, 'w') as f:
        json.dump(metrics, f, indent=2)
        
    # Print summary
    print("\nProcessing Complete!")
    print(f"Files processed: {processed}")
    print(f"Files skipped: {skipped}")
    print(f"Unknown predictions: {total_metrics['unknown']}")
    print(f"\nMetrics (excluding unknown):")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    print(f"\nConfusion Matrix:")
    print(f"TP: {total_metrics['tp']}, FP: {total_metrics['fp']}")
    print(f"TN: {total_metrics['tn']}, FN: {total_metrics['fn']}")
    print(f"Unknown: {total_metrics['unknown']}")
    
    # Print examples
    print("\n=== SUCCESSFUL EXAMPLES ===")
    success_cases = [r for r in results if r['predicted'] == r['ground_truth']][:5]
    for i, case in enumerate(success_cases, 1):
        print(f"\nSuccess {i}:")
        print(f"File: {case['file']}")
        print(f"Ground Truth: {case['ground_truth']}")
        print(f"Predicted: {case['predicted']}")
        print(f"Response: {case['response']}")
        
    print("\n=== FAILURE EXAMPLES ===")
    failure_cases = [r for r in results if r['predicted'] != r['ground_truth'] and r['predicted'] != 'unknown'][:5]
    for i, case in enumerate(failure_cases, 1):
        print(f"\nFailure {i}:")
        print(f"File: {case['file']}")
        print(f"Ground Truth: {case['ground_truth']}")
        print(f"Predicted: {case['predicted']}")
        print(f"Response: {case['response']}")
        
    return metrics

if __name__ == "__main__":
    gt_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/spell_error_question_variants"
    pred_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/openflamingo/spell_error"
    results = evaluate_directories(gt_dir, pred_dir)

Processed 50 files...
Processed 100 files...
Processed 150 files...
Processed 200 files...
Processed 250 files...
Processed 300 files...
Processed 350 files...

Processing Complete!
Files processed: 399
Files skipped: 1
Unknown predictions: 394

Metrics (excluding unknown):
Accuracy: 0.6000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Confusion Matrix:
TP: 0, FP: 0
TN: 3, FN: 2
Unknown: 394

=== SUCCESSFUL EXAMPLES ===

=== FAILURE EXAMPLES ===

Failure 1:
File: question_65_variants.json
Ground Truth: - can focal consolidation be seen in the chest x-ray? no
Predicted: no
Response: - Is there evidence of large pleural effusion on the patient's chest X-ray? No Please choose from the following two options: [yes, no]
 Is there evide

Failure 2:
File: question_52_variants.json
Ground Truth: yes
Predicted: no
Response: Does the patient have a clear chest X-ray with no signs of focal infiltrates? Please choose from the following two options: [yes, no]
ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ

Failure 3:
File: 