In [4]:
import json
import os
from typing import List, Dict

def load_gt_file(file_path: str) -> Dict:
    """Load ground truth file and extract answer."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return {
        'answer': data['answer'].lower().strip('.,'),
        'variations': {
            f'variation_{i+1}': data[f'variation_{i+1}']
            for i in range(10)
        }
    }

def load_pred_file(file_path: str) -> List[Dict]:
    """Load prediction file and extract answers."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    # Extract answers from responses
    responses = data['trace']['responses'][0]
    return [{'text': resp} for resp in responses]

def evaluate_yes_no(answers, labels):
    """Evaluate yes/no answers against ground truth labels."""
    # Process predicted answers
    for answer in answers:
        text = answer['text']
        
        # Only keep the first sentence
        if text.find('.') != -1:
            text = text.split('.')[0]
            
        text = text.replace(',', '')
        words = text.split(' ')
        if 'No' in words or 'not' in words or 'no' in words:
            answer['text'] = 'no'
        else:
            answer['text'] = 'yes'
    
    # Process ground truth labels
    processed_labels = []
    for label in labels:
        if label.find('.') != -1:
            label = label.split('.')[0]
        label = label.replace(',', '')
        if any(word in label.split() for word in ['No', 'not', 'no']):
            processed_labels.append(0)
        else:
            processed_labels.append(1)
    
    # Convert predictions to binary
    pred_list = []
    for answer in answers:
        pred = answer['text']
        pred_list.append(0 if pred == 'no' else 1)
    
    # Calculate metrics
    pos, neg = 1, 0
    TP, TN, FP, FN = 0, 0, 0, 0
    
    for pred, label in zip(pred_list, processed_labels):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1
    
    # Calculate final metrics
    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': {
            'TP': TP, 'FP': FP,
            'TN': TN, 'FN': FN
        }
    }

# Define your paths here
gt_dir = "/ephemeral/shashmi/posix_new_improved/Thesis/paraphrase_error_iuxray_variant"  # Replace with your ground truth directory path
pred_dir = "/ephemeral/shashmi/posix_new_improved/llava_1.6/new_paraphrase_result_posix"  # Replace with your predictions directory path
output_file = "evaluation_results.json"  # Output file name

all_metrics = {}
total_metrics = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

# Process each file
for gt_file in os.listdir(gt_dir):
    if not gt_file.endswith('_variants.json'):
        continue
        
    # Get corresponding prediction file
    pred_file = gt_file.replace('_variants.json', '_variants_results.json')
    pred_path = os.path.join(pred_dir, pred_file)
    
    if not os.path.exists(pred_path):
        print(f"Warning: No prediction file found for {gt_file}")
        continue
    
    # Load files
    gt_data = load_gt_file(os.path.join(gt_dir, gt_file))
    pred_data = load_pred_file(pred_path)
    
    # Evaluate
    metrics = evaluate_yes_no(pred_data, [gt_data['answer']] * len(pred_data))
    
    # Store results
    all_metrics[gt_file] = metrics
    for k, v in metrics['confusion_matrix'].items():
        total_metrics[k] += v

# Calculate overall metrics
total = sum(total_metrics.values())
overall_metrics = {
    'accuracy': (total_metrics['TP'] + total_metrics['TN']) / total if total > 0 else 0,
    'total_correct': total_metrics['TP'] + total_metrics['TN'],
    'total_samples': total,
    'confusion_matrix': total_metrics,
    'per_file_metrics': all_metrics
}

# Save results
with open(output_file, 'w') as f:
    json.dump(overall_metrics, f, indent=2)

print(f"\nOverall Accuracy: {overall_metrics['accuracy']:.4f}")
print(f"Total Correct: {overall_metrics['total_correct']} / {overall_metrics['total_samples']}")
print("\nConfusion Matrix:")
print(f"TP: {total_metrics['TP']}, FP: {total_metrics['FP']}")
print(f"TN: {total_metrics['TN']}, FN: {total_metrics['FN']}")


Overall Accuracy: 0.6411
Total Correct: 2680 / 4180

Confusion Matrix:
TP: 239, FP: 914
TN: 2441, FN: 586


In [2]:
import json
import os
from typing import Dict, Tuple
from pprint import pprint

def load_gt_file(file_path: str) -> Tuple[str, str]:
    """Load ground truth file and extract answer and question."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data['answer'].lower().strip('.,'), data['question']

def load_pred_file(file_path: str) -> str:
    """Load prediction file and extract first response."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data['trace']['responses'][0][0]

def process_answer(text: str) -> str:
    """Process answer text to extract yes/no."""
    # Only keep the first sentence
    if text.find('.') != -1:
        text = text.split('.')[0]
        
    text = text.replace(',', '').lower()
    # Handle case where ground truth has explanation
    if 'since' in text:
        text = text.split('since')[0]
    
    words = text.split()
    if 'no' in words or 'not' in words:
        return 'no'
    return 'yes'

def evaluate_yes_no(pred: str, gt: str) -> Dict:
    """Evaluate single yes/no answer against ground truth."""
    # Convert strings to clean yes/no
    pred_clean = process_answer(pred)
    gt_clean = process_answer(gt)
    
    # Convert to binary (0 for no, 1 for yes)
    pred_binary = 0 if pred_clean == 'no' else 1
    gt_binary = 0 if gt_clean == 'no' else 1
    
    # Determine the case (TP, TN, FP, FN)
    if pred_binary == 1 and gt_binary == 1:
        return {'result': 'TP', 'correct': True}
    elif pred_binary == 0 and gt_binary == 0:
        return {'result': 'TN', 'correct': True}
    elif pred_binary == 1 and gt_binary == 0:
        return {'result': 'FP', 'correct': False}
    else:  # pred_binary == 0 and gt_binary == 1
        return {'result': 'FN', 'correct': False}

def calculate_metrics(results: Dict[str, Dict]) -> Dict:
    """Calculate overall metrics from results."""
    metrics = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
    
    # Count each case
    for result in results.values():
        metrics[result['result']] += 1
    
    # Calculate final metrics
    total = sum(metrics.values())
    accuracy = (metrics['TP'] + metrics['TN']) / total if total > 0 else 0
    
    return {
        'accuracy': accuracy,
        'total_correct': metrics['TP'] + metrics['TN'],
        'total_samples': total,
        'confusion_matrix': metrics
    }

def main():
    # Define paths
    gt_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/spell_error_question_variants"  # Directory with ground truth files
    pred_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/qwenvl/spell_error_posix_result"  # Directory with prediction files
    output_file = "evaluation_results_first_response.json"
    
    results = {}
    successful_examples = []
    failed_examples = []
    
    # Process each file
    for gt_file in os.listdir(gt_dir):
        if not gt_file.endswith('_variants.json'):
            continue
            
        # Get corresponding prediction file
        pred_file = gt_file.replace('_variants.json', '_variants_results.json')
        pred_path = os.path.join(pred_dir, pred_file)
        
        if not os.path.exists(pred_path):
            print(f"Warning: No prediction file found for {gt_file}")
            continue
        
        # Load and process answers
        gt_answer, question = load_gt_file(os.path.join(gt_dir, gt_file))
        pred_response = load_pred_file(pred_path)
        
        # Evaluate and store results
        evaluation = evaluate_yes_no(pred_response, gt_answer)
        results[gt_file] = evaluation
        
        # Store examples based on correctness
        example_data = {
            'file': gt_file,
            'question': question,
            'ground_truth': gt_answer,
            'prediction': process_answer(pred_response),
            'full_response': pred_response
        }
        
        if evaluation['correct']:
            successful_examples.append(example_data)
        else:
            failed_examples.append(example_data)
    
    # Calculate overall metrics
    overall_metrics = calculate_metrics(results)
    
    # Add detailed results
    overall_metrics['detailed_results'] = results
    
    # Save results
    with open(output_file, 'w') as f:
        json.dump(overall_metrics, f, indent=2)
    
    # Print summary
    print("\n=== OVERALL METRICS ===")
    print(f"Overall Accuracy: {overall_metrics['accuracy']:.4f}")
    print(f"Total Correct: {overall_metrics['total_correct']} / {overall_metrics['total_samples']}")
    print("\nConfusion Matrix:")
    print(f"TP: {overall_metrics['confusion_matrix']['TP']}, FP: {overall_metrics['confusion_matrix']['FP']}")
    print(f"TN: {overall_metrics['confusion_matrix']['TN']}, FN: {overall_metrics['confusion_matrix']['FN']}")
    
    # Print successful examples
    print("\n=== 30 SUCCESSFUL EXAMPLES ===")
    for i, example in enumerate(successful_examples[:10], 1):
        print(f"\nExample {i}:")
        print(f"File: {example['file']}")
        print(f"Question: {example['question']}")
        print(f"Ground Truth: {example['ground_truth']}")
        print(f"Prediction: {example['prediction']}")
        print(f"Full Response: {example['full_response']}")
        print("-" * 80)
    
    # Print failed examples
    print("\n=== 30 FAILED EXAMPLES ===")
    for i, example in enumerate(failed_examples[:10], 1):
        print(f"\nExample {i}:")
        print(f"File: {example['file']}")
        print(f"Question: {example['question']}")
        print(f"Ground Truth: {example['ground_truth']}")
        print(f"Prediction: {example['prediction']}")
        print(f"Full Response: {example['full_response']}")
        print("-" * 80)

if __name__ == "__main__":
    main()


=== OVERALL METRICS ===
Overall Accuracy: 0.2500
Total Correct: 100 / 400

Confusion Matrix:
TP: 60, FP: 277
TN: 40, FN: 23

=== 30 SUCCESSFUL EXAMPLES ===

Example 1:
File: question_389_variants.json
Question: Is the cardiac silhouette size within normal limits on this chest X-ray? Please choose from the following two options: [yes, no]
<image>
Ground Truth: yes
Prediction: yes
Full Response: yes
--------------------------------------------------------------------------------

Example 2:
File: question_396_variants.json
Question: Do the bony structures of the chest appear damaged or abnormal in the X-ray image? Please choose from the following two options: [yes, no]
<image>
Ground Truth: no
Prediction: no
Full Response: no
--------------------------------------------------------------------------------

Example 3:
File: question_71_variants.json
Question: Is there any evidence of collapsed lung on the patient's chest X-ray? Please choose from the following two options: [yes, no]
<ima

In [6]:
import json
import os
from typing import Dict

def process_answer(text: str) -> str:
    """Process answer text to extract yes/no."""
    text = text.replace(',', '').lower().strip('.')
    # Handle case where ground truth has explanation
    if 'since' in text:
        text = text.split('since')[0]
    
    words = text.split()
    if 'no' in words or 'not' in words:
        return 'no'
    return 'yes'

def analyze_gt_distribution(directory: str) -> Dict:
    """Analyze the distribution of yes/no answers in ground truth files."""
    distribution = {'yes': 0, 'no': 0, 'total': 0}
    file_lists = {'yes': [], 'no': []}
    
    for filename in os.listdir(directory):
        if not filename.endswith('_variants.json'):
            continue
            
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as f:
            try:
                data = json.load(f)
                answer = process_answer(data['answer'])
                distribution[answer] += 1
                distribution['total'] += 1
                file_lists[answer].append(filename)
            except json.JSONDecodeError:
                print(f"Error reading file: {filename}")
            except KeyError:
                print(f"No answer field in file: {filename}")
    
    # Calculate percentages
    distribution['yes_percentage'] = (distribution['yes'] / distribution['total']) * 100
    distribution['no_percentage'] = (distribution['no'] / distribution['total']) * 100
    
    return distribution, file_lists

def main():
    gt_dir = "/share/ssddata/sarimhashmi/posix_thesis/new_improve_stuff/thesis_hell_yeah/Thesis/paraphrase_error_iuxray_variant"  # Replace with your ground truth directory
    
    distribution, file_lists = analyze_gt_distribution(gt_dir)
    
    # Print results
    print("\n=== GROUND TRUTH DISTRIBUTION ===")
    print(f"Total files analyzed: {distribution['total']}")
    print(f"\nYES answers: {distribution['yes']} ({distribution['yes_percentage']:.2f}%)")
    print(f"NO answers: {distribution['no']} ({distribution['no_percentage']:.2f}%)")
    
    # Print some example filenames for each category
    print("\n=== EXAMPLE YES FILES (first 5) ===")
    for file in sorted(file_lists['yes'])[:5]:
        print(file)
        
    print("\n=== EXAMPLE NO FILES (first 5) ===")
    for file in sorted(file_lists['no'])[:5]:
        print(file)

if __name__ == "__main__":
    main()


=== GROUND TRUTH DISTRIBUTION ===
Total files analyzed: 400

YES answers: 82 (20.50%)
NO answers: 318 (79.50%)

=== EXAMPLE YES FILES (first 5) ===
question_101_variants.json
question_105_variants.json
question_114_variants.json
question_122_variants.json
question_124_variants.json

=== EXAMPLE NO FILES (first 5) ===
question_100_variants.json
question_102_variants.json
question_103_variants.json
question_104_variants.json
question_106_variants.json


In [7]:
import json
from collections import defaultdict

def process_answer(text: str) -> str:
    """Process answer text to extract yes/no."""
    if not text:
        return 'unknown'
    
    text = text.replace(',', '').lower().strip('.')
    # Handle case where ground truth has explanation
    if 'since' in text:
        text = text.split('since')[0]
    if 'based on' in text:
        text = text.split('based on')[0]
    if 'information' in text:
        text = text.split('information')[0]
        
    words = text.split()
    if 'no' in words or 'not' in words:
        return 'no'
    if 'yes' in words:
        return 'yes'
    
    return 'unknown'

def analyze_json_file(file_path: str):
    """Analyze the JSON file line by line."""
    distribution = defaultdict(int)
    questions_by_answer = defaultdict(list)
    
    with open(file_path, 'r') as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                answer = process_answer(data.get('answer', ''))
                distribution[answer] += 1
                questions_by_answer[answer].append({
                    'question': data.get('question', ''),
                    'image': data.get('image', ''),
                    'answer': data.get('answer', '')
                })
            except json.JSONDecodeError:
                continue
    
    total = sum(distribution.values())
    
    # Print summary
    print("\n=== ANSWER DISTRIBUTION ===")
    print(f"Total questions analyzed: {total}")
    for answer_type in ['yes', 'no', 'unknown']:
        count = distribution[answer_type]
        percentage = (count / total) * 100 if total > 0 else 0
        print(f"\n{answer_type.upper()} answers: {count} ({percentage:.2f}%)")
        if count > 0:
            print("\nExample questions:")
            for q in questions_by_answer[answer_type][:3]:  # Show first 3 examples
                print(f"\nQuestion: {q['question']}")
                print(f"Raw Answer: {q['answer']}")
                print(f"Image: {q['image']}")
                print("-" * 80)

if __name__ == "__main__":
    file_path = "/share/ssddata/sarimhashmi/iuxray/factuality/iuxray_factuality.jsonl"  # Replace with your JSON file path
    analyze_json_file(file_path)


=== ANSWER DISTRIBUTION ===
Total questions analyzed: 2573

YES answers: 669 (26.00%)

Example questions:

Question: Does the cardiomediastinal silhouette appear normal in the chest X-ray? Please choose from the following two options: [yes, no]
<image>
Raw Answer: Yes.
Image: CXR3030_IM-1405/0.png
--------------------------------------------------------------------------------

Question: Is the cardiac silhouette within normal size on the chest X-ray? Please choose from the following two options: [yes, no]
<image>
Raw Answer: Yes.
Image: CXR3957_IM-2022/0.png
--------------------------------------------------------------------------------

Question: Is the size of the heart normal on the chest X-ray? Please choose from the following two options: [yes, no]
<image>
Raw Answer: Yes.
Image: CXR621_IM-2203/0.png
--------------------------------------------------------------------------------

NO answers: 1874 (72.83%)

Example questions:

Question: Does the patient have a large pleural eff