# XR2Text: Radiologist Clinical Evaluation

## FULL DATASET - Clinical Validation on NVIDIA A100 80GB

**Authors**: S. Nikhil, Dadhania Omkumar  
**Supervisor**: Dr. Damodar Panigrahy  
**Clinical Evaluator**: [Radiologist Name]

---

**Dataset**: MIMIC-CXR (Full 30,633 images)  
**GPU**: NVIDIA A100 (80GB VRAM) - Run ALL notebooks on the A100 80GB!  
**Note**: With $10 credits, run everything on A100 80GB for maximum speed

---

This notebook prepares samples for radiologist evaluation and analyzes the results.

### Why Radiologist Evaluation is CRITICAL:
1. **Publication Requirement** - Top venues (MICCAI, IEEE TMI) require human evaluation
2. **Clinical Validity** - Automated metrics don't capture clinical correctness
3. **Error Detection** - Radiologists can identify dangerous hallucinations
4. **Real-World Applicability** - Proves the system is clinically useful

### Evaluation Protocol:
- **Blind Evaluation**: Model names hidden (Model_A, Model_B, etc.)
- **50 Random Samples**: Drawn at random from the test set (no difficulty stratification is applied)
- **5 Evaluation Dimensions**: Each scored 1-5
- **Error Tracking**: Critical errors, missing findings, hallucinations

### Evaluation Dimensions:
| Dimension | Description |
|-----------|-------------|
| Clinical Accuracy | Are the findings medically correct? |
| Completeness | Are all important findings mentioned? |
| Clinical Relevance | Is the report clinically useful? |
| Readability | Is the report clear and well-structured? |
| Actionability | Does it support clinical decisions? |

In [None]:
# ==============================================
# RUNPOD SETUP - Run this cell FIRST!
# ==============================================
# Prepares a fresh pod for this notebook: puts the project root on sys.path,
# creates the output folders, installs any missing Python packages, downloads
# the NLTK data and reports which GPU (if any) is visible to PyTorch.
import importlib
import os
import sys
import subprocess

print("=" * 60)
print("RUNPOD AUTO-SETUP (No SSH Required!)")
print("=" * 60)

# 1. Fix Python path
sys.path.insert(0, '..')

# 2. Create directories with proper permissions
print("")
print("[1/4] Creating directories...")
dirs_to_fix = [
    '../checkpoints',
    '../logs',
    '../data',
    '../data/figures',
    '../data/statistics',
    '../data/human_evaluation',
    '../data/ablation_results',
]

for d in dirs_to_fix:
    os.makedirs(d, exist_ok=True)
    try:
        os.chmod(d, 0o777)
    except OSError as exc:
        # Best effort only: the directory exists, we just could not widen
        # its permissions. Report it instead of hiding it.
        print(f"   Could not change permissions on {d}: {exc}")
print("   Directories created!")

# 3. Install missing packages (if any)
print("")
print("[2/4] Checking packages...")
required = ['timm', 'albumentations', 'loguru', 'rouge_score', 'bert_score']
for pkg in required:
    try:
        # The import name uses underscores where the pip name may use dashes
        importlib.import_module(pkg.replace('-', '_'))
    except ImportError:
        print(f"   Installing {pkg}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', pkg])
print("   Packages OK!")

# 4. Download NLTK data
print("")
print("[3/4] NLTK data...")
try:
    import nltk
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
    print("   NLTK data ready!")
except Exception as exc:
    # NLTK is optional for this notebook, so keep going - but say why it was
    # skipped (missing package, no network, ...) rather than swallowing it.
    print(f"   NLTK download skipped ({exc})")

# 5. GPU Check
print("")
print("[4/4] GPU Check...")
import torch
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   GPU: {gpu_name}")
    print(f"   VRAM: {gpu_mem:.1f} GB")
    if gpu_mem > 40:
        print("   >>> A100 80GB DETECTED - Full speed ahead!")
else:
    print("   WARNING: No GPU detected!")

print("")
print("=" * 60)
print("SETUP COMPLETE! Continue running cells below.")
print("=" * 60)

In [None]:
# =============================================================================
# Setup and Imports
# =============================================================================
import os
import sys
sys.path.insert(0, '..')

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm
import random
import json
import warnings
warnings.filterwarnings('ignore')

# Figure defaults shared by every plot in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'savefig.dpi': 300,
})

# Make sure the folders this notebook writes into exist
for output_dir in ('../data/human_evaluation', '../data/figures'):
    os.makedirs(output_dir, exist_ok=True)

print("Setup complete!")
print("Output directory: ../data/human_evaluation/")

## 1. Load Trained Model and Generate Samples

In [None]:
# =============================================================================
# Load Trained Model
# =============================================================================
# Chooses a checkpoint from ../checkpoints (best_model.pt if present, otherwise
# the checkpoint_epoch_*.pt file with the highest trailing number), switches the
# three optional flags on in its stored config, and leaves `model` on `device`
# in eval mode. If no checkpoint exists, only an error message is printed.
from src.models.xr2text import XR2TextModel
from src.data.dataloader import get_dataloaders

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Find best checkpoint
checkpoint_dir = Path('../checkpoints')
best_checkpoint = checkpoint_dir / 'best_model.pt'

if best_checkpoint.exists():
    checkpoint_path = best_checkpoint
else:
    epoch_checkpoints = list(checkpoint_dir.glob('checkpoint_epoch_*.pt'))
    checkpoint_path = (
        # The epoch number is the last "_"-separated token of the file stem
        max(epoch_checkpoints, key=lambda ckpt: int(ckpt.stem.split('_')[-1]))
        if epoch_checkpoints
        else None
    )

if not checkpoint_path:
    print("ERROR: No checkpoint found! Train the model first.")
else:
    print(f"Loading checkpoint: {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    config = checkpoint.get('config', {})

    # Enable all enhancement modules
    for flag in ('use_uncertainty', 'use_grounding', 'use_explainability'):
        config[flag] = True

    model = XR2TextModel.from_pretrained(str(checkpoint_path), config=config)
    model = model.to(device)
    model.eval()

    print(f"Model loaded! Epoch: {checkpoint.get('epoch', 'unknown')}")

In [None]:
# =============================================================================
# Load Test Data
# =============================================================================
# Only the third loader returned by get_dataloaders() is kept; the rest of the
# notebook samples from its dataset.
tokenizer = model.get_tokenizer()

loader_options = {
    'tokenizer': tokenizer,
    'batch_size': 1,    # One at a time for detailed analysis
    'num_workers': 2,
    'image_size': 512,  # A100 80GB
    'max_length': 300,
}
_, _, test_loader = get_dataloaders(**loader_options)

num_test_samples = len(test_loader.dataset)
print(f"Test samples available: {num_test_samples}")

## 2. Generate Reports for Radiologist Evaluation

In [None]:
# =============================================================================
# Generate Reports with Uncertainty and Explanations
# =============================================================================
# Draws NUM_SAMPLES test cases, generates a report for each one and collects
# the records in `evaluation_samples` (one dict per case) for the forms below.
NUM_SAMPLES = 50  # Number of samples for radiologist to evaluate
SAMPLE_SEED = 42  # Fixed seed so re-running the notebook draws the same cases

print(f"Generating {NUM_SAMPLES} samples for radiologist evaluation...")
print("This includes uncertainty scores and explanations.")
print()

evaluation_samples = []

# A dedicated RNG makes the draw reproducible without reseeding the global
# `random` module.
sampler = random.Random(SAMPLE_SEED)
dataset_size = len(test_loader.dataset)
sample_indices = sampler.sample(range(dataset_size), min(NUM_SAMPLES, dataset_size))

for idx in tqdm(sample_indices, desc="Generating reports"):
    # Get sample
    sample = test_loader.dataset[idx]
    image = sample['images'].unsqueeze(0).to(device)
    reference = sample['raw_texts']

    with torch.no_grad():
        # Generate report with analysis
        try:
            if hasattr(model, 'generate_with_analysis'):
                result = model.generate_with_analysis(
                    image,
                    max_length=300,
                    num_beams=4,
                )
                generated = result.get('report', '')
                confidence = result.get('confidence', 0.0)
                findings = result.get('detected_findings', [])
                hallucination_risk = result.get('hallucination_risk', 0.0)
            else:
                # Plain generation path: no analysis outputs are available,
                # so the extra fields fall back to neutral defaults.
                _, generated_list, _ = model.generate(
                    images=image,
                    max_length=300,
                    num_beams=4,
                )
                generated = generated_list[0] if generated_list else ''
                confidence = 0.0
                findings = []
                hallucination_risk = 0.0
        except Exception as e:
            # Best effort: one failing case should not abort the whole batch
            print(f"Error generating sample {idx}: {e}")
            continue

    evaluation_samples.append({
        'sample_id': f'SAMPLE_{idx:04d}',
        'image_index': idx,
        'generated_report': generated,
        'reference_report': reference,
        'model_confidence': confidence,
        # str() guards the join against non-string entries in the findings list
        'detected_findings': ', '.join(str(f) for f in findings) if findings else 'N/A',
        'hallucination_risk': hallucination_risk,
    })

print(f"\nGenerated {len(evaluation_samples)} samples for evaluation.")

## 3. Create Evaluation Forms for Radiologist

In [None]:
from datetime import datetime

# =============================================================================
# Create CSV Evaluation Form
# =============================================================================
# One row per generated sample, followed by blank columns for the radiologist.
# `timestamp` is reused by the HTML and Excel forms so all three share a name.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Create evaluation dataframe
eval_df = pd.DataFrame(evaluation_samples)

# Add evaluation columns (to be filled by radiologist)
radiologist_columns = (
    'clinical_accuracy',      # 1-5 scale
    'completeness',           # 1-5 scale
    'clinical_relevance',     # 1-5 scale
    'readability',            # 1-5 scale
    'actionability',          # 1-5 scale
    'critical_errors',        # Count of critical errors
    'missing_findings',       # List missing findings
    'hallucinated_findings',  # List false findings
    'evaluator_notes',        # Free text notes
)
for column in radiologist_columns:
    eval_df[column] = ''

# Save CSV form
csv_path = f'../data/human_evaluation/radiologist_eval_form_{timestamp}.csv'
eval_df.to_csv(csv_path, index=False)
print(f"CSV form saved: {csv_path}")

In [None]:
# =============================================================================
# Create Printable HTML Evaluation Form (for easier reading)
# =============================================================================
# `html_content` starts with the static part of the form: the page <head> with
# its CSS, the evaluator instructions and the 1-5 rating legend. The cells
# below append one block per sample plus a sign-off section, then write it out.

html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>XR2Text Radiologist Evaluation Form</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; }
        h1 { color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 10px; }
        h2 { color: #34495e; margin-top: 30px; }
        .sample { border: 1px solid #bdc3c7; padding: 20px; margin: 20px 0; border-radius: 8px; page-break-inside: avoid; }
        .sample-header { background: #3498db; color: white; padding: 10px; margin: -20px -20px 15px -20px; border-radius: 8px 8px 0 0; }
        .report-box { background: #f8f9fa; padding: 15px; border-left: 4px solid #3498db; margin: 10px 0; }
        .reference-box { background: #f8f9fa; padding: 15px; border-left: 4px solid #27ae60; margin: 10px 0; }
        .rating-table { width: 100%; border-collapse: collapse; margin: 15px 0; }
        .rating-table th, .rating-table td { border: 1px solid #bdc3c7; padding: 8px; text-align: center; }
        .rating-table th { background: #ecf0f1; }
        .notes-box { width: 100%; height: 60px; margin: 10px 0; }
        .instructions { background: #fff3cd; padding: 15px; border-radius: 8px; margin-bottom: 20px; }
        .legend { background: #e8f4f8; padding: 15px; border-radius: 8px; margin-bottom: 20px; }
        @media print { .sample { page-break-inside: avoid; } }
    </style>
</head>
<body>
    <h1>XR2Text Radiologist Evaluation Form</h1>
    
    <div class="instructions">
        <h3>Instructions for Evaluator</h3>
        <ol>
            <li>For each sample, compare the <b>Generated Report</b> with the <b>Reference Report</b></li>
            <li>Rate each dimension from <b>1 (Poor) to 5 (Excellent)</b></li>
            <li>Note any <b>critical errors</b> (e.g., missed pneumothorax, false cancer)</li>
            <li>List any <b>missing findings</b> that should have been reported</li>
            <li>List any <b>hallucinated findings</b> not present in the image</li>
        </ol>
    </div>
    
    <div class="legend">
        <h3>Rating Scale</h3>
        <table class="rating-table">
            <tr>
                <th>Score</th><th>Clinical Accuracy</th><th>Completeness</th><th>Relevance</th><th>Readability</th><th>Actionability</th>
            </tr>
            <tr>
                <td><b>5</b></td><td>All findings correct</td><td>All findings present</td><td>Highly useful</td><td>Very clear</td><td>Directly actionable</td>
            </tr>
            <tr>
                <td><b>4</b></td><td>Minor inaccuracies</td><td>Most findings</td><td>Useful</td><td>Clear</td><td>Mostly actionable</td>
            </tr>
            <tr>
                <td><b>3</b></td><td>Some errors</td><td>Key findings only</td><td>Somewhat useful</td><td>Acceptable</td><td>Partially actionable</td>
            </tr>
            <tr>
                <td><b>2</b></td><td>Significant errors</td><td>Missing key findings</td><td>Limited use</td><td>Confusing</td><td>Limited actionability</td>
            </tr>
            <tr>
                <td><b>1</b></td><td>Mostly incorrect</td><td>Most findings missing</td><td>Not useful</td><td>Unreadable</td><td>Not actionable</td>
            </tr>
        </table>
    </div>
"""

# Add each sample
# Appends one printable block per evaluation sample to `html_content`.
import html  # local import: only this cell needs HTML escaping

total_samples = len(evaluation_samples)
for i, sample in enumerate(evaluation_samples):
    # Report text is free text and may contain "<", ">" or "&" (e.g. "size < 1 cm").
    # Escape it so the browser shows it verbatim instead of parsing it as markup.
    sample_id_html = html.escape(str(sample['sample_id']))
    generated_html = html.escape(str(sample['generated_report']))
    reference_html = html.escape(str(sample['reference_report']))

    html_content += f"""
    <div class="sample">
        <div class="sample-header">
            <b>Sample {i+1} of {total_samples}</b> | ID: {sample_id_html}
        </div>
        
        <h4>Generated Report (AI):</h4>
        <div class="report-box">{generated_html}</div>
        
        <h4>Reference Report (Ground Truth):</h4>
        <div class="reference-box">{reference_html}</div>
        
        <h4>Evaluation Scores (Circle 1-5):</h4>
        <table class="rating-table">
            <tr>
                <th>Clinical Accuracy</th>
                <th>Completeness</th>
                <th>Clinical Relevance</th>
                <th>Readability</th>
                <th>Actionability</th>
            </tr>
            <tr>
                <td>1 &nbsp; 2 &nbsp; 3 &nbsp; 4 &nbsp; 5</td>
                <td>1 &nbsp; 2 &nbsp; 3 &nbsp; 4 &nbsp; 5</td>
                <td>1 &nbsp; 2 &nbsp; 3 &nbsp; 4 &nbsp; 5</td>
                <td>1 &nbsp; 2 &nbsp; 3 &nbsp; 4 &nbsp; 5</td>
                <td>1 &nbsp; 2 &nbsp; 3 &nbsp; 4 &nbsp; 5</td>
            </tr>
        </table>
        
        <p><b>Critical Errors (count):</b> _______ </p>
        <p><b>Missing Findings:</b> ________________________________________________</p>
        <p><b>Hallucinated Findings:</b> ____________________________________________</p>
        <p><b>Notes:</b></p>
        <textarea class="notes-box"></textarea>
    </div>
"""

# Sign-off section for the evaluator, followed by the closing document tags
html_footer = """
    <div style="margin-top: 40px; padding: 20px; background: #d5edda; border-radius: 8px;">
        <h3>Evaluator Information</h3>
        <p><b>Name:</b> _________________________________</p>
        <p><b>Qualification:</b> _________________________________</p>
        <p><b>Years of Experience:</b> _______</p>
        <p><b>Date:</b> _________________________________</p>
        <p><b>Signature:</b> _________________________________</p>
    </div>
</body>
</html>
"""
html_content = html_content + html_footer

# Write the finished form next to the CSV, sharing its timestamp
html_path = f'../data/human_evaluation/radiologist_eval_form_{timestamp}.html'
with open(html_path, 'w', encoding='utf-8') as html_file:
    html_file.write(html_content)

print(f"HTML form saved: {html_path}")
print("\nYou can open this HTML file in a browser and print it for your radiologist cousin!")

In [None]:
# =============================================================================
# Create Excel Form (easier for data entry)
# =============================================================================
# Writes a two-sheet workbook: "Instructions" (rating guide) and "Evaluation"
# (the same table as the CSV form). Any failure is reported and the notebook
# carries on with the CSV form.
try:
    excel_path = f'../data/human_evaluation/radiologist_eval_form_{timestamp}.xlsx'

    # Text shown on the instructions sheet, one row per line
    instruction_lines = [
        'XR2Text Radiologist Evaluation Form',
        '',
        'RATING SCALE (1-5):',
        '5 = Excellent - All findings correct, complete, and clinically useful',
        '4 = Good - Minor issues but clinically acceptable',
        '3 = Acceptable - Some errors but key findings present',
        '2 = Poor - Significant errors or missing key findings',
        '1 = Unacceptable - Mostly incorrect or unusable',
        '',
        'DIMENSIONS:',
        'Clinical Accuracy: Are the findings medically correct?',
        'Completeness: Are all important findings mentioned?',
        'Clinical Relevance: Is the report clinically useful?',
        'Readability: Is the report clear and well-structured?',
        'Actionability: Does it support clinical decisions?',
        '',
        'ERROR TRACKING:',
        'Critical Errors: Count dangerous mistakes (missed pneumothorax, false cancer, etc.)',
        'Missing Findings: List findings that should have been reported',
        'Hallucinated Findings: List false findings not present in image',
    ]
    instructions = pd.DataFrame({'Instructions': instruction_lines})

    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        instructions.to_excel(writer, sheet_name='Instructions', index=False)
        eval_df.to_excel(writer, sheet_name='Evaluation', index=False)

    print(f"Excel form saved: {excel_path}")
except Exception as e:
    print(f"Could not create Excel (install openpyxl): {e}")
    print("CSV form is still available.")

## 4. Quick View: Sample Reports for Review

In [None]:
# =============================================================================
# Display First 5 Samples for Quick Review
# =============================================================================
# Prints generated vs. reference text for the first five samples as a sanity
# check before the forms are sent out.
banner = "=" * 80

print(banner)
print("SAMPLE REPORTS FOR QUICK REVIEW")
print(banner)

for position, preview in enumerate(evaluation_samples[:5], start=1):
    print("\n" + banner)
    print(f"SAMPLE {position}: {preview['sample_id']}")
    print(banner)
    print("\n[GENERATED REPORT]")
    print(f"{preview['generated_report']}")
    print("\n[REFERENCE REPORT]")
    print(f"{preview['reference_report']}")
    print(f"\n[MODEL CONFIDENCE]: {preview['model_confidence']:.2%}")
    print(f"[DETECTED FINDINGS]: {preview['detected_findings']}")
    print()

## 5. Load and Analyze Completed Evaluations

**After your radiologist cousin completes the evaluation, run the cells below.**

In [None]:
# =============================================================================
# Load Completed Evaluation (after radiologist fills it)
# =============================================================================
# Reads the most recently modified radiologist_eval_form_*.csv into
# `completed_df` and sets `evaluation_complete` to True only when at least one
# clinical_accuracy cell has been filled in.

eval_dir = Path('../data/human_evaluation')
eval_files = list(eval_dir.glob('radiologist_eval_form_*.csv'))

if not eval_files:
    print("No evaluation files found. Run cells above to generate forms first.")
    evaluation_complete = False
else:
    # Newest file by modification time
    latest_eval = max(eval_files, key=os.path.getmtime)
    print(f"Loading evaluation from: {latest_eval}")

    completed_df = pd.read_csv(latest_eval)

    # A form counts as filled once any clinical_accuracy score is present
    scored_rows = completed_df['clinical_accuracy'].notna().sum()
    if scored_rows > 0:
        print(f"\nCompleted evaluations: {scored_rows} / {len(completed_df)}")
        evaluation_complete = True
    else:
        print("\nEvaluation form found but not yet filled.")
        print("Please have your radiologist cousin fill in the scores.")
        evaluation_complete = False

In [None]:
# =============================================================================
# Analyze Completed Evaluations
# =============================================================================
# Coerces the five score columns to numbers, prints mean +/- std for each, the
# overall mean (mean of the per-dimension means), the total critical-error
# count and a coarse quality rating. Does nothing but print a reminder when
# no completed evaluation has been loaded.

if globals().get('evaluation_complete', False):
    # Convert scores to numeric
    score_columns = ['clinical_accuracy', 'completeness', 'clinical_relevance', 'readability', 'actionability']
    for col in score_columns:
        completed_df[col] = pd.to_numeric(completed_df[col], errors='coerce')

    section_rule = "=" * 60

    # Calculate statistics
    print(section_rule)
    print("RADIOLOGIST EVALUATION RESULTS")
    print(section_rule)

    print("\nMean Scores (1-5 scale):")
    print("-" * 40)
    per_dimension = {
        col: (completed_df[col].mean(), completed_df[col].std())
        for col in score_columns
    }
    for col, (mean_score, std_score) in per_dimension.items():
        pretty_name = col.replace('_', ' ').title()
        print(f"  {pretty_name}: {mean_score:.2f} +/- {std_score:.2f}")

    # Overall score
    overall_mean = completed_df[score_columns].mean().mean()
    print(f"\n  OVERALL SCORE: {overall_mean:.2f} / 5.00")

    # Critical errors
    if 'critical_errors' in completed_df.columns:
        completed_df['critical_errors'] = pd.to_numeric(completed_df['critical_errors'], errors='coerce')
        total_critical = completed_df['critical_errors'].sum()
        print(f"\n  CRITICAL ERRORS: {total_critical} total")

    # Quality rating
    print("\n" + section_rule)
    print("QUALITY ASSESSMENT")
    print(section_rule)

    # (minimum overall score, message), checked from the highest band down
    rating_bands = (
        (4.0, "  Rating: EXCELLENT - Publication ready!"),
        (3.5, "  Rating: GOOD - Minor improvements needed"),
        (3.0, "  Rating: ACCEPTABLE - Some work needed"),
    )
    rating_message = next(
        (message for floor, message in rating_bands if overall_mean >= floor),
        "  Rating: NEEDS IMPROVEMENT",
    )
    print(rating_message)
else:
    print("Evaluation not yet completed. Please have your radiologist fill the form.")

In [None]:
# =============================================================================
# Visualize Evaluation Results
# =============================================================================
# Draws a 2x2 summary figure (bar chart of means, histogram of all scores,
# radar chart, box plots) and saves it to ../data/figures/. Relies on
# `completed_df` and `score_columns` from the analysis cell above.

if 'evaluation_complete' in dir() and evaluation_complete:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Plot 1: Score distribution by dimension
    score_means = [completed_df[col].mean() for col in score_columns]
    score_stds = [completed_df[col].std() for col in score_columns]
    labels = ['Clinical\nAccuracy', 'Completeness', 'Clinical\nRelevance', 'Readability', 'Actionability']

    bars = axes[0, 0].bar(labels, score_means, yerr=score_stds, capsize=5, color='#3498db', alpha=0.8)
    axes[0, 0].axhline(y=4.0, color='green', linestyle='--', label='Good threshold')
    axes[0, 0].axhline(y=3.0, color='orange', linestyle='--', label='Acceptable threshold')
    axes[0, 0].set_ylabel('Score (1-5)')
    axes[0, 0].set_title('Radiologist Evaluation Scores by Dimension')
    axes[0, 0].set_ylim(0, 5.5)
    axes[0, 0].legend()

    # Add value labels
    for bar, val in zip(bars, score_means):
        axes[0, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                        f'{val:.2f}', ha='center', va='bottom', fontsize=10)

    # Plot 2: Score distribution histogram
    all_scores = completed_df[score_columns].values.flatten()
    all_scores = all_scores[~np.isnan(all_scores)]
    # Bin edges at x.5 so each integer score 1-5 gets its own centered bar
    axes[0, 1].hist(all_scores, bins=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5], color='#2ecc71', edgecolor='black', alpha=0.8)
    axes[0, 1].set_xlabel('Score')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].set_title('Distribution of All Scores')
    axes[0, 1].set_xticks([1, 2, 3, 4, 5])

    # Plot 3: Radar chart of dimensions
    angles = np.linspace(0, 2*np.pi, len(score_columns), endpoint=False).tolist()
    # Repeat the first point so the polygon closes
    scores_radar = score_means + [score_means[0]]
    angles += angles[:1]

    # plt.subplots() already put a cartesian axes in the lower-left slot.
    # Remove it explicitly before adding the polar axes; otherwise recent
    # Matplotlib versions keep both and draw the radar over an empty grid.
    axes[1, 0].remove()
    ax3 = fig.add_subplot(2, 2, 3, projection='polar')
    ax3.plot(angles, scores_radar, 'o-', linewidth=2, color='#e74c3c')
    ax3.fill(angles, scores_radar, alpha=0.25, color='#e74c3c')
    ax3.set_xticks(angles[:-1])
    ax3.set_xticklabels(labels, size=8)
    ax3.set_ylim(0, 5)
    ax3.set_title('Evaluation Radar Chart', y=1.1)

    # Plot 4: Box plot by dimension
    # Tick labels are set afterwards instead of via boxplot(labels=...), which
    # is deprecated in newer Matplotlib; this form works on old and new versions.
    axes[1, 1].boxplot([completed_df[col].dropna() for col in score_columns])
    axes[1, 1].set_xticklabels(labels)
    axes[1, 1].set_ylabel('Score (1-5)')
    axes[1, 1].set_title('Score Distribution by Dimension')
    axes[1, 1].axhline(y=4.0, color='green', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.savefig('../data/figures/radiologist_evaluation_results.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nFigure saved: ../data/figures/radiologist_evaluation_results.png")
else:
    print("Run evaluation first to generate visualizations.")

## 6. Generate LaTeX Table for Paper

In [None]:
# =============================================================================
# Generate LaTeX Table for Publication
# =============================================================================
# Builds a table with one row per score dimension (mean, std) plus an overall
# row, prints it and writes it to ../data/statistics/. Uses `score_columns` and
# `overall_mean` from the analysis cell.

if globals().get('evaluation_complete', False):
    table_header = r"""
\begin{table}[h]
\centering
\caption{Radiologist Evaluation Results. A board-certified radiologist evaluated 50 randomly
selected generated reports on five dimensions using a 5-point Likert scale.}
\label{tab:human_eval}
\begin{tabular}{l|c|c}
\hline
\textbf{Evaluation Dimension} & \textbf{Mean Score} & \textbf{Std Dev} \\
\hline
"""

    # One "<name> & <mean> & <std> \\" line per dimension
    table_rows = "".join(
        f"{col.replace('_', ' ').title()} & {completed_df[col].mean():.2f} & {completed_df[col].std():.2f} \\\\\n"
        for col in score_columns
    )

    table_footer = (
        r"""\hline
\textbf{Overall} & \textbf{"""
        + f"{overall_mean:.2f}"
        + r"""} & - \\
\hline
\end{tabular}
\end{table}
"""
    )

    latex_table = table_header + table_rows + table_footer

    print("LATEX TABLE FOR PAPER:")
    print("=" * 60)
    print(latex_table)

    # Save to file
    latex_output_path = '../data/statistics/human_evaluation_latex_table.tex'
    with open(latex_output_path, 'w') as tex_file:
        tex_file.write(latex_table)
    print("\nSaved to: ../data/statistics/human_evaluation_latex_table.tex")
else:
    print("Complete the evaluation first to generate LaTeX table.")

## 7. Error Analysis

In [None]:
# =============================================================================
# Analyze Common Errors
# =============================================================================
# Summarises the radiologist's free-text error columns: how often each tracked
# finding is named as missing or hallucinated, and how many samples have at
# least one critical error.

# Finding names searched for in the free-text columns. Defined once up front so
# the missing and hallucinated analyses share the same list regardless of which
# of them has data.
TRACKED_FINDINGS = ('cardiomegaly', 'effusion', 'pneumonia', 'edema', 'atelectasis',
                    'pneumothorax', 'consolidation', 'nodule', 'mass')


def clean_text_entries(entries):
    """Return the usable free-text entries, stripped and lowercased.

    Args:
        entries: Iterable of raw cell values (strings, numbers, None, NaN).

    Returns:
        list[str]: One lowercase string per entry that is not None, blank or
        the text "nan". Non-string values are converted with ``str()``.
    """
    cleaned = []
    for entry in entries:
        if entry is None:
            continue
        text = str(entry).strip().lower()
        if text and text != 'nan':
            cleaned.append(text)
    return cleaned


def count_finding_mentions(entries, vocabulary=TRACKED_FINDINGS):
    """Count how often each tracked finding is mentioned in free-text entries.

    Matching is a plain lowercase substring count over all entries joined by
    spaces, so "mass" also matches inside "massive".

    Args:
        entries: Iterable of raw cell values; cleaned with clean_text_entries().
        vocabulary: Finding names to look for.

    Returns:
        dict[str, int]: finding -> number of occurrences, in vocabulary order,
        containing only findings mentioned at least once.
    """
    combined = ' '.join(clean_text_entries(entries))
    counts = {}
    for name in vocabulary:
        occurrences = combined.count(name)
        if occurrences > 0:
            counts[name] = occurrences
    return counts


if 'evaluation_complete' in dir() and evaluation_complete:
    print("="*60)
    print("ERROR ANALYSIS")
    print("="*60)

    # Missing and hallucinated findings get the same treatment
    text_analyses = (
        ('missing_findings', "\nMost Common Missing Findings:"),
        ('hallucinated_findings', "\nMost Common Hallucinated Findings:"),
    )
    for column, heading in text_analyses:
        if column not in completed_df.columns:
            continue
        entries = clean_text_entries(completed_df[column].dropna())
        if not entries:
            continue
        print(heading)
        print("-"*40)
        for finding, count in count_finding_mentions(entries).items():
            print(f"  {finding}: {count} times")

    # Samples with critical errors
    if 'critical_errors' in completed_df.columns:
        # Coerce here as well so this cell does not depend on the analysis
        # cell having converted the column already.
        critical_counts = pd.to_numeric(completed_df['critical_errors'], errors='coerce')
        critical = completed_df[critical_counts > 0]
        total_rows = len(completed_df)
        print(f"\nSamples with Critical Errors: {len(critical)} / {total_rows}")
        if total_rows:
            print(f"Critical Error Rate: {len(critical)/total_rows*100:.1f}%")
else:
    print("Complete the evaluation first to analyze errors.")

## 8. Summary and Export

In [None]:
from datetime import datetime

# =============================================================================
# Final Summary
# =============================================================================
# Prints either the evaluation results (when a completed form was loaded) or
# the list of generated forms with next steps. The cell is written so it also
# works in a fresh kernel session where only the "load completed evaluation"
# cell has been run.

print("="*70)
print("RADIOLOGIST EVALUATION SUMMARY")
print("="*70)

# csv_path / html_path exist only if the form-generation cells ran in this
# session; fall back to a placeholder instead of raising NameError.
NOT_GENERATED = "(not generated in this session)"
csv_form_path = globals().get('csv_path', NOT_GENERATED)
html_form_path = globals().get('html_path', NOT_GENERATED)

if 'evaluation_complete' in dir() and evaluation_complete:
    # Recompute the scores locally so the summary does not depend on the
    # analysis cell having defined `overall_mean`.
    summary_columns = ['clinical_accuracy', 'completeness', 'clinical_relevance', 'readability', 'actionability']
    summary_scores = completed_df[summary_columns].apply(pd.to_numeric, errors='coerce')
    summary_means = summary_scores.mean()
    summary_overall = summary_means.mean()
    # Count only rows the radiologist actually scored, not every row in the form
    samples_evaluated = int(summary_scores['clinical_accuracy'].notna().sum())

    print(f"""
EVALUATION DETAILS:
  Samples Evaluated: {samples_evaluated}
  Evaluator: [Your Radiologist Cousin's Name]
  Date: {datetime.now().strftime('%Y-%m-%d')}

RESULTS:
  Overall Score: {summary_overall:.2f} / 5.00
  Clinical Accuracy: {summary_means['clinical_accuracy']:.2f}
  Completeness: {summary_means['completeness']:.2f}
  Clinical Relevance: {summary_means['clinical_relevance']:.2f}
  Readability: {summary_means['readability']:.2f}
  Actionability: {summary_means['actionability']:.2f}

FILES GENERATED:
  - Evaluation Form (CSV): {csv_form_path}
  - Evaluation Form (HTML): {html_form_path}
  - Results Figure: ../data/figures/radiologist_evaluation_results.png
  - LaTeX Table: ../data/statistics/human_evaluation_latex_table.tex

PUBLICATION CLAIM:
  "A board-certified radiologist evaluated 50 randomly selected generated
   reports, achieving an average score of {summary_overall:.2f}/5.0 across five
   dimensions: clinical accuracy, completeness, clinical relevance,
   readability, and actionability."
""")
else:
    print(f"""
EVALUATION FORMS GENERATED:
  - CSV Form: {csv_form_path}
  - HTML Form: {html_form_path}

NEXT STEPS:
  1. Send the HTML form to your radiologist cousin
  2. Ask them to evaluate each sample (takes ~1-2 hours for 50 samples)
  3. Enter scores into the CSV file
  4. Re-run cells 14-22 to analyze results
  
TIPS FOR YOUR RADIOLOGIST:
  - Compare Generated Report vs Reference Report
  - Rate each dimension 1-5
  - Note any dangerous errors (missed pneumothorax, false cancer, etc.)
  - List missing and hallucinated findings
""")

print("="*70)

## 9. FINAL STEP: Download Everything to Local IDE

**Run this cell LAST after completing ALL 6 notebooks!**

This will zip the entire backend folder with all results for download to your local machine.

In [None]:
# =============================================================================
# FINAL STEP: ZIP EVERYTHING FOR DOWNLOAD TO LOCAL IDE
# =============================================================================
# Run this cell AFTER completing ALL 6 notebooks!
# This packages EVERYTHING for download to your local machine.
# Right-click on the zip file in Jupyter and click "Download"
#
# The archive is created in the parent directory by the external `zip`
# program; entries listed below that do not exist are skipped.

import os
import subprocess
from datetime import datetime
from pathlib import Path

print("=" * 70)
print("PACKAGING ENTIRE PROJECT FOR DOWNLOAD")
print("=" * 70)

# Create timestamp for unique filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_filename = f"backend_complete_{timestamp}.zip"

# ALL files and folders to include (from ALL 6 notebooks)
items_to_zip = [
    # Model checkpoint (most important!)
    "checkpoints/best_model.pt",

    # All data outputs from ALL notebooks
    "data/statistics",                  # CSVs, training history, results
    "data/figures",                     # ALL plots from ALL notebooks
    "data/ablation_results",           # Notebook 04: Ablation study
    "data/human_evaluation",           # Notebook 06: Radiologist evaluation
    "data/cross_dataset",              # Notebook 05: Cross-dataset results

    # All 6 notebooks with executed outputs
    "notebooks/01_data_exploration.ipynb",
    "notebooks/02_model_training.ipynb",
    "notebooks/03_evaluation_metrics.ipynb",
    "notebooks/04_ablation_study.ipynb",
    "notebooks/05_cross_dataset_evaluation.ipynb",
    "notebooks/06_radiologist_evaluation.ipynb",

    # Paper folder
    "paper",

    # Training logs
    "logs",

    # Config files
    "configs",

    # Source code
    "src",

    # Other important files
    "requirements.txt",
    "CLOUD_GPU_GUIDE.md",
]

# Check what exists and build list
print("\nChecking files to include:")
print("-" * 50)
zip_items = []
total_size_mb = 0

for item in items_to_zip:
    full_path = Path("..") / item
    if not full_path.exists():
        print(f"  [--] {item} (not found, skipping)")
        continue

    zip_items.append(item)
    if full_path.is_file():
        size_mb = full_path.stat().st_size / (1024**2)
        shown_name = item
    else:
        # Directory - sum the sizes of all files below it
        dir_size = sum(f.stat().st_size for f in full_path.rglob('*') if f.is_file())
        size_mb = dir_size / (1024**2)
        shown_name = f"{item}/"
    total_size_mb += size_mb
    print(f"  [OK] {shown_name} ({size_mb:.1f} MB)")

print("-" * 50)
print(f"Total estimated size: {total_size_mb:.1f} MB ({total_size_mb/1024:.2f} GB)")

# Create zip file
print(f"\nCreating {zip_filename}...")
print("This may take a few minutes for large files...")

# Argument list + cwd instead of a shell string: no quoting problems with
# unusual paths and no shell involved. Paths are relative to the parent folder.
zip_command = ["zip", "-r", zip_filename, *zip_items]
manual_hint = f"  cd /workspace && zip -r {zip_filename} backend/"
zip_path = f"../{zip_filename}"

result = None
if not zip_items:
    # `zip` exits with an error when given nothing to add; say so up front
    print("\nNothing to package: none of the expected files or folders were found.")
else:
    try:
        result = subprocess.run(zip_command, cwd="..", capture_output=True, text=True, timeout=600)
    except subprocess.TimeoutExpired:
        print("\nZip command timed out. Try running manually in terminal:")
        print(manual_hint)
    except OSError as e:
        # Includes FileNotFoundError when the `zip` program is not installed
        print(f"\nError: {e}")
        print("\nManual alternative - run in Jupyter terminal:")
        print(manual_hint)

if result is None:
    pass  # already reported above
elif result.returncode != 0:
    print(f"\nError creating zip: {result.stderr}")
elif not os.path.exists(zip_path):
    print(f"\nError creating zip: zip reported success but {zip_path} was not found.")
else:
    final_size_mb = os.path.getsize(zip_path) / (1024**2)
    final_size_gb = final_size_mb / 1024
    rule = "=" * 70

    print(f"\n{rule}")
    print("SUCCESS! ZIP FILE CREATED")
    print(rule)
    print(f"\nFilename: {zip_filename}")
    if final_size_gb >= 1:
        print(f"Size: {final_size_gb:.2f} GB")
    else:
        print(f"Size: {final_size_mb:.1f} MB")

    print(f"\n{rule}")
    print("WHAT'S INCLUDED:")
    print(rule)
    print("""
  TRAINED MODEL:
    - checkpoints/best_model.pt (~2GB trained weights)
  
  DATA & RESULTS:
    - data/statistics/ (training history, metrics CSVs)
    - data/figures/ (ALL plots from ALL notebooks)
    - data/ablation_results/ (ablation study data)
    - data/human_evaluation/ (radiologist forms & results)
    - data/cross_dataset/ (IU X-Ray evaluation)
  
  NOTEBOOKS (with executed outputs):
    - 01_data_exploration.ipynb
    - 02_model_training.ipynb
    - 03_evaluation_metrics.ipynb
    - 04_ablation_study.ipynb
    - 05_cross_dataset_evaluation.ipynb
    - 06_radiologist_evaluation.ipynb
  
  OTHER:
    - paper/ (LaTeX source if exists)
    - logs/ (training logs)
    - configs/ (all YAML configs)
    - src/ (source code)
""")

    print(rule)
    print("HOW TO DOWNLOAD:")
    print(rule)
    print(f"""
  1. In Jupyter file browser, go UP one folder (to /workspace)
  2. Find: {zip_filename}
  3. Right-click → Download
  4. Wait for download to complete (~2-3GB)
""")

    print(rule)
    print("HOW TO EXTRACT LOCALLY (Windows PowerShell):")
    print(rule)
    print(f"""
  Expand-Archive -Path "C:\\Users\\YourName\\Downloads\\{zip_filename}" -DestinationPath "f:\\MajorProject\\backend" -Force
""")

    print(rule)
    print("DONE! Your local IDE will show all training results!")
    print(rule)