# Model Comparison: Text-Guided Segmentation vs CIPS-Net

This notebook compares all trained text-guided segmentation models against the CIPS-Net baseline.

## Evaluation Metrics
- **mIoU**: Mean Intersection over Union
- **mDice**: Mean Dice coefficient  
- **Per-class IoU/Dice**: For each cell type (not yet computed in this notebook; the tables and plots below use only the aggregate mIoU/mDice)

## Datasets
- **PanNuke**: 3-fold cross-validation (training)
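- **CoNSeP**: Zero-shot evaluation (planned — see Next Steps; not yet run in this notebook)
- **MoNuSAC**: Zero-shot evaluation (planned — see Next Steps; not yet run in this notebook)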

In [None]:
import sys
import os
import json
from pathlib import Path

WORKSPACE = "/mnt/e3dbc9b9-6856-470d-84b1-ff55921cd906/Datasets/Nikhil/Histopathology_Work"
sys.path.insert(0, WORKSPACE)

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# Set style
# Notebook-wide plotting defaults: matplotlib's 'seaborn-v0_8-whitegrid' style
# sheet and seaborn's 'tab10' colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('tab10')

# Directory that is scanned for per-model `results_*.json` files; the CSV
# table, figures and markdown report produced by later cells are written here
# as well.
CHECKPOINT_DIR = Path(f"{WORKSPACE}/checkpoints/text_guided")

# Show the resolved path and whether it exists.
print(f"Checkpoint directory: {CHECKPOINT_DIR}")
print(f"Exists: {CHECKPOINT_DIR.exists()}")

In [None]:
# Model information
# Display metadata per model key. `aggregate_results` and
# `plot_training_curves` look models up here by the key stored in the loaded
# results and fall back to the raw key when a model is not listed.
#   name  - label used in tables, plots and the report
#   venue - publication venue ('Baseline' for CIPS-Net)
#   type  - short description of the approach
MODEL_INFO = {
    'clipseg': {'name': 'CLIPSeg', 'venue': 'CVPR 2022', 'type': 'Decoder'},
    'clipseg_rd64': {'name': 'CLIPSeg-RD64', 'venue': 'CVPR 2022', 'type': 'Decoder'},
    'clipseg_rd128': {'name': 'CLIPSeg-RD128', 'venue': 'CVPR 2022', 'type': 'Decoder'},
    'lseg': {'name': 'LSeg', 'venue': 'ICLR 2022', 'type': 'Dense Prediction'},
    'groupvit': {'name': 'GroupViT', 'venue': 'CVPR 2022', 'type': 'Grouping'},
    'san': {'name': 'SAN', 'venue': 'CVPR 2023', 'type': 'Side Adapter'},
    'fc_clip': {'name': 'FC-CLIP', 'venue': 'NeurIPS 2023', 'type': 'Frozen CLIP'},
    'fc_clip_convnext': {'name': 'FC-CLIP-ConvNext', 'venue': 'NeurIPS 2023', 'type': 'Frozen CLIP'},
    'ovseg': {'name': 'OVSeg', 'venue': 'CVPR 2023', 'type': 'Open Vocabulary'},
    'cat_seg': {'name': 'CAT-Seg', 'venue': 'CVPR 2024', 'type': 'Cost Aggregation'},
    'sed': {'name': 'SED', 'venue': 'CVPR 2024', 'type': 'Decoupled'},
    'openseed': {'name': 'OpenSeeD', 'venue': 'ICCV 2023', 'type': 'Universal'},
    'odise': {'name': 'ODISE', 'venue': 'CVPR 2023', 'type': 'Diffusion'},
    'semantic_sam': {'name': 'Semantic-SAM', 'venue': 'ECCV 2024', 'type': 'SAM-based'},
    'cips_net': {'name': 'CIPS-Net', 'venue': 'Baseline', 'type': 'Instruction-guided'},
}

# Class labels; presumably the cell-type classes of the training dataset --
# TODO confirm. Within the visible cells they are referenced only inside
# `radar_chart`.
CLASS_NAMES = ['Neoplastic', 'Inflammatory', 'Connective', 'Dead', 'Epithelial']

print(f"Models to compare: {len(MODEL_INFO)}")

In [None]:
# Load all results
def load_results():
    """Load per-fold results for every trained model.

    Scans ``CHECKPOINT_DIR`` for ``results_*.json`` files. Each file
    contributes one fold entry, keyed by the file's ``fold`` value (default
    ``0``), under the model named by its ``model_name`` value (default: the
    file name without the ``results_`` prefix). If the CIPS-Net checkpoint
    file exists, a placeholder ``cips_net`` entry with zero scores is added.

    Returns:
        defaultdict: ``{model_key: {'folds': {fold: {'iou', 'dice',
        'history'}}}}``. The placeholder CIPS-Net fold has no ``'history'``.
    """
    results = defaultdict(dict)
    prefix = 'results_'

    # Load text-guided model results
    if CHECKPOINT_DIR.exists():
        # Path.glob yields entries in arbitrary order; sorting makes the model
        # and fold insertion order (and thus the "first fold" used by the
        # training-curve plot) reproducible between runs.
        for result_file in sorted(CHECKPOINT_DIR.glob(f'{prefix}*.json')):
            with open(result_file, encoding='utf-8') as f:
                data = json.load(f)

            # Strip only the leading prefix so a model name that itself
            # contains "results_" is not mangled.
            model_name = data.get('model_name', result_file.stem[len(prefix):])
            fold = data.get('fold', 0)

            folds = results[model_name].setdefault('folds', {})
            folds[fold] = {
                'iou': data.get('best_iou', 0),
                'dice': data.get('final_dice', 0),
                'history': data.get('history', {}),
            }

    # Load CIPS-Net baseline results (if available)
    cips_checkpoint = Path(f"{WORKSPACE}/checkpoints/best_cipsnet_binary.pth")
    if cips_checkpoint.exists():
        # Placeholder - update with actual CIPS-Net results.
        # NOTE: these zeros are not measured scores; until they are replaced
        # the baseline row in every table and plot is meaningless.
        results['cips_net'] = {
            'folds': {
                0: {'iou': 0.0, 'dice': 0.0},
            }
        }

    return results

# Populate the notebook-wide `results` mapping (used by the aggregation and
# plotting cells below) and print how many folds were found for each model.
results = load_results()
print(f"Loaded results for {len(results)} models:")
for model, data in results.items():
    folds = list(data['folds'].keys())
    print(f"  {model}: {len(folds)} fold(s)")

In [None]:
# Aggregate results
def aggregate_results(results):
    """Summarise each model's IoU and Dice across its folds.

    Args:
        results: mapping ``model_key -> {'folds': {fold: {'iou': ...,
            'dice': ...}}}``. Models whose ``'folds'`` mapping is empty are
            skipped.

    Returns:
        pandas.DataFrame with one row per model and the columns
        ``model_key, Model, Venue, Type, mIoU, IoU_std, mDice, Dice_std,
        n_folds``, sorted by ``mIoU`` in descending order. The standard
        deviations come from ``np.std`` with its default settings (population
        std), so a model with a single fold gets 0. When there is nothing to
        summarise, an empty frame with the same columns is returned.
    """
    columns = [
        'model_key', 'Model', 'Venue', 'Type',
        'mIoU', 'IoU_std', 'mDice', 'Dice_std', 'n_folds',
    ]
    summary = []

    for model_name, data in results.items():
        folds = data['folds']
        if not folds:
            continue

        ious = [f['iou'] for f in folds.values()]
        dices = [f['dice'] for f in folds.values()]

        # Unknown models are still reported, labelled with their raw key.
        info = MODEL_INFO.get(model_name, {'name': model_name, 'venue': 'Unknown', 'type': 'Unknown'})

        summary.append({
            'model_key': model_name,
            'Model': info['name'],
            'Venue': info['venue'],
            'Type': info['type'],
            'mIoU': np.mean(ious),
            'IoU_std': np.std(ious),
            'mDice': np.mean(dices),
            'Dice_std': np.std(dices),
            'n_folds': len(folds),
        })

    # Explicit columns keep the schema stable for an empty summary; without
    # them the frame has no 'mIoU' column and the sort raises KeyError before
    # callers can test `.empty`.
    df = pd.DataFrame(summary, columns=columns)
    return df.sort_values('mIoU', ascending=False)

# Build the per-model summary table used by every later cell; the bare
# expression on the last line makes the notebook render the table.
summary_df = aggregate_results(results)
summary_df

In [None]:
# Print the comparison table and export the numeric summary as CSV
if summary_df.empty:
    print("No results to compare yet. Run training first.")
else:
    print("\n" + "="*80)
    print("MODEL COMPARISON TABLE")
    print("="*80)

    # Display copy: the descriptive columns plus one "mean ± std" string per
    # metric. The numeric `summary_df` itself is left untouched.
    display_df = summary_df[['Model', 'Venue', 'Type']].copy()
    display_df['mIoU'] = [
        f"{mean:.4f} ± {std:.4f}"
        for mean, std in zip(summary_df['mIoU'], summary_df['IoU_std'])
    ]
    display_df['mDice'] = [
        f"{mean:.4f} ± {std:.4f}"
        for mean, std in zip(summary_df['mDice'], summary_df['Dice_std'])
    ]

    print(display_df.to_string(index=False))

    # The CSV gets the full numeric summary, not the formatted strings.
    output_path = CHECKPOINT_DIR / 'comparison_table.csv'
    summary_df.to_csv(output_path, index=False)
    print(f"\nTable saved to {output_path}")

In [None]:
# Bar chart comparison
# Two side-by-side horizontal bar charts (mIoU and mDice), one bar per model
# with the across-fold std as error bar, saved as comparison_bar_chart.png.
if not summary_df.empty:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    x = np.arange(len(summary_df))

    # (metric column, std column, bar colour, panel title) for each panel.
    panels = [
        ('mIoU', 'IoU_std', 'steelblue', 'Mean IoU Comparison'),
        ('mDice', 'Dice_std', 'coral', 'Mean Dice Comparison'),
    ]

    for ax, (metric, std_col, color, title) in zip(axes, panels):
        ax.barh(x, summary_df[metric], xerr=summary_df[std_col],
                color=color, alpha=0.8, capsize=3)
        ax.set_yticks(x)
        ax.set_yticklabels(summary_df['Model'])
        ax.set_xlabel(metric)
        ax.set_title(title)
        ax.set_xlim(0, 1)
        # Rows follow summary_df order; inverting puts the first row on top.
        ax.invert_yaxis()

        # Add value labels just right of each error bar
        for i, (v, std) in enumerate(zip(summary_df[metric], summary_df[std_col])):
            ax.text(v + std + 0.02, i, f'{v:.4f}', va='center', fontsize=9)

    plt.tight_layout()
    plt.savefig(CHECKPOINT_DIR / 'comparison_bar_chart.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("No results to visualize yet.")

In [None]:
# Training curves comparison
def plot_training_curves(results, metric='val_iou'):
    """Plot one history curve per model for the given metric.

    Only the first fold stored for each model is used. Models without folds,
    or whose first fold has no history for ``metric``, are skipped.

    Args:
        results: mapping ``model_key -> {'folds': {fold: {'history':
            {metric: [values...]}}}}``.
        metric: key to read from each fold's ``history`` mapping.

    Returns:
        The matplotlib figure containing the curves.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    for model_name, data in results.items():
        folds = data.get('folds', {})
        if not folds:
            continue

        # Use first fold's history; `or {}` also covers a history stored as
        # null in the results file.
        first_fold = next(iter(folds.values()))
        history = first_fold.get('history') or {}

        if metric in history:
            info = MODEL_INFO.get(model_name, {'name': model_name})
            ax.plot(history[metric], label=info['name'], alpha=0.8)

    pretty_metric = metric.replace('_', ' ').title()
    ax.set_xlabel('Epoch')
    ax.set_ylabel(pretty_metric)
    ax.set_title(f'Training Curves - {pretty_metric}')

    # Only draw a legend when at least one curve was plotted; calling
    # legend() with no labelled artists just emits a warning.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

# Plot and save the 'val_iou' curves whenever any results were loaded; the
# fallback message is printed only when `results` is empty.
if results:
    fig = plot_training_curves(results, 'val_iou')
    fig.savefig(CHECKPOINT_DIR / 'training_curves_iou.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("No training history to plot.")

In [None]:
# Radar chart for multi-metric comparison
def radar_chart(summary_df, metrics=('mIoU', 'mDice')):
    """Create a radar (polar) chart comparing models across metrics.

    Each spoke is one model (row of ``summary_df``) and each metric is drawn
    as one closed, filled polygon. A per-class breakdown would need per-class
    metrics, which ``summary_df`` does not carry.

    Args:
        summary_df: DataFrame with a ``'Model'`` column and one numeric column
            per entry in ``metrics``.
        metrics: names of the columns to draw. ``'mIoU'`` and ``'mDice'`` keep
            their fixed colours; any other metric takes the next colour from
            matplotlib's colour cycle.

    Returns:
        The matplotlib figure, or ``None`` (after printing a message) when
        fewer than two models are available.
    """
    if summary_df.empty or len(summary_df) < 2:
        print("Need at least 2 models for radar chart")
        return None

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(projection='polar'))

    # One evenly spaced angle per model; repeat the first to close the polygon.
    n_models = len(summary_df)
    angles = [n / float(n_models) * 2 * np.pi for n in range(n_models)]
    angles += angles[:1]

    preset_colors = {'mIoU': 'steelblue', 'mDice': 'coral'}
    for metric in metrics:
        values = summary_df[metric].tolist()
        values += values[:1]
        line, = ax.plot(angles, values, 'o-', linewidth=2, label=metric,
                        color=preset_colors.get(metric))
        # Fill with whatever colour the outline ended up with.
        ax.fill(angles, values, alpha=0.25, color=line.get_color())

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(summary_df['Model'], size=10)
    ax.set_ylim(0, 1)
    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0))
    ax.set_title('Model Performance Comparison', size=14)

    plt.tight_layout()
    return fig

# Draw and save the radar chart only when at least two models are available;
# `radar_chart` returns None when it cannot draw, hence the second check.
if len(summary_df) >= 2:
    fig = radar_chart(summary_df)
    if fig:
        fig.savefig(CHECKPOINT_DIR / 'radar_comparison.png', dpi=150, bbox_inches='tight')
        plt.show()

In [None]:
# Generate final report
def generate_report(summary_df):
    """Generate a markdown report from the model summary table.

    Args:
        summary_df: DataFrame with the columns ``Model``, ``Venue``, ``mIoU``,
            ``IoU_std``, ``mDice`` and ``Dice_std``. Rows are ranked in the
            order given, and the first row is reported as the best model, so
            the frame is expected to be sorted best-first already.

    Returns:
        str: the report text. For an empty frame the overview omits the
        best-model lines and the results table has only its header.
    """
    report = []
    report.append("# Text-Guided Segmentation Experiment Report\n")
    report.append("## Overview\n")
    report.append(f"- **Total models evaluated**: {len(summary_df)}\n")

    if not summary_df.empty:
        best_model = summary_df.iloc[0]
        report.append(f"- **Best model**: {best_model['Model']}\n")
        report.append(f"- **Best mIoU**: {best_model['mIoU']:.4f} ± {best_model['IoU_std']:.4f}\n")
        report.append(f"- **Best mDice**: {best_model['mDice']:.4f} ± {best_model['Dice_std']:.4f}\n")

    report.append("\n## Results Table\n")
    report.append("| Rank | Model | Venue | mIoU | mDice |\n")
    report.append("|------|-------|-------|------|-------|\n")

    # Rank is the row position. Counting with enumerate avoids a per-row
    # index lookup and also works when index labels are not unique.
    for rank, (_, row) in enumerate(summary_df.iterrows(), start=1):
        report.append(f"| {rank} | {row['Model']} | {row['Venue']} | "
                      f"{row['mIoU']:.4f}±{row['IoU_std']:.4f} | "
                      f"{row['mDice']:.4f}±{row['Dice_std']:.4f} |\n")

    report.append("\n## Conclusions\n")
    report.append("(Add your analysis here)\n")

    return ''.join(report)

report = generate_report(summary_df)
print(report)

# Save report
report_path = CHECKPOINT_DIR / 'experiment_report.md'
# The report is generated even when no results were found, in which case the
# checkpoint directory may not exist yet.
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
# The report contains '±', so write it as UTF-8 rather than relying on the
# platform's default encoding.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)
print(f"\nReport saved to {report_path}")

## Next Steps

1. **Train remaining models**: Use `run_all_experiments.py` or the unified notebook
2. **Zero-shot evaluation**: Test on CoNSeP and MoNuSAC
3. **Statistical analysis**: Paired t-tests for significance
4. **Qualitative analysis**: Visualize segmentation outputs
5. **Paper-ready figures**: Generate publication-quality plots