In [3]:
# FIX: Install and verify rouge-score package
# Run this cell FIRST if you get "ModuleNotFoundError: No module named 'rouge'"

import sys
import subprocess

def install_package(package_name):
    """Install package using pip"""
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name, "--quiet"])
        return True
    except subprocess.CalledProcessError:
        return False

# Check if rouge is available
try:
    from rouge import Rouge
    print("✓ rouge library is already installed")
except ImportError:
    print("⚠ rouge library not found. Installing rouge-score...")
    if install_package("rouge-score"):
        print("✓ Successfully installed rouge-score")
        # Try importing again
        try:
            from rouge import Rouge
            print("✓ rouge library now available")
        except ImportError:
            print("✗ Installation succeeded but import still fails.")
            print("  Please restart the kernel and run this cell again.")
    else:
        print("✗ Failed to install rouge-score")
        print("  Please run manually: pip install rouge-score")
        print("  Then restart the kernel.")


⚠ rouge library not found. Installing rouge-score...
✓ Successfully installed rouge-score
✗ Installation succeeded but import still fails.
  Please restart the kernel and run this cell again.


In [1]:
# Add project root to path
import sys
sys.path.append('..')

# Import models and utilities
try:
    from models.textrank import TextRankSummarizer
    from models.bart import BARTSummarizer
    from models.pegasus import PEGASUSSummarizer
    from utils.evaluator import SummarizerEvaluator
    from utils.data_loader import DataLoader
    print("✓ All imports successful")
except ImportError as e:
    print(f"✗ Import error: {e}")
    print("  Make sure you've run the previous cell to install dependencies")
    raise

# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import json

plt.style.use('seaborn-v0_8')

✗ Import error: No module named 'rouge'
  Make sure you've run the previous cell to install dependencies


ModuleNotFoundError: No module named 'rouge'

In [None]:
print("Loading test dataset...")
loader = DataLoader()

# Load your saved samples (or load fresh)
try:
    test_data = loader.load_samples('../data/samples/test_50.json')
    print(f"✓ Loaded {len(test_data)} test samples")
except:
    print("Downloading test data...")
    test_data = loader.load_cnn_dailymail(split='test', num_samples=50)
    loader.save_samples(test_data, '../data/samples/test_50.json')
    print(f"✓ Downloaded and saved {len(test_data)} samples")

# Extract texts and references
texts = [item['article'] for item in test_data]
references = [item['reference_summary'] for item in test_data]

print(f"\nDataset Statistics:")
print(f"  - Number of samples: {len(texts)}")
print(f"  - Avg article length: {np.mean([len(t.split()) for t in texts]):.0f} words")
print(f"  - Avg reference length: {np.mean([len(r.split()) for r in references]):.0f}words")

In [None]:
print("\nInitializing models...")

models = {
    'TextRank': TextRankSummarizer(),
    'BART': BARTSummarizer(device='cpu'),
    'PEGASUS': PEGASUSSummarizer(device='cpu')
}

print("✓ All models ready")

# Cell 4: Generate Summaries (Takes ~10-20 minutes for 50 samples)
print("\nGenerating summaries for all models...")
print("This will take 10-20 minutes. Grab a coffee! ☕")

all_summaries = {}
all_times = {}

for model_name, model in models.items():
    print(f"\n{model_name}:")
    summaries = []
    times = []
    
    for i, text in enumerate(texts[:10], 1):  # Start with 10 samples
        print(f"  Processing {i}/10...", end='\r')
        
        if model_name == 'TextRank':
            result = model.summarize_with_metrics(text)
        else:
            result = model.summarize_with_metrics(text, max_length=100, min_length=30)
        
        summaries.append(result['summary'])
        times.append(result['metadata']['processing_time'])
    
    all_summaries[model_name] = summaries
    all_times[model_name] = times
    print(f"  ✓ Completed {model_name}                    ")

print("\n✓ All summaries generated!")

In [None]:
print("\nEvaluating models...")

evaluator = SummarizerEvaluator()
evaluation_results = {}

for model_name in models.keys():
    print(f"\nEvaluating {model_name}...")
    results = evaluator.evaluate_batch(
        all_summaries[model_name],
        references[:len(all_summaries[model_name])],
        model_name
    )
    results['avg_time'] = np.mean(all_times[model_name])
    results['std_time'] = np.std(all_times[model_name])
    evaluation_results[model_name] = results

print("✓ Evaluation complete")

In [None]:
print("\n" + "="*70)
print("EVALUATION RESULTS")
print("="*70)

results_table = []

for model_name, results in evaluation_results.items():
    results_table.append({
        'Model': model_name,
        'Type': 'Extractive' if model_name == 'TextRank' else 'Abstractive',
        'ROUGE-1': f"{results['rouge_1_f1_mean']:.4f} ± {results['rouge_1_f1_std']:.4f}",
        'ROUGE-2': f"{results['rouge_2_f1_mean']:.4f} ± {results['rouge_2_f1_std']:.4f}",
        'ROUGE-L': f"{results['rouge_l_f1_mean']:.4f} ± {results['rouge_l_f1_std']:.4f}",
        'Avg Time (s)': f"{results['avg_time']:.3f} ± {results['std_time']:.3f}",
        'Samples': results['num_samples']
    })

results_df = pd.DataFrame(results_table)
print(results_df.to_string(index=False))

# Save to CSV for report
results_df.to_csv('../results/evaluation_results.csv', index=False)
print("\n✓ Results saved to results/evaluation_results.csv")


In [None]:
print("\n" + "="*70)
print("STATISTICAL SIGNIFICANCE TESTS")
print("="*70)

# Compare BART vs PEGASUS (both abstractive)
bart_rouge1 = [s['rouge_1_f1'] for s in evaluation_results['BART']['individual_scores']]
peg_rouge1 = [s['rouge_1_f1'] for s in evaluation_results['PEGASUS']['individual_scores']]

sig_test = evaluator.statistical_significance_test(
    bart_rouge1,
    peg_rouge1,
    test_name='paired t-test'
)

print(f"\nBART vs PEGASUS (ROUGE-1):")
print(f"  Test: {sig_test['test_name']}")
print(f"  p-value: {sig_test['p_value']:.6f}")
print(f"  {sig_test['interpretation']}")

In [None]:
fig = plt.figure(figsize=(16, 12))

# Create grid
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. ROUGE Scores Comparison
ax1 = fig.add_subplot(gs[0, :2])
rouge_data = pd.DataFrame({
    'Model': list(evaluation_results.keys()) * 3,
    'Metric': ['ROUGE-1']*3 + ['ROUGE-2']*3 + ['ROUGE-L']*3,
    'Score': [
        evaluation_results['TextRank']['rouge_1_f1_mean'],
        evaluation_results['BART']['rouge_1_f1_mean'],
        evaluation_results['PEGASUS']['rouge_1_f1_mean'],
        evaluation_results['TextRank']['rouge_2_f1_mean'],
        evaluation_results['BART']['rouge_2_f1_mean'],
        evaluation_results['PEGASUS']['rouge_2_f1_mean'],
        evaluation_results['TextRank']['rouge_l_f1_mean'],
        evaluation_results['BART']['rouge_l_f1_mean'],
        evaluation_results['PEGASUS']['rouge_l_f1_mean']
    ]
})

sns.barplot(data=rouge_data, x='Metric', y='Score', hue='Model', ax=ax1)
ax1.set_title('ROUGE Score Comparison', fontsize=14, fontweight='bold')
ax1.set_ylabel('F1 Score')
ax1.set_ylim([0, 0.5])
ax1.legend(title='Model')
ax1.grid(axis='y', alpha=0.3)

# 2. Processing Time
ax2 = fig.add_subplot(gs[0, 2])
times = [evaluation_results[m]['avg_time'] for m in models.keys()]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
ax2.bar(models.keys(), times, color=colors)
ax2.set_title('Processing Time', fontsize=12, fontweight='bold')
ax2.set_ylabel('Time (seconds)')
ax2.grid(axis='y', alpha=0.3)

# 3. ROUGE-1 Distribution
ax3 = fig.add_subplot(gs[1, 0])
for model_name, color in zip(models.keys(), colors):
    rouge1_scores = [s['rouge_1_f1'] for s in evaluation_results[model_name]['individual_scores']]
    ax3.hist(rouge1_scores, alpha=0.6, label=model_name, bins=10, color=color)
ax3.set_title('ROUGE-1 Score Distribution', fontsize=12, fontweight='bold')
ax3.set_xlabel('ROUGE-1 F1 Score')
ax3.set_ylabel('Frequency')
ax3.legend()
ax3.grid(axis='y', alpha=0.3)

# 4. ROUGE-2 Distribution
ax4 = fig.add_subplot(gs[1, 1])
for model_name, color in zip(models.keys(), colors):
    rouge2_scores = [s['rouge_2_f1'] for s in evaluation_results[model_name]['individual_scores']]
    ax4.hist(rouge2_scores, alpha=0.6, label=model_name, bins=10, color=color)
ax4.set_title('ROUGE-2 Score Distribution', fontsize=12, fontweight='bold')
ax4.set_xlabel('ROUGE-2 F1 Score')
ax4.set_ylabel('Frequency')
ax4.legend()
ax4.grid(axis='y', alpha=0.3)

# 5. ROUGE-L Distribution
ax5 = fig.add_subplot(gs[1, 2])
for model_name, color in zip(models.keys(), colors):
    rougel_scores = [s['rouge_l_f1'] for s in evaluation_results[model_name]['individual_scores']]
    ax5.hist(rougel_scores, alpha=0.6, label=model_name, bins=10, color=color)
ax5.set_title('ROUGE-L Score Distribution', fontsize=12, fontweight='bold')
ax5.set_xlabel('ROUGE-L F1 Score')
ax5.set_ylabel('Frequency')
ax5.legend()
ax5.grid(axis='y', alpha=0.3)

# 6. Box Plot Comparison
ax6 = fig.add_subplot(gs[2, :])
box_data = []
for model_name in models.keys():
    rouge1_scores = [s['rouge_1_f1'] for s in evaluation_results[model_name]['individual_scores']]
    for score in rouge1_scores:
        box_data.append({'Model': model_name, 'ROUGE-1': score})

box_df = pd.DataFrame(box_data)
sns.boxplot(data=box_df, x='Model', y='ROUGE-1', ax=ax6, palette=colors)
ax6.set_title('ROUGE-1 Score Distribution (Box Plot)', fontsize=14, fontweight='bold')
ax6.grid(axis='y', alpha=0.3)

plt.savefig('../results/comprehensive_evaluation.png', dpi=300, bbox_inches='tight')
print("\n✓ Comprehensive visualization saved!")
plt.show()

In [None]:
print("\n" + "="*70)
print("EXPORTING RESULTS FOR REPORT")
print("="*70)

# Create comprehensive export
export_data = {
    'evaluation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset': {
        'name': 'CNN/DailyMail',
        'samples_evaluated': len(all_summaries['TextRank']),
        'split': 'test'
    },
    'models': {
        model_name: {
            'type': results_table[i]['Type'],
            'rouge_1': {
                'mean': evaluation_results[model_name]['rouge_1_f1_mean'],
                'std': evaluation_results[model_name]['rouge_1_f1_std']
            },
            'rouge_2': {
                'mean': evaluation_results[model_name]['rouge_2_f1_mean'],
                'std': evaluation_results[model_name]['rouge_2_f1_std']
            },
            'rouge_l': {
                'mean': evaluation_results[model_name]['rouge_l_f1_mean'],
                'std': evaluation_results[model_name]['rouge_l_f1_std']
            },
            'processing_time': {
                'mean': evaluation_results[model_name]['avg_time'],
                'std': evaluation_results[model_name]['std_time']
            }
        }
        for i, model_name in enumerate(models.keys())
    },
    'statistical_tests': {
        'bart_vs_pegasus': sig_test
    }
}

with open('../results/final_evaluation.json', 'w') as f:
    json.dump(export_data, f, indent=2)

print("✓ Exported to results/final_evaluation.json")
print("\nFiles created for your report:")
print("  1. results/evaluation_results.csv - Table for report")
print("  2. results/comprehensive_evaluation.png - Main figure")
print("  3. results/final_evaluation.json - All data")

# Cell 10: Summary for Report
print("\n" + "="*70)
print("KEY FINDINGS FOR YOUR REPORT")
print("="*70)

best_model = max(evaluation_results.keys(), 
                 key=lambda x: evaluation_results[x]['rouge_1_f1_mean'])
fastest_model = min(evaluation_results.keys(),
                    key=lambda x: evaluation_results[x]['avg_time'])

print(f"\n1. Best Overall Performance: {best_model}")
print(f"   - ROUGE-1: {evaluation_results[best_model]['rouge_1_f1_mean']:.4f}")
print(f"   - ROUGE-2: {evaluation_results[best_model]['rouge_2_f1_mean']:.4f}")
print(f"   - ROUGE-L: {evaluation_results[best_model]['rouge_l_f1_mean']:.4f}")

print(f"\n2. Fastest Processing: {fastest_model}")
print(f"   - Avg time: {evaluation_results[fastest_model]['avg_time']:.3f}s")
print(f"   - {evaluation_results[max(evaluation_results.keys(), key=lambda x: evaluation_results[x]['avg_time'])]['avg_time'] / evaluation_results[fastest_model]['avg_time']:.1f}x faster than slowest")

print(f"\n3. Extractive vs Abstractive:")
print(f"   - TextRank (Extractive): ROUGE-1 = {evaluation_results['TextRank']['rouge_1_f1_mean']:.4f}")
print(f"   - BART (Abstractive): ROUGE-1 = {evaluation_results['BART']['rouge_1_f1_mean']:.4f}")
print(f"   - PEGASUS (Abstractive): ROUGE-1 = {evaluation_results['PEGASUS']['rouge_1_f1_mean']:.4f}")
print(f"   - Abstractive models outperform extractive by {(evaluation_results[best_model]['rouge_1_f1_mean'] / evaluation_results['TextRank']['rouge_1_f1_mean'] - 1) * 100:.1f}%")

print("\n" + "="*70)
print("✓ Evaluation complete! Use these results in your report.")
print("="*70)