# VLM Model Comparison for Caption Generation

This notebook compares the evaluation metrics of three Vision-Language Models (VLMs) — `blip`, `git-base`, and `swin-tiny` — for caption generation on the F1 dataset.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn grid theme and a wide default figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Metrics

In [None]:
# Models to compare; each name is also the sub-directory of results/ that
# holds that model's metrics.json.
models = ['blip', 'git-base', 'swin-tiny']

# Load the metrics file of every model into a dict keyed by model name.
# A missing or malformed file raises immediately so that the comparison is
# never silently built from a partial set of models.
metrics_data = {}
for model in models:
    # JSON is UTF-8 by specification; passing the encoding explicitly keeps the
    # read independent of the platform's locale default (e.g. cp1252 on Windows).
    with open(f'results/{model}/metrics.json', 'r', encoding='utf-8') as f:
        metrics_data[model] = json.load(f)

print("Metrics loaded for:", list(metrics_data.keys()))

## 2. Comparison Table

### Available Metrics:

- **BLEU-1 to BLEU-4**: Measure n-gram precision (individual words, pairs, triplets, quadruplets). Higher values indicate better similarity with ground truth.
- **METEOR**: Matches words by stem and synonym in addition to exact form, which makes it more robust than BLEU to paraphrasing. Range [0, 1].
- **ROUGE-L**: Measures the longest common subsequence. Useful for evaluating structural coherence of text.

In [None]:
# Build the comparison table. pd.DataFrame(metrics_data) yields one column per
# model, so transposing gives one row per model and one column per metric.
# NOTE(review): assumes each metrics.json is a flat {metric name: score}
# mapping with numeric values — confirm against the evaluation script.
df = pd.DataFrame(metrics_data).T
df.index.name = 'Model'

# Plain-text rendering of the table, rounded to 4 decimals
print("\n" + "="*70)
print("METRICS COMPARISON")
print("="*70 + "\n")
print(df.round(4))
print("\n" + "="*70)

# Report the top-scoring model for each metric (the highest value wins)
print("\nBEST MODEL PER METRIC:")
print("-"*70)
for metric in df.columns:
    best_model = df[metric].idxmax()
    best_score = df[metric].max()
    # \u2192 is the right-arrow character. The original literal had been
    # corrupted into mojibake by a UTF-8 -> cp1252 round trip; the escape keeps
    # the source ASCII-only so it cannot be mangled the same way again.
    print(f"{metric:<12} \u2192 {best_model.upper():<12} ({best_score:.4f})")

# Last expression of the cell: rich display of the full-precision table
df

## 3. Comparative Visualization

In [None]:
# One figure with two side-by-side grouped bar charts (one group per model):
# the BLEU family on the left, METEOR and ROUGE-L on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bleu_ax, other_ax = axes

# Left panel: every metric column whose name contains 'BLEU'
bleu_cols = [name for name in df.columns if 'BLEU' in name]
df[bleu_cols].plot(kind='bar', ax=bleu_ax, width=0.8)
bleu_ax.set_title('BLEU Scores (N-gram Precision)', fontsize=12, fontweight='bold')

# Right panel: the two remaining metrics, drawn in fixed colours
other_cols = ['METEOR', 'ROUGE-L']
df[other_cols].plot(kind='bar', ax=other_ax, width=0.7, color=['#2ecc71', '#e74c3c'])
other_ax.set_title('METEOR & ROUGE-L Scores', fontsize=12, fontweight='bold')

# Axis labels, legend, tick labels and grid are the same for both panels
for panel in axes:
    panel.set_xlabel('Model', fontsize=11)
    panel.set_ylabel('Score', fontsize=11)
    panel.legend(title='Metric', fontsize=9)
    # Show the model names horizontally rather than at pandas' default rotation
    panel.set_xticklabels(df.index, rotation=0)
    panel.grid(axis='y', alpha=0.3)

# Save the figure to disk before showing it
plt.tight_layout()
plt.savefig('results/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nChart saved to: results/model_comparison.png")

## 4. Summary Analysis

In [None]:
# Scale every metric column by its maximum so the best model on that metric
# scores 1.0. Columns whose maximum is not positive are left unscaled, which
# also avoids dividing by zero.
df_normalized = df.copy()
for metric_name in df_normalized.columns:
    column_peak = df_normalized[metric_name].max()
    if not column_peak > 0:
        continue
    df_normalized[metric_name] = df_normalized[metric_name] / column_peak

# Row-wise mean of the scaled metrics. The right-hand side is evaluated before
# the 'Average' column is attached, so the mean covers only the metric columns.
df_normalized['Average'] = df_normalized.mean(axis=1)

print("\nAVERAGE PERFORMANCE (normalized):")
print("-"*40)
for model_name, avg_score in df_normalized['Average'].items():
    print(f"{model_name.upper():<12}: {avg_score:.4f} ({avg_score*100:.1f}%)")

# The overall winner is the model with the highest normalized average
best_overall = df_normalized['Average'].idxmax()
print(f"\nBest overall model: {best_overall.upper()}")