# Results Analysis and Visualization

This notebook loads the model outputs, computes multiple similarity metrics by comparing each model's baseline output
(error_count==0) with outputs for perturbed variants, and produces extensive visualizations.

The similarity metrics computed are:
   - Difflib similarity
   - Cosine similarity based on sentence embeddings
   - BLEU score
   - **Jaccard similarity** (new metric based on token overlap)

Visualizations include per-model boxplots, histograms, and density plots; global scatter plots of similarity
against word count (colored by error count); global line plots of mean/median performance; and additional distribution analyses.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set(style="whitegrid")
import warnings
warnings.filterwarnings("ignore")

from src.score_metrics import difflib_similarity, embedding_similarity, bleu_score, jaccard_similarity

# Load model outputs
df_outputs = pd.read_csv('data/model_outputs.csv')
print("Loaded model outputs:", df_outputs.shape)

# Add word count (number of whitespace-separated words in the variant question).
# pd.read_csv loads blank cells as NaN (a float, which has no .split()), so a
# missing question is counted as zero words instead of raising AttributeError.
df_outputs['word_count'] = df_outputs['variant_question'].apply(
    lambda question: 0 if pd.isna(question) else len(question.split())
)

# Score column -> metric function (imported from src.score_metrics) that fills it.
# Each function is called as metric(baseline_text, variant_text).
scorers = {
    'difflib_score': difflib_similarity,
    'embedding_score': embedding_similarity,
    'bleu_score': bleu_score,
    'jaccard_score': jaccard_similarity,
}

# Initialize similarity score columns; rows that cannot be scored keep NaN so
# the plotting cells (which call .dropna()) simply ignore them.
for score_column in scorers:
    df_outputs[score_column] = np.nan

# For each (original_question, model_name) group, take the first row with
# error_count == 0 as the baseline and score every row of the group against it
# (the baseline row itself included).
for _, group in df_outputs.groupby(['original_question', 'model_name']):
    baseline_rows = group[group['error_count'] == 0]
    if baseline_rows.empty:
        # No unperturbed output to compare against: leave the group unscored.
        continue
    baseline_text = baseline_rows.iloc[0]['model_output']
    if pd.isna(baseline_text):
        # A blank baseline output was loaded as NaN; nothing to compare against.
        continue
    for idx, variant_text in group['model_output'].items():
        if pd.isna(variant_text):
            # Blank model output: keep the NaN scores rather than passing a
            # float into the text-similarity functions.
            continue
        for score_column, scorer in scorers.items():
            df_outputs.at[idx, score_column] = scorer(baseline_text, variant_text)

# Save the scored outputs for further evaluation
df_outputs.to_csv('data/model_outputs_scored.csv', index=False)
print("Scored outputs saved at data/model_outputs_scored.csv")


## Detailed Per-Model Visualizations
For each model and for each similarity metric, we generate:
   - Boxplots by error count
   - Histograms and density plots for each error level

Scatter plots of similarity versus sentence word count are drawn once across all models in the global section below.


In [None]:
# Models present in the data (in order of first appearance) and the score
# columns to visualize; both names are reused by the global plots further down.
models = df_outputs['model_name'].unique()
metrics = ['difflib_score', 'embedding_score', 'bleu_score', 'jaccard_score']

for metric in metrics:
    for model in models:
        model_data = df_outputs[df_outputs['model_name'] == model]

        # Boxplot by error count
        plt.figure(figsize=(8, 6))
        sns.boxplot(x='error_count', y=metric, data=model_data)
        plt.title(f"{model} - {metric} by Error Count")
        plt.xlabel("Number of Misspellings")
        plt.ylabel(f"{metric} (0-100)")
        plt.tight_layout()
        plt.show()

        # Histogram and density plot for each error_count
        error_levels = sorted(model_data['error_count'].unique())
        for error in error_levels:
            plt.figure(figsize=(8, 5))
            # Unscored rows hold NaN and are dropped before plotting.
            subset = model_data[model_data['error_count'] == error][metric].dropna()
            plt.hist(subset, bins=30, alpha=0.6, density=True, label="Histogram")
            # A kernel density estimate needs at least two distinct values.
            # For an empty or constant subset (e.g. every row at this error
            # level received the same score) seaborn either raises or skips the
            # curve with a warning, depending on its version, so only draw the
            # KDE when it is well defined.
            if subset.nunique() > 1:
                sns.kdeplot(subset, label="Density", color="orange")
            plt.title(f"{model} - {metric} (Error Count = {error})")
            plt.xlabel(f"{metric} (0-100)")
            plt.ylabel("Density")
            plt.legend()
            plt.tight_layout()
            plt.show()

## Global Comparative Visualizations
Here we aggregate similarity scores across models and plot global comparisons including:
   - Line plots of mean and median metrics across error counts
   - Scatter plots of similarity versus word count (grouped by error count)
   - Global distribution histograms and density plots for each metric


In [None]:
# Mean and median of every similarity score per (model_name, error_count).
# Named aggregation yields the flat "<metric>_<stat>" column names directly,
# ordered metric by metric with the mean before the median.
summary_spec = {
    f"{name}_{stat}": (f"{name}_score", stat)
    for name in ('difflib', 'embedding', 'bleu', 'jaccard')
    for stat in ('mean', 'median')
}
agg_data = (
    df_outputs
    .groupby(['model_name', 'error_count'])
    .agg(**summary_spec)
    .reset_index()
)
print(agg_data.head())

# Line plots comparing all models: one figure per (statistic, metric) with one
# line per model across error counts. The mean figures come first, followed by
# the median figures, so the median columns computed in agg_data are shown too.
for stat, stat_label in (('mean', 'Average'), ('median', 'Median')):
    for metric in [f"{name}_{stat}" for name in ('difflib', 'embedding', 'bleu', 'jaccard')]:
        # "difflib_mean" -> "Difflib": human-readable metric name for labels.
        metric_label = metric.split('_')[0].title()
        plt.figure(figsize=(10, 6))
        for model in models:
            subset = agg_data[agg_data['model_name'] == model]
            plt.plot(subset['error_count'], subset[metric], marker='o', label=model)
        plt.xlabel("Number of Misspellings")
        plt.ylabel(f"{stat_label} {metric_label} Score")
        plt.title(f"Global Comparison: {stat_label} {metric_label} Score by Error Count")
        # Legend is placed outside the axes so it does not cover the lines.
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
        plt.tight_layout()
        plt.show()

# Similarity versus question length across all models: one figure per score
# column, with the point colour encoding the error count.
for metric in metrics:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=df_outputs,
        x='word_count',
        y=metric,
        hue='error_count',
        palette="viridis",
        alpha=0.7,
        ax=ax,
    )
    ax.set_title(f"{metric} vs. Sentence Word Count (Grouped by Error Count)")
    ax.set_xlabel("Word Count")
    ax.set_ylabel(f"{metric} (0-100)")
    fig.tight_layout()
    plt.show()

# Overall distribution of each score column over every model and error count:
# a normalized histogram with a kernel density estimate drawn on the same axes.
for metric in metrics:
    subset = df_outputs[metric].dropna()  # unscored rows hold NaN
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(subset, bins=40, alpha=0.5, density=True, label="Histogram")
    sns.kdeplot(subset, label="Density", color="red", ax=ax)
    ax.set_title(f"Global Distribution of {metric}")
    ax.set_xlabel(f"{metric} (0-100)")
    ax.set_ylabel("Density")
    ax.legend()
    fig.tight_layout()
    plt.show()