# Deep Evaluation and Statistical Analysis
In this notebook we generate comprehensive evaluation tables and carry out additional statistical analyses.
We include all four similarity metrics (difflib, embedding cosine, BLEU, and Jaccard) and report:
   - Mean, standard deviation, median, 25th and 75th percentiles per model
   - Aggregated tables by model class (e.g., Small, Medium, Large, Standard)
   - Additional analyses such as correlation heatmaps, pair plots, and regression analyses.

These results help reveal which types of models (by size or architecture) are more robust to misspellings.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
import scipy.stats as stats
import re

# Load scored outputs
df = pd.read_csv('data/model_outputs_scored.csv')
print("Loaded scored outputs:", df.shape)

# Add word count if not present.
# Whitespace-separated tokens are counted per question. Missing questions
# (NaN, e.g. empty CSV cells) are treated as empty strings and get a count
# of 0 instead of raising AttributeError on `.split()`.
if 'word_count' not in df.columns:
    df['word_count'] = (
        df['variant_question']
        .fillna('')
        .astype(str)
        .str.split()
        .str.len()
    )

# Classify models by type based on their name (this classification can be extended)
def classify_model(model_name):
    """Return the size class implied by a model's name.

    The lower-cased name is searched for the substrings "large", "medium"
    and "small", in that order; the first one found decides the class.
    Names containing none of them are classed as "Standard".
    """
    lowered = model_name.lower()
    # Order matters: earlier keywords win when a name contains several.
    keyword_to_class = (
        ("large", "Large"),
        ("medium", "Medium"),
        ("small", "Small"),
    )
    for keyword, class_label in keyword_to_class:
        if keyword in lowered:
            return class_label
    return "Standard"

# Tag every row with the size class derived from its model name
df['model_class'] = df['model_name'].map(classify_model)

# Numeric columns of interest: error count and word count, followed by
# the four similarity scores (jaccard_score being the newest metric)
numeric_cols = [
    'error_count',
    'word_count',
    'difflib_score',
    'embedding_score',
    'bleu_score',
    'jaccard_score',
]


## Comprehensive Evaluation Tables
Compute overall descriptive statistics per model and per model class for all similarity metrics.
The following tables report the mean, standard deviation, median, and 25th and 75th percentiles.

In [None]:
# Similarity metrics summarised in both evaluation tables
SCORE_COLS = ['difflib_score', 'embedding_score', 'bleu_score', 'jaccard_score']


def summarize_scores(data, group_col, score_cols=SCORE_COLS):
    """Build a flat table of descriptive statistics per group.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``group_col`` and every column in ``score_cols``.
    group_col : str
        Column to group by (e.g. 'model_name' or 'model_class').
    score_cols : sequence of str
        Score columns to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per group. The first column is ``group_col``; the remaining
        columns are named ``<metric>_<stat>`` where ``<metric>`` is the score
        column name without its ``_score`` suffix and ``<stat>`` is one of
        mean, std, median, q1 (25th percentile), q3 (75th percentile).
    """
    # Series.quantile skips NaN, matching how mean/std/median treat missing
    # scores. np.percentile would return NaN for any group containing a NaN.
    def q1(scores):
        return scores.quantile(0.25)

    def q3(scores):
        return scores.quantile(0.75)

    table = data.groupby(group_col)[list(score_cols)].agg(['mean', 'std', 'median', q1, q3])

    # Flatten the (metric, statistic) column pairs into single names derived
    # from the actual columns rather than a hand-maintained positional list.
    suffix = '_score'
    table.columns = [
        f"{metric[:-len(suffix)] if metric.endswith(suffix) else metric}_{stat}"
        for metric, stat in table.columns
    ]
    return table.reset_index()


# Evaluation table by model
summary_by_model = summarize_scores(df, 'model_name')
print("Evaluation Table by Model:")
print(summary_by_model.head())
summary_by_model.to_csv('data/evaluation_by_model.csv', index=False)
print("Saved evaluation table by model at data/evaluation_by_model.csv")

# %%
# Evaluation table by model class
summary_by_class = summarize_scores(df, 'model_class')
print("Evaluation Table by Model Class:")
print(summary_by_class.head())
summary_by_class.to_csv('data/evaluation_by_model_class.csv', index=False)
print("Saved evaluation table by model class at data/evaluation_by_model_class.csv")


## Additional Statistical Analyses

Generate a correlation heatmap and a pair plot, then perform a regression analysis for a selected model.


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_matrix = df[numeric_cols].corr()
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap: Evaluation Metrics")
fig.tight_layout()
plt.show()

# %%
# Scatter-matrix view of error_count, word_count, and the similarity metrics
pair_data = df[numeric_cols]
sns.pairplot(pair_data)
# The pair plot's figure is the current one; put the title just above the grid
plt.gcf().suptitle("Pair Plot of Evaluation Metrics", y=1.02)
plt.show()

# %%
# Example regression analysis: effect of error_count on difflib_score for a selected model
selected_model = "gpt2"

# Keep only rows where BOTH variables are present so that the OLS fit and the
# plot use the same observations (sm.OLS does not drop missing values by default).
df_selected = df[df['model_name'] == selected_model].dropna(subset=['error_count', 'difflib_score'])

# Fail early with an actionable message instead of a cryptic error from inside
# the OLS fit when the model name is absent or has too few usable rows.
if len(df_selected) < 2:
    available_models = sorted(df['model_name'].dropna().astype(str).unique())
    raise ValueError(
        f"Need at least 2 complete rows for model {selected_model!r} to fit a regression, "
        f"found {len(df_selected)}. Available models: {available_models}"
    )

X = df_selected['error_count']
y = df_selected['difflib_score']
X = sm.add_constant(X)  # add the intercept term
reg_model = sm.OLS(y, X).fit()
print(reg_model.summary())

plt.figure(figsize=(8, 6))
sns.regplot(x='error_count', y='difflib_score', data=df_selected, ci=95, scatter_kws={'alpha': 0.5})
plt.title(f"Regression Analysis: Difflib Similarity vs. Error Count for {selected_model}")
plt.xlabel("Error Count")
plt.ylabel("Difflib Similarity (0-100)")
plt.tight_layout()
plt.show()


## Pairwise Model Comparisons

(Optional) Conduct pairwise t-tests between model classes or individual models across error levels to verify significance.
(This section can be expanded with further statistical tests as needed.)