# ANLI - Model Evaluation

This notebook performs a comprehensive evaluation of the trained model on the ANLI test set:
- Test set performance
- Confusion matrices
- Error analysis
- Performance by text length
- Sample predictions

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from data_loader import ANLIDataLoader
from models import TransformerNLI
from evaluator import ModelEvaluator

In [None]:
# Load the dataset splits through the project's ANLIDataLoader. Only the test
# split is used by the cells in this notebook; train_df and dev_df are simply
# kept in the namespace.
loader = ANLIDataLoader()
train_df, dev_df, test_df = loader.load_data()

# Premise and hypothesis texts for the test split; the inference cell slices
# both with the same indices, so they are expected to be aligned row by row.
test_premises, test_hypotheses = loader.prepare_text_pairs(test_df)
# Gold labels from the 'label' column (presumably integer class ids that are
# valid keys of loader.label_map -- confirm against data_loader).
test_labels = test_df['label'].values

print(f"Test samples: {len(test_labels)}")

In [None]:
# Load the trained checkpoint stored under ../models/deberta-anli/final.
# NOTE(review): the variable name and the log message say "BERT", but the
# checkpoint path suggests a DeBERTa model -- confirm and rename if so.
# num_labels=3 presumably corresponds to the three NLI classes -- TODO confirm.
print("Loading trained BERT model...")
bert_model = TransformerNLI(model_name='../models/deberta-anli/final', num_labels=3)

In [None]:
# Run the model over the test pairs in small batches and collect the
# predicted class ids in `test_preds`.
import gc

from tqdm import tqdm

# Free what earlier cells left behind. Collect Python garbage first so that
# the GPU blocks it frees can be handed back by empty_cache() as well.
gc.collect()
torch.cuda.empty_cache()


def _predict_in_batches(nli, premises, hypotheses, batch_size, max_length):
    """Return the predicted class id for every (premise, hypothesis) pair.

    Args:
        nli: Model wrapper exposing ``model``, ``device`` and
            ``tokenize_data(premises, hypotheses, max_length=...)``; the
            tokenizer output must provide 'input_ids' and 'attention_mask'
            tensors.
        premises: Sliceable sequence of premise texts.
        hypotheses: Sliceable sequence of hypothesis texts, aligned with
            ``premises``.
        batch_size: Number of pairs sent through the model per forward pass.
        max_length: Value forwarded to ``tokenize_data``.

    Returns:
        A list of Python ints, one per input pair, in input order.
    """
    preds = []
    nli.model.eval()
    # No gradients are needed for evaluation; skipping them keeps the
    # per-batch memory footprint down.
    with torch.no_grad():
        for start in tqdm(range(0, len(premises), batch_size)):
            stop = start + batch_size
            encodings = nli.tokenize_data(
                premises[start:stop],
                hypotheses[start:stop],
                max_length=max_length,
            )
            logits = nli.model(
                input_ids=encodings['input_ids'].to(nli.device),
                attention_mask=encodings['attention_mask'].to(nli.device),
            ).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            # Per-batch tensors are released when these names are rebound on
            # the next iteration. Calling empty_cache() here would only force
            # the allocator to give memory back and request it again on every
            # batch, which slows inference without lowering peak usage.
    return preds


print("\nMaking predictions in small batches:")

batch_size = 8  # Very small, to fit in limited GPU memory
# NOTE(review): 64 tokens is short for premise + hypothesis pairs. If
# tokenize_data truncates, longer pairs lose context -- confirm this matches
# the length used when the model was trained.
max_length = 64

all_preds = _predict_in_batches(
    bert_model, test_premises, test_hypotheses, batch_size, max_length
)

# The helper's tensors are gone once it returns; release the cached blocks once.
torch.cuda.empty_cache()

test_preds = np.array(all_preds)
print("✓ Predictions complete")

## Overall Performance Metrics

In [None]:
# Compute the evaluator's metrics for the test predictions and print the
# headline numbers followed by a per-class breakdown.
print("TEST SET EVALUATION:")

evaluator = ModelEvaluator()
metrics = evaluator.compute_metrics(test_labels, test_preds)

print(f"\nOverall Accuracy: {metrics['accuracy']:.4f}")
print(f"Macro F1-Score: {metrics['macro_f1']:.4f}")

print("\nPer-Class Metrics:")
print("-" * 50)
for cls, scores in metrics['per_class'].items():
    print(f"\n{cls.capitalize()}:")
    # The three ratio metrics share one format; support is printed as-is.
    for title, key in (("Precision", 'precision'), ("Recall", 'recall'), ("F1-Score", 'f1')):
        print(f"  {title}: {scores[key]:.4f}")
    print(f"  Support: {scores['support']}")

In [None]:
# Detailed classification report for the test predictions, produced by the
# project's ModelEvaluator (output format is defined in the evaluator module).
evaluator.print_classification_report(test_labels, test_preds)

## Confusion Matrix Analysis

In [None]:
# Confusion matrix without normalization, saved under the plots directory.
raw_cm_path = '../results/plots/confusion_matrix.png'
evaluator.plot_confusion_matrix(test_labels, test_preds, save_path=raw_cm_path, normalize=False)

In [None]:
# Same plot with normalize=True, saved next to the unnormalized one.
normalized_cm_path = '../results/plots/confusion_matrix_normalized.png'
evaluator.plot_confusion_matrix(test_labels, test_preds, save_path=normalized_cm_path, normalize=True)

## Error Analysis

In [None]:
# Gather the rows the evaluator reports as errors, then print the overall
# error rate and how often each error type occurs.
errors_df = evaluator.analyze_errors(test_df, test_labels, test_preds)

n_errors, n_total = len(errors_df), len(test_labels)
print(f"\nTotal errors: {n_errors} out of {n_total}")
print(f"Error rate: {n_errors / n_total:.2%}")

print("\nError patterns:")
print(errors_df['error_type'].value_counts())

In [None]:
# Visualize error types as a horizontal bar chart and save it to disk.
from pathlib import Path

error_counts = errors_df['error_type'].value_counts()

# savefig does not create missing directories, so make sure the target exists
# before plotting; otherwise the cell fails on a fresh checkout.
error_plot_path = Path('../results/plots/error_types.png')
error_plot_path.parent.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10, 6))
plt.barh(error_counts.index, error_counts.values, color='coral')
plt.xlabel('Count', fontsize=12)
plt.title('Error Type Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(error_plot_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Print two example rows for each of the three most frequent error types.
print("SAMPLE ERRORS:")

divider = "-" * 50
top_error_types = errors_df['error_type'].value_counts().index[:3]
for error_type in top_error_types:
    print(f"\n{error_type}:")
    print(divider)
    matching = errors_df.loc[errors_df['error_type'] == error_type]
    for _, row in matching.iloc[:2].iterrows():
        print(f"\nPremise: {row['premise']}")
        print(f"Hypothesis: {row['hypothesis']}")
        # Translate the label ids into their names via the loader's label map.
        true_name = loader.label_map[row['label']]
        predicted_name = loader.label_map[row['predicted']]
        print(f"True: {true_name} | Predicted: {predicted_name}")
        print(divider)

## Performance by Text Length

In [None]:
# Ask the evaluator for accuracy grouped by premise and by hypothesis length,
# then print both tables.
print("PERFORMANCE BY TEXT LENGTH:")

acc_by_premise, acc_by_hypothesis = evaluator.analyze_by_length(test_df, test_labels, test_preds)

for field, table in (("premise", acc_by_premise), ("hypothesis", acc_by_hypothesis)):
    print(f"\nAccuracy by {field} length:")
    print(table)

In [None]:
# Side-by-side bar charts of accuracy by premise length and by hypothesis
# length, saved as a single figure.
from pathlib import Path

# savefig does not create missing directories, so make sure the target exists.
length_plot_path = Path('../results/plots/accuracy_by_length.png')
length_plot_path.parent.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Both panels are styled identically; only data, wording and colour differ.
panels = (
    (axes[0], acc_by_premise, 'Premise', 'steelblue'),
    (axes[1], acc_by_hypothesis, 'Hypothesis', 'coral'),
)
for ax, accuracy, field, colour in panels:
    accuracy.plot(kind='bar', ax=ax, color=colour)
    ax.set_title(f'Accuracy by {field} Length', fontsize=14, fontweight='bold')
    ax.set_ylabel('Accuracy')
    ax.set_xlabel(f'{field} Length (words)')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(length_plot_path, dpi=300, bbox_inches='tight')
plt.show()

## Save Results

In [None]:
# Save the headline metrics as a two-column CSV (metric, value).
from pathlib import Path

# The averaged precision/recall are unweighted means over the per-class entries.
per_class_scores = list(metrics['per_class'].values())
results_summary = {
    'metric': ['Accuracy', 'Macro F1', 'Precision (avg)', 'Recall (avg)'],
    'value': [
        metrics['accuracy'],
        metrics['macro_f1'],
        np.mean([scores['precision'] for scores in per_class_scores]),
        np.mean([scores['recall'] for scores in per_class_scores]),
    ],
}

# to_csv raises when the parent directory is missing, so create it first.
metrics_path = Path('../results/metrics/test_results.csv')
metrics_path.parent.mkdir(parents=True, exist_ok=True)

results_df = pd.DataFrame(results_summary)
results_df.to_csv(metrics_path, index=False)

print("✓ Test results saved to ../results/metrics/test_results.csv")

In [None]:
# Save the test rows together with the predicted label and a correctness flag.
from pathlib import Path

predictions_df = test_df.copy()  # copy so test_df itself is not modified
predictions_df['predicted_label'] = test_preds
predictions_df['correct'] = test_labels == test_preds

# to_csv raises when the parent directory is missing, so create it first.
predictions_path = Path('../results/metrics/test_predictions.csv')
predictions_path.parent.mkdir(parents=True, exist_ok=True)
predictions_df.to_csv(predictions_path, index=False)

print("✓ Detailed predictions saved to ../results/metrics/test_predictions.csv")

In [None]:
# Closing recap of the two headline test metrics and the output location.
for title, key in (("\nFinal Test Accuracy", 'accuracy'), ("Final Macro F1", 'macro_f1')):
    print(f"{title}: {metrics[key]:.4f}")
print("\nAll results saved to ../results/")