# Model Evaluation and Analysis
Detailed evaluation of the trained DistilBERT mood classifier: saved test metrics, sample inference, label distribution, and error analysis.

In [None]:
import json
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## Load Test Metrics

In [None]:
# Read the metrics JSON stored under ../models
with open('../models/test_metrics.json', 'r') as metrics_file:
    metrics = json.load(metrics_file)

# Aggregate scores first
print('Overall Test Metrics:')
print(f"Loss: {metrics['test_loss']:.4f}")
print(f"F1 Micro: {metrics['test_f1_micro']:.4f}")
print(f"F1 Macro: {metrics['test_f1_macro']:.4f}")

# Then precision / recall / F1 for each mood, in the order stored in the file
print('\nPer-Mood Metrics:')
per_mood = metrics['per_mood_metrics']
for mood in per_mood:
    mood_metrics = per_mood[mood]
    print(f"{mood}:")
    print(f"  Precision: {mood_metrics['precision']:.4f}")
    print(f"  Recall: {mood_metrics['recall']:.4f}")
    print(f"  F1: {mood_metrics['f1']:.4f}")

## Visualize Per-Mood Performance

In [None]:
# Pull the per-mood scores into parallel lists, one entry per mood
per_mood = metrics['per_mood_metrics']
moods = list(per_mood)
precisions = [per_mood[name]['precision'] for name in moods]
recalls = [per_mood[name]['recall'] for name in moods]
f1s = [per_mood[name]['f1'] for name in moods]

# Each mood gets a group of three bars, each `width` wide, centred on x
width = 0.25
x = np.arange(len(moods))

fig, ax = plt.subplots(figsize=(12, 6))
bar_series = [
    (x - width, precisions, 'Precision', 'steelblue'),
    (x, recalls, 'Recall', 'orange'),
    (x + width, f1s, 'F1', 'green'),
]
for positions, scores, series_label, series_color in bar_series:
    ax.bar(positions, scores, width, label=series_label, color=series_color)

# Axis labelling and cosmetics
ax.set_title('Per-Mood Performance Metrics')
ax.set_xlabel('Mood')
ax.set_ylabel('Score')
ax.set_xticks(x)
ax.set_xticklabels(moods, rotation=45)
ax.grid(axis='y', alpha=0.3)
ax.legend()

# Save the figure to disk before displaying it
plt.tight_layout()
plt.savefig('../models/per_mood_performance.png', dpi=300, bbox_inches='tight')
plt.show()

## Load Model for Inference Testing

In [None]:
# Directory holding the fine-tuned checkpoint (tokenizer + model weights)
MODEL_PATH = '../models/distilbert-mood-classifier'

# Mood names, paired positionally with the model's output scores by predict_mood
# NOTE(review): presumably this matches the label order used at training time - confirm
MOODS = [
    'work',
    'date',
    'quick_bite',
    'budget',
    'family',
    'late_night',
    'celebration',
]

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)

# Load the classifier, move it to the selected device, and switch to eval mode
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model = model.to(device)
model.eval()

print('Model loaded for inference testing')

## Test Inference Function

In [None]:
def predict_mood(text: str, threshold: float = 0.5):
    """Score a review against every mood and pick the moods above a threshold.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``MOODS`` objects being defined before it is called.

    Args:
        text: Review text to classify. It is truncated/padded to 128 tokens.
        threshold: A mood is reported as predicted when its sigmoid
            probability is strictly greater than this value.

    Returns:
        A ``(mood_probs, predicted_moods)`` tuple. ``mood_probs`` maps each
        name in ``MOODS`` to its probability as a Python float;
        ``predicted_moods`` lists the names whose probability exceeds
        ``threshold``, in ``MOODS`` order.

    Raises:
        ValueError: If the model emits a different number of scores than
            there are names in ``MOODS``.
    """
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Sigmoid is applied to each logit independently (multi-label scoring);
        # [0] selects the single review in the batch
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    # zip() stops at the shorter input, so a mismatch between the model's
    # output size and MOODS would silently drop or mislabel scores.
    if len(probs) != len(MOODS):
        raise ValueError(
            f'Model produced {len(probs)} scores but MOODS has '
            f'{len(MOODS)} entries; check that MOODS matches the model labels'
        )

    mood_probs = {mood: float(prob) for mood, prob in zip(MOODS, probs)}
    predicted_moods = [mood for mood, prob in mood_probs.items() if prob > threshold]

    return mood_probs, predicted_moods

## Test on Sample Reviews

In [None]:
# Hand-written sample reviews; each appears to target one of the moods in MOODS
test_reviews = [
    "Perfect spot to work on my laptop. Great WiFi and plenty of power outlets. Coffee is excellent too.",
    "Most romantic dinner I've ever had. Dim lighting, intimate atmosphere, and the service was impeccable.",
    "Quick lunch spot. Got my sandwich in 3 minutes. Perfect for busy workdays.",
    "Amazing value! Huge portions for the price. Very affordable and tasty.",
    "Brought the whole family. Kids menu was great and staff were very accommodating with the little ones.",
    "Open until 2 AM! Perfect for late night cravings after a concert.",
    "Celebrated my birthday here. Fancy atmosphere and they brought out a special dessert with candles."
]

print('Testing model on sample reviews:\n')
for i, review in enumerate(test_reviews, start=1):
    print(f"Review {i}: {review}")
    mood_probs, predicted = predict_mood(review)

    print(f"Predicted moods: {', '.join(predicted) if predicted else 'None'}")

    # Rank every mood by probability, highest first, and show the top three
    print("Top 3 probabilities:")
    ranked = list(mood_probs.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    for mood, prob in ranked[:3]:
        print(f"  {mood}: {prob:.3f}")
    print("-" * 80)

## Analyze Label Distribution

In [None]:
# Load labeled data. The file holds free-form review text, so decode it as
# UTF-8 explicitly instead of relying on the platform's default encoding
# (which is not UTF-8 on every system).
with open('../data/labeled/labeled_reviews.json', 'r', encoding='utf-8') as f:
    labeled_data = json.load(f)

print(f'Total labeled reviews: {len(labeled_data)}')

# Count mood occurrences across all reviews
all_moods = [mood for review in labeled_data for mood in review['moods']]
mood_counts = Counter(all_moods)

# most_common() already returns (mood, count) pairs sorted by descending
# count, so the same list drives both the printed table and the bar chart.
moods_sorted = mood_counts.most_common()

print('\nMood distribution in training data:')
for mood, count in moods_sorted:
    # Share of reviews carrying this mood; shares can sum to more than 100%
    # because a review may list several moods
    percentage = (count / len(labeled_data)) * 100
    print(f'{mood}: {count} ({percentage:.1f}%)')

# Visualize
plt.figure(figsize=(10, 6))
moods_names = [name for name, _ in moods_sorted]
moods_values = [value for _, value in moods_sorted]

plt.bar(moods_names, moods_values, color='steelblue')
plt.xlabel('Mood')
plt.ylabel('Count')
plt.title('Mood Distribution in Labeled Dataset')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../models/mood_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## Multi-label Statistics

In [None]:
# How many moods does each review carry, and how often does each count occur?
label_counts = [len(review['moods']) for review in labeled_data]
label_count_dist = Counter(label_counts)
total_reviews = len(labeled_data)

print('Number of labels per review:')
for num_labels in sorted(label_count_dist):
    count = label_count_dist[num_labels]
    percentage = (count / total_reviews) * 100
    print(f'{num_labels} labels: {count} reviews ({percentage:.1f}%)')

avg_labels = np.mean(label_counts)
print(f'\nAverage labels per review: {avg_labels:.2f}')

# Bar chart: one bar per distinct label count
bar_positions = list(label_count_dist.keys())
bar_heights = list(label_count_dist.values())

plt.figure(figsize=(8, 6))
plt.bar(bar_positions, bar_heights, color='coral')
plt.title('Distribution of Label Counts')
plt.xlabel('Number of Moods per Review')
plt.ylabel('Frequency')
plt.xticks(bar_positions)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('../models/label_count_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## Error Analysis

In [None]:
# Find reviews where the predicted mood set overlaps poorly with the labels.
# The score is the Jaccard index: |true & predicted| / |true | predicted|.
# It is stored and printed under the existing 'accuracy' name.
problematic_reviews = []

for review in labeled_data[:50]:  # Sample first 50
    text = review['review_text']
    true_moods = set(review['moods'])

    mood_probs, predicted_moods = predict_mood(text)
    predicted_set = set(predicted_moods)

    # Calculate overlap
    overlap = len(true_moods & predicted_set)
    total = len(true_moods | predicted_set)

    # An empty union means the review has no labeled moods and the model
    # predicted none: that is a perfect match, so score it 1.0 rather than
    # flagging it as a failure.
    accuracy = overlap / total if total > 0 else 1.0

    if accuracy < 0.5:  # Low overlap
        # Only mark the preview with an ellipsis when it was actually cut
        preview = text[:100] + '...' if len(text) > 100 else text
        problematic_reviews.append({
            'text': preview,
            # Sorted so the printed lists are stable between runs (set
            # iteration order is not); assumes mood labels are strings
            'true': sorted(true_moods),
            'predicted': sorted(predicted_set),
            'accuracy': accuracy
        })

print(f'Found {len(problematic_reviews)} problematic reviews (accuracy < 0.5):\n')
for i, rev in enumerate(problematic_reviews[:5], 1):
    print(f"{i}. {rev['text']}")
    print(f"   True: {rev['true']}")
    print(f"   Predicted: {rev['predicted']}")
    print(f"   Accuracy: {rev['accuracy']:.2f}")
    print()

## Summary Report

In [None]:
# Build a plain-text report from values computed in earlier cells
# (labeled_data, avg_labels, mood_counts, metrics), print it, and save it.

# most_common(1) returns an empty list when no moods were counted; look it up
# once and fall back to a placeholder instead of raising IndexError.
if mood_counts:
    top_mood, top_count = mood_counts.most_common(1)[0]
    most_common_text = f"{top_mood} ({top_count} occurrences)"
else:
    most_common_text = "n/a"

# Sort moods by F1 score, best first
sorted_moods = sorted(
    metrics['per_mood_metrics'].items(),
    key=lambda x: x[1]['f1'],
    reverse=True
)

header = f"""
MODEL EVALUATION SUMMARY
{'='*60}

Dataset Statistics:
- Total labeled reviews: {len(labeled_data)}
- Average moods per review: {avg_labels:.2f}
- Most common mood: {most_common_text}

Model Performance:
- Test F1 (Macro): {metrics['test_f1_macro']:.4f}
- Test F1 (Micro): {metrics['test_f1_micro']:.4f}
- Test Loss: {metrics['test_loss']:.4f}

Best Performing Moods:
"""

# Top three and bottom three by F1; both lists keep the best-first ordering
best_lines = [
    f"- {mood}: F1 = {mood_metrics['f1']:.4f}\n"
    for mood, mood_metrics in sorted_moods[:3]
]
lowest_lines = [
    f"- {mood}: F1 = {mood_metrics['f1']:.4f}\n"
    for mood, mood_metrics in sorted_moods[-3:]
]

# Assemble with a single join rather than repeated string concatenation
summary = ''.join(
    [header, *best_lines, "\nLowest Performing Moods:\n", *lowest_lines]
)

print(summary)

# Save summary. Mood names come from data files, so write UTF-8 explicitly
# rather than depending on the platform's default encoding.
with open('../models/evaluation_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print('\nEvaluation summary saved to models/evaluation_summary.txt')