# In [None]:
# Cell 1: Setup and model loading
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from src.sentiment_analyzer import EmployeeSentimentAnalyzer

# Load trained model
# NOTE(review): presumably the constructor loads the trained weights itself,
# since no path is passed here -- confirm in src/sentiment_analyzer.
analyzer = EmployeeSentimentAnalyzer()

# Load test data
test_df = pd.read_csv('data/processed/test_data.csv')

# Empty CSV cells are read as NaN. A NaN feedback cannot be sliced or measured
# as a string, and a NaN sentiment has no entry in the label mapping used by
# the metrics cell, so such rows are removed up front and reported instead of
# crashing a later cell half-way through the evaluation.
rows_before = len(test_df)
test_df = test_df.dropna(subset=['feedback', 'sentiment']).reset_index(drop=True)
rows_dropped = rows_before - len(test_df)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing feedback or sentiment")

# Plain Python lists, index-aligned with test_df (index was reset above)
test_texts = test_df['feedback'].tolist()
true_labels = test_df['sentiment'].tolist()

# Cell 2: Batch prediction
# tqdm only provides the progress bar. It was used here without ever being
# imported, so import it locally; if it is not installed, fall back to plain
# iteration so the evaluation itself still runs.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Stand-in for tqdm: return the iterable unchanged (no progress bar)."""
        return iterable

print("Running batch predictions...")
predictions = []   # predicted label per test text, same order as test_texts
confidences = []   # 'confidence' value reported by the analyzer for each prediction

# One analyzer call per text; each result is read as a mapping with
# 'sentiment' and 'confidence' keys.
for text in tqdm(test_texts):
    result = analyzer.predict_sentiment(text)
    predictions.append(result['sentiment'])
    confidences.append(result['confidence'])

# Cell 3: Performance metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Convert string labels to numeric for sklearn
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
true_numeric = [label_map[label] for label in true_labels]
pred_numeric = [label_map[pred] for pred in predictions]

accuracy = accuracy_score(true_numeric, pred_numeric)

# Pin the label set explicitly. Without `labels`, sklearn only reports classes
# that actually occur in the data, so a class missing from both lists would
# shorten the result arrays and shift every per-class row below onto the wrong
# name (or raise IndexError). zero_division=0 reports 0.0 for a class with no
# samples/predictions instead of emitting a warning.
precision, recall, f1, support = precision_recall_fscore_support(
    true_numeric,
    pred_numeric,
    labels=list(label_map.values()),
    average=None,
    zero_division=0,
)

print(f"Overall Accuracy: {accuracy:.4f}")
print("\nPer-class metrics:")
# Iterating the dict yields the class names in insertion order, which is the
# same order as the `labels` list passed above, so position i lines up.
for i, label in enumerate(label_map):
    print(f"{label}: Precision={precision[i]:.4f}, Recall={recall[i]:.4f}, F1={f1[i]:.4f}")

# Cell 4: Confusion Matrix
# Class names in the order of their numeric codes (0, 1, 2) from the label
# mapping used to build true_numeric / pred_numeric.
class_names = ['Negative', 'Neutral', 'Positive']

# Pass the full label set so the matrix is always 3x3. Without `labels`, a
# class that never occurs would be left out and the tick labels below would
# no longer match the rows/columns.
cm = confusion_matrix(
    true_numeric,
    pred_numeric,
    labels=list(range(len(class_names))),
)

plt.figure(figsize=(8, 6))
# Rows are the actual classes, columns the predicted ones
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Cell 5: Confidence analysis
# Two panels side by side: overall confidence histogram (left) and confidence
# split by whether the prediction was correct (right).
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

hist_ax.hist(confidences, bins=50, alpha=0.7)
hist_ax.set_title('Model Confidence Distribution')
hist_ax.set_xlabel('Confidence Score')
hist_ax.set_ylabel('Frequency')

confidence_df = pd.DataFrame({
    'confidence': confidences,
    'correct': [t == p for t, p in zip(true_labels, predictions)]
})
# DataFrame.boxplot(by=...) opens a brand-new figure unless it is given an
# axes to draw on, which left the right-hand panel empty. Pass the axes
# explicitly so both plots share one figure.
confidence_df.boxplot(column='confidence', by='correct', ax=box_ax)
# pandas adds a figure-level "Boxplot grouped by ..." title; clear it so it
# does not sit on top of the two panel titles.
fig.suptitle('')
box_ax.set_title('Confidence by Prediction Accuracy')
box_ax.set_xlabel('Correct Prediction')
box_ax.set_ylabel('Confidence Score')
plt.tight_layout()
plt.show()

# Cell 6: Error analysis
# Positions in the test set where the predicted label differs from the true one
incorrect_indices = [
    idx
    for idx, pair in enumerate(zip(true_labels, predictions))
    if pair[0] != pair[1]
]
print(f"Number of incorrect predictions: {len(incorrect_indices)}")

# Print the first five misclassified samples; the text is cut to its first
# 100 characters and always followed by an ellipsis.
print("\nExamples of incorrect predictions:")
for idx in incorrect_indices[:5]:
    snippet = test_texts[idx][:100]
    print(f"\nText: {snippet}...")
    expected = true_labels[idx]
    got = predictions[idx]
    score = confidences[idx]
    print(f"True: {expected}, Predicted: {got}, Confidence: {score:.3f}")

# Cell 7: Performance by text length
# Character count of each feedback text, and whether its prediction was right
test_df['text_length'] = test_df['feedback'].str.len()
test_df['correct'] = [t == p for t, p in zip(true_labels, predictions)]

# Bin by text length
# An integer `bins` makes pd.cut split the observed length range into five
# equal-width intervals, so some bins may contain no samples at all.
test_df['length_bin'] = pd.cut(
    test_df['text_length'],
    bins=5,
    labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long'],
)
# observed=False is spelled out because the default for categorical groupers
# is changing in pandas: it keeps every length category in the result (empty
# ones get NaN accuracy) and avoids the FutureWarning on recent versions.
accuracy_by_length = test_df.groupby('length_bin', observed=False)['correct'].mean()

plt.figure(figsize=(10, 6))
accuracy_by_length.plot(kind='bar')
plt.title('Model Accuracy by Text Length')
plt.xlabel('Text Length Category')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()