# TechFlow AI Recruiter - Performance Evaluation

This notebook evaluates the Multi-Agent System performance using labeled conversation data.

## Evaluation Metrics:
- **Accuracy**: Overall correct predictions
- **Precision/Recall/F1**: Per-class performance
- **Confusion Matrix**: Detailed prediction breakdown

## Dataset:
- `sms_conversations.json`: Real SMS conversations with labeled actions (CONTINUE, SCHEDULE, END)

In [None]:
# Required Imports
import json
import os
import sys
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent of the current working directory importable so that
# later cells can import the project's own packages from there.
parent_dir = os.path.join(os.getcwd(), '..')
sys.path.append(os.path.abspath(parent_dir))

# Load variables from a local .env file (if any) into the process environment.
from dotenv import load_dotenv
load_dotenv()

print("Imports successful!")

## 1. Load Dataset

In [None]:
# Parse the conversation dataset stored one directory above this notebook.
with open('../sms_conversations.json', 'r', encoding='utf-8') as dataset_file:
    conversations = json.load(dataset_file)

print(f"Loaded {len(conversations)} conversations.")

# Tally every turn, from either speaker, whose 'label' field is present and
# non-empty (truthy).
total_labels = sum(
    len([t for t in c['turns'] if t.get('label')])
    for c in conversations
)
print(f"Total labeled turns: {total_labels}")

## 2. Initialize Main Agent

In [None]:
# This import lives here rather than in the first cell's import block.
# NOTE(review): it presumably relies on the first cell having appended the
# parent directory to sys.path so that the `app` package resolves — confirm
# before moving it.
from app.modules.agents import MainAgent

# Initialize the Main Agent
# Built with no arguments; the evaluation loop below only calls
# `agent.decide_action(history)` on it.
# NOTE(review): presumably reads its credentials/config from the environment
# populated by load_dotenv() — confirm.
agent = MainAgent()
print("Main Agent initialized successfully!")

## 3. Run Evaluation

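Note: This may take a while because every labeled turn triggers an API call. Lower `MAX_CONVERSATIONS` to evaluate a smaller subset while testing, or set it to `None` to evaluate all conversations.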

In [None]:
# Configuration
MAX_CONVERSATIONS = 10  # Set to None for all conversations

# Outputs consumed by the cells below: parallel lists of gold / predicted
# labels, plus one detail record per evaluated turn.
y_true = []
y_pred = []
results = []

# Compare against None explicitly: a plain truthiness test would treat 0 the
# same as None and silently launch a full (slow, API-backed) run.
test_convs = conversations if MAX_CONVERSATIONS is None else conversations[:MAX_CONVERSATIONS]
print(f"Evaluating {len(test_convs)} conversations...")
print("="*50)

for conv_idx, conv in enumerate(test_convs):
    # Transcript of everything said *before* the turn currently being scored;
    # reset for every conversation.
    history = ""

    for turn in conv['turns']:
        speaker = "Recruiter" if turn['speaker'] == 'recruiter' else "Candidate"
        text = turn['text']

        # Only evaluate labeled recruiter turns
        if turn['speaker'] == 'recruiter' and turn.get('label'):
            try:
                # The agent sees the history up to, but not including, this turn.
                action = agent.decide_action(history)
            except Exception as e:
                print(f"Error: {e}")
                action = "ERROR"

            # Normalize defensively. A non-string return value (e.g. None)
            # used to raise AttributeError here, outside the try block, and
            # abort the whole run; it is now recorded as an "error"
            # prediction like any other failure. Surrounding whitespace is
            # stripped so a formatting artefact is not scored as a wrong answer.
            if isinstance(action, str):
                pred = action.strip().lower()
            else:
                print(f"Error: unexpected action {action!r}")
                pred = "error"
            true = turn['label'].strip().lower()

            y_pred.append(pred)
            y_true.append(true)

            results.append({
                'conv_id': conv.get('conversation_id', conv_idx),
                'turn_id': turn.get('turn_id'),
                'predicted': pred,
                'actual': true,
                'correct': pred == true,
                'text': text[:60]  # short preview of the labeled turn
            })

            status = '✓' if pred == true else '✗'
            print(f"{status} Pred: {pred:10} | True: {true:10}")

        # Append the turn only after it has been scored, so the agent never
        # sees the message whose action it is asked to predict.
        history += f"{speaker}: {text}\n"

    # Progress marker every five conversations.
    if (conv_idx + 1) % 5 == 0:
        print(f"--- Processed {conv_idx + 1}/{len(test_convs)} conversations ---")

print("\nEvaluation complete!")

## 4. Calculate Accuracy

In [None]:
# Fraction of evaluated turns whose predicted action equals the label.
accuracy = accuracy_score(y_true, y_pred)
n_correct = sum(r['correct'] for r in results)

divider = "="*50
print(divider)
print(f"OVERALL ACCURACY: {accuracy:.2%}")
print(f"Correct: {n_correct} / {len(results)}")
print(divider)

## 5. Classification Report

In [None]:
# Per-class precision / recall / F1 table.
print("\nCLASSIFICATION REPORT")
print("="*50)
# zero_division=0 reports 0 (without a warning) for any metric whose
# denominator is empty, e.g. a class that was never predicted.
report_text = classification_report(y_true, y_pred, zero_division=0)
print(report_text)

## 6. Confusion Matrix

In [None]:
# Create Confusion Matrix
# `labels` is the canonical class order and is reused by later cells, so it
# is left unchanged.
labels = ['continue', 'schedule', 'end']

# confusion_matrix() silently drops every sample whose true or predicted
# value is not listed in `labels=`. The evaluation loop can emit values
# outside the three canonical classes (the "error" sentinel on failures, or
# an unexpected agent answer), so append whatever else was observed;
# otherwise the matrix total would not match the number of predictions.
extra_labels = sorted((set(y_true) | set(y_pred)) - set(labels))
cm_labels = labels + extra_labels
cm = confusion_matrix(y_true, y_pred, labels=cm_labels)

# Plot: rows are actual labels, columns are predicted labels.
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cells hold integer counts
    cmap='Blues',
    xticklabels=cm_labels,
    yticklabels=cm_labels,
    annot_kws={'size': 16}
)
plt.title('Confusion Matrix - Main Agent Decisions', fontsize=14, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('Actual Label', fontsize=12)
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nConfusion matrix saved to 'confusion_matrix.png'")

## 7. Error Analysis

In [None]:
# Tabulate the per-turn records collected by the evaluation loop.
results_df = pd.DataFrame(results)

# Keep only the rows where the prediction differs from the label.
is_wrong = ~results_df['correct']
errors_df = results_df[is_wrong]

print(f"Total Errors: {len(errors_df)} / {len(results_df)}")
print("\nError Examples:")

# The last expression is rendered by the notebook: the first ten mistakes.
error_columns = ['conv_id', 'predicted', 'actual', 'text']
errors_df[error_columns].head(10)

In [None]:
# Count the mistakes for each (actual, predicted) pair, most frequent first.
if not errors_df.empty:
    pair_counts = errors_df.groupby(['actual', 'predicted']).size()
    error_types = (
        pair_counts
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )
    print("\nError Distribution:")
    print(error_types)

## 8. Summary Statistics

In [None]:
# Share of correct predictions among the turns of each actual class.
print("\nPER-CLASS ACCURACY")
print("="*50)
for label in labels:
    is_label = results_df['actual'] == label
    n_samples = is_label.sum()
    # Classes that never occur in the evaluated data are left out.
    if n_samples == 0:
        continue
    class_acc = results_df.loc[is_label, 'correct'].mean()
    print(f"{label.upper():12} : {class_acc:.2%} ({n_samples} samples)")

In [None]:
# Side-by-side bar charts: how often each class occurs in the labels
# versus in the agent's predictions.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count per class in the fixed `labels` order, showing 0 for absent classes.
# NOTE: reindexing to `labels` means values outside that list are not plotted.
actual_counts = pd.Series(y_true).value_counts().reindex(labels, fill_value=0)
pred_counts = pd.Series(y_pred).value_counts().reindex(labels, fill_value=0)

panels = [
    (axes[0], actual_counts, 'Actual Label Distribution', 'steelblue'),
    (axes[1], pred_counts, 'Predicted Label Distribution', 'coral'),
]
for ax, counts, title, color in panels:
    ax.bar(counts.index, counts.values, color=color, edgecolor='black')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.savefig('label_distribution.png', dpi=150)
plt.show()

## 9. Export Results

In [None]:
# Persist the per-turn results next to the notebook (no index column).
results_df.to_csv('evaluation_results.csv', index=False)
print("Results saved to 'evaluation_results.csv'")

# Final Summary: recap of the headline numbers computed in earlier cells.
rule = "="*50
summary_lines = [
    "\n" + rule,
    "FINAL EVALUATION SUMMARY",
    rule,
    f"Total Predictions: {len(y_true)}",
    f"Accuracy: {accuracy:.2%}",
    f"Correct: {sum(results_df['correct'])}",
    f"Errors: {len(errors_df)}",
    rule,
]
for summary_line in summary_lines:
    print(summary_line)