In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import time
import requests
from tqdm import tqdm

# Project root. Defaults to the original XAMPP install location but can be
# overridden with the PESAGURU_ROOT environment variable, so the notebook is
# not tied to a single machine layout.
PROJECT_ROOT = os.environ.get('PESAGURU_ROOT', 'C:/xampp/htdocs/PesaGuru')

# Add project root to path for importing PesaGuru modules.
# Guarded so re-running this cell does not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Configure paths (all derived from PROJECT_ROOT)
MODEL_DIR = f'{PROJECT_ROOT}/ai/models'
DATA_DIR = f'{PROJECT_ROOT}/notebooks/data'
RESULTS_DIR = f'{PROJECT_ROOT}/notebooks/outputs/model_evaluation'

# Create results directory if it doesn't exist
os.makedirs(RESULTS_DIR, exist_ok=True)

# Set style for visualizations
sns.set(style="whitegrid")
plt.style.use('fivethirtyeight')

# Evaluation date: used as a suffix in every saved figure / JSON file name
EVAL_DATE = datetime.now().strftime("%Y-%m-%d")

print(f"Starting AI Model Evaluation - {EVAL_DATE}")
print("=" * 50)

Starting AI Model Evaluation - 2025-03-14


In [None]:

def load_intent_classifier():
    """Return a loaded IntentClassifier, or None when loading fails.

    The project module is imported inside the function so that any failure
    (import, construction or loading) is reported on stdout instead of
    raising.
    """
    try:
        from ai.models.intent_classifier import IntentClassifier

        classifier = IntentClassifier()
        model_file = os.path.join(MODEL_DIR, 'intent_classifier.pkl')
        classifier.load(model_file)
    except Exception as exc:
        print(f"Error loading intent classifier: {exc}")
        return None
    return classifier

def load_entity_extractor():
    """Return a loaded EntityExtractor, or None when loading fails.

    The project module is imported inside the function so that any failure
    (import, construction or loading) is reported on stdout instead of
    raising.
    """
    try:
        from ai.models.entity_extractor import EntityExtractor

        extractor = EntityExtractor()
        model_file = os.path.join(MODEL_DIR, 'entity_extractor.pkl')
        extractor.load(model_file)
    except Exception as exc:
        print(f"Error loading entity extractor: {exc}")
        return None
    return extractor

def load_sentiment_model():
    """Return a loaded SentimentAnalyzer, or None when loading fails.

    The project module is imported inside the function so that any failure
    (import, construction or loading) is reported on stdout instead of
    raising.
    """
    try:
        from ai.models.sentiment_model import SentimentAnalyzer

        analyzer = SentimentAnalyzer()
        model_file = os.path.join(MODEL_DIR, 'sentiment_model.pkl')
        analyzer.load(model_file)
    except Exception as exc:
        print(f"Error loading sentiment model: {exc}")
        return None
    return analyzer

def load_recommendation_model():
    """Return a loaded RecommendationEngine, or None when loading fails.

    The project module is imported inside the function so that any failure
    (import, construction or loading) is reported on stdout instead of
    raising.
    """
    try:
        from ai.models.recommendation_model import RecommendationEngine

        engine = RecommendationEngine()
        model_file = os.path.join(MODEL_DIR, 'recommendation_model.pkl')
        engine.load(model_file)
    except Exception as exc:
        print(f"Error loading recommendation model: {exc}")
        return None
    return engine

def load_test_data():
    """Load test datasets for model evaluation.

    Every CSV is read independently from DATA_DIR. A file that cannot be
    read is reported on stdout and its key is simply left out of the result,
    so callers must check for key presence before use.

    Returns:
        dict mapping dataset key ('intent', 'entity', 'sentiment',
        'recommendation', 'user_profiles') to the loaded DataFrame.
    """
    # (result key, CSV file name, label used in the status messages)
    datasets = (
        ('intent', 'intent_test_data.csv', 'intent'),
        ('entity', 'entity_test_data.csv', 'entity'),
        ('sentiment', 'sentiment_test_data.csv', 'sentiment'),
        ('recommendation', 'recommendation_test_data.csv', 'recommendation'),
        ('user_profiles', 'user_profiles_test.csv', 'user profiles'),
    )

    test_data = {}
    for key, csv_name, label in datasets:
        try:
            test_data[key] = pd.read_csv(os.path.join(DATA_DIR, csv_name))
            print(f"Loaded {label} test data: {len(test_data[key])} samples")
        except Exception as e:
            # Best effort: one missing file must not prevent loading the others
            print(f"Error loading {label} test data: {e}")

    return test_data

# Attempt to load all models
# Each loader returns None on failure, so these names are always bound and the
# evaluation functions below can test them with `is None` before use.
print("Loading models...")
intent_model = load_intent_classifier()
entity_model = load_entity_extractor()
sentiment_model = load_sentiment_model()
recommendation_model = load_recommendation_model()

# Load test data
# Datasets that failed to load are absent from the dict (no key), not None.
print("\nLoading test data...")
test_data = load_test_data()

In [None]:
def evaluate_intent_classification():
    """Evaluate intent classification performance.

    Uses the module-level ``intent_model`` and ``test_data['intent']`` (read
    through its ``text`` and ``intent`` columns). Queries are predicted one at
    a time so the average per-query latency can be reported, and a labelled
    confusion matrix is saved to RESULTS_DIR.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` (weighted
        averages), ``response_time_ms`` and ``date``; or None when the model
        or test data is unavailable or the test set is empty.
    """
    print("\n" + "=" * 50)
    print("INTENT CLASSIFICATION EVALUATION")
    print("=" * 50)

    if intent_model is None or 'intent' not in test_data:
        print("Cannot evaluate: Intent model or test data not available")
        return None

    # Prepare data
    X_test = test_data['intent']['text'].tolist()
    y_true = test_data['intent']['intent'].tolist()

    if not X_test:
        # Avoids a ZeroDivisionError in the latency average below
        print("Cannot evaluate: Intent test data is empty")
        return None

    # Measure response time with a monotonic, high-resolution clock
    start_time = time.perf_counter()
    y_pred = [intent_model.predict(query) for query in tqdm(X_test, desc="Processing intents")]
    avg_response_time = (time.perf_counter() - start_time) / len(X_test) * 1000  # in milliseconds

    # Calculate metrics (float() keeps the results JSON-serialisable;
    # zero_division=0 yields the same values as the default, without warnings)
    accuracy = float(accuracy_score(y_true, y_pred))
    precision = float(precision_score(y_true, y_pred, average='weighted', zero_division=0))
    recall = float(recall_score(y_true, y_pred, average='weighted', zero_division=0))
    f1 = float(f1_score(y_true, y_pred, average='weighted', zero_division=0))

    # Print metrics
    print(f"\nIntent Classification Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Average Response Time: {avg_response_time:.2f} ms")

    # Plot confusion matrix. The label list is passed explicitly so the matrix
    # rows/columns and the tick labels are guaranteed to be in the same order.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title('Intent Classification Confusion Matrix')
    ax.set_xlabel('Predicted Intent')
    ax.set_ylabel('Actual Intent')
    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, f'intent_cm_{EVAL_DATE}.png'))

    # Save results to dictionary
    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'response_time_ms': avg_response_time,
        'date': EVAL_DATE
    }

    return results

def _parse_entity_annotation(raw):
    """Turn one cell of the ``entities`` column into a list of entity dicts.

    Strings are tried as JSON first, then as a Python literal (handles
    single-quoted reprs and values that contain apostrophes), and finally
    with the legacy quote-swapping heuristic. None/NaN cells mean "no
    entities"; any other non-string value is returned unchanged.
    """
    import ast  # local import: only this helper needs it

    if not isinstance(raw, str):
        if raw is None or (isinstance(raw, float) and raw != raw):  # NaN check
            return []
        return raw

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Legacy behaviour, kept as the last resort for mixed-style strings
        return json.loads(raw.replace("'", "\""))


def evaluate_entity_extraction():
    """Evaluate entity extraction performance.

    Uses the module-level ``entity_model`` and ``test_data['entity']`` (read
    through its ``text`` and ``entities`` columns). An extraction counts as
    correct when its ``(entity, value)`` pair exactly matches a ground-truth
    pair of the same sample; pairs are de-duplicated per sample so precision
    and recall use the same counting basis. This is a simplified, exact-match
    evaluation.

    Returns:
        dict with overall ``precision``, ``recall``, ``f1``,
        ``response_time_ms``, per-type ``entity_f1`` and ``date``; or None
        when the model or test data is unavailable or the test set is empty.
    """
    print("\n" + "=" * 50)
    print("ENTITY EXTRACTION EVALUATION")
    print("=" * 50)

    if entity_model is None or 'entity' not in test_data:
        print("Cannot evaluate: Entity model or test data not available")
        return None

    # Prepare data
    texts = test_data['entity']['text'].tolist()
    true_entities = [_parse_entity_annotation(raw)
                     for raw in test_data['entity']['entities'].tolist()]

    if not texts:
        # Avoids a ZeroDivisionError in the latency average below
        print("Cannot evaluate: Entity test data is empty")
        return None

    # Measure response time with a monotonic, high-resolution clock
    start_time = time.perf_counter()
    predicted_entities = [entity_model.extract_entities(text)
                          for text in tqdm(texts, desc="Processing entities")]
    avg_response_time = (time.perf_counter() - start_time) / len(texts) * 1000  # in milliseconds

    def _ratio(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0

    def _f1(p, r):
        return 2 * (p * r) / (p + r) if (p + r) > 0 else 0

    # Single pass collecting both the overall and the per-type counts
    correct = total_true = total_predicted = 0
    true_by_type = {}       # keys double as the (first-seen ordered) list of entity types
    predicted_by_type = {}
    correct_by_type = {}

    for true, pred in zip(true_entities, predicted_entities):
        for e in true:
            true_by_type.setdefault(e['entity'], 0)  # register types in a stable order

        true_set = {(e['entity'], e['value']) for e in true}
        pred_set = {(e['entity'], e['value']) for e in (pred or [])}
        matched = true_set & pred_set

        correct += len(matched)
        total_true += len(true_set)
        total_predicted += len(pred_set)

        for entity_type, _ in true_set:
            true_by_type[entity_type] += 1
        for entity_type, _ in pred_set:
            predicted_by_type[entity_type] = predicted_by_type.get(entity_type, 0) + 1
        for entity_type, _ in matched:
            correct_by_type[entity_type] = correct_by_type.get(entity_type, 0) + 1

    precision = _ratio(correct, total_predicted)
    recall = _ratio(correct, total_true)
    f1 = _f1(precision, recall)

    # Print metrics
    print(f"\nEntity Extraction Results:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Average Response Time: {avg_response_time:.2f} ms")

    # F1 per entity type. Separate variable names are used on purpose: reusing
    # `precision`/`recall` here would overwrite the overall values returned below.
    entity_f1 = {}
    for entity_type, true_count in true_by_type.items():
        type_correct = correct_by_type.get(entity_type, 0)
        type_recall = _ratio(type_correct, true_count)
        type_precision = _ratio(type_correct, predicted_by_type.get(entity_type, 0))
        entity_f1[entity_type] = _f1(type_precision, type_recall)

    # Plot F1 scores by entity type
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(list(entity_f1.keys()), list(entity_f1.values()))
    ax.set_title('Entity Extraction F1 Score by Entity Type')
    ax.set_xlabel('Entity Type')
    ax.set_ylabel('F1 Score')
    ax.set_ylim(0, 1)
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()  # after rotating the ticks so the labels are not clipped
    fig.savefig(os.path.join(RESULTS_DIR, f'entity_f1_by_type_{EVAL_DATE}.png'))

    # Save results to dictionary
    results = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'response_time_ms': avg_response_time,
        'entity_f1': entity_f1,
        'date': EVAL_DATE
    }

    return results

def evaluate_sentiment_analysis():
    """Evaluate sentiment analysis performance.

    Uses the module-level ``sentiment_model`` and ``test_data['sentiment']``
    (read through its ``text`` and ``sentiment`` columns). Texts are predicted
    one at a time so the average per-text latency can be reported, and a
    labelled confusion matrix is saved to RESULTS_DIR.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` (weighted
        averages), ``response_time_ms`` and ``date``; or None when the model
        or test data is unavailable or the test set is empty.
    """
    print("\n" + "=" * 50)
    print("SENTIMENT ANALYSIS EVALUATION")
    print("=" * 50)

    if sentiment_model is None or 'sentiment' not in test_data:
        print("Cannot evaluate: Sentiment model or test data not available")
        return None

    # Prepare data
    texts = test_data['sentiment']['text'].tolist()
    true_sentiments = test_data['sentiment']['sentiment'].tolist()

    if not texts:
        # Avoids a ZeroDivisionError in the latency average below
        print("Cannot evaluate: Sentiment test data is empty")
        return None

    # Measure response time with a monotonic, high-resolution clock
    start_time = time.perf_counter()
    predicted_sentiments = [sentiment_model.predict(text)
                            for text in tqdm(texts, desc="Processing sentiment")]
    avg_response_time = (time.perf_counter() - start_time) / len(texts) * 1000  # in milliseconds

    # Calculate metrics (float() keeps the results JSON-serialisable;
    # zero_division=0 yields the same values as the default, without warnings)
    accuracy = float(accuracy_score(true_sentiments, predicted_sentiments))
    precision = float(precision_score(true_sentiments, predicted_sentiments,
                                      average='weighted', zero_division=0))
    recall = float(recall_score(true_sentiments, predicted_sentiments,
                                average='weighted', zero_division=0))
    f1 = float(f1_score(true_sentiments, predicted_sentiments,
                        average='weighted', zero_division=0))

    # Print metrics
    print(f"\nSentiment Analysis Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Average Response Time: {avg_response_time:.2f} ms")

    # Plot confusion matrix. Labels are derived from the data and passed to
    # confusion_matrix explicitly: a hard-coded tick list would be mislabelled
    # (or the wrong length) whenever a class is missing or spelled differently.
    sentiment_labels = sorted(set(true_sentiments) | set(predicted_sentiments))
    cm = confusion_matrix(true_sentiments, predicted_sentiments, labels=sentiment_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=sentiment_labels, yticklabels=sentiment_labels, ax=ax)
    ax.set_title('Sentiment Analysis Confusion Matrix')
    ax.set_xlabel('Predicted Sentiment')
    ax.set_ylabel('Actual Sentiment')
    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, f'sentiment_cm_{EVAL_DATE}.png'))

    # Save results to dictionary
    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'response_time_ms': avg_response_time,
        'date': EVAL_DATE
    }

    return results

# Run NLP evaluations
# Each call returns a metrics dict, or None when its model or test data is
# unavailable; later cells read these module-level names.
intent_results = evaluate_intent_classification()
entity_results = evaluate_entity_extraction()
sentiment_results = evaluate_sentiment_analysis()

In [None]:
def evaluate_recommendation_accuracy():
    """Evaluate investment recommendation accuracy.

    Uses the module-level ``recommendation_model``, ``test_data['recommendation']``
    (columns ``user_id`` and ``expected_recommendation``) and
    ``test_data['user_profiles']`` (column ``user_id``; ``risk_tolerance`` is
    optional and defaults to 'moderate').

    The model is queried exactly once per test case; that single answer feeds
    the overall accuracy, the latency, the risk-consistency score and the
    per-risk-profile chart, so all reported figures describe the same run.
    Test cases whose user_id has no profile are reported and excluded from
    every metric instead of aborting the evaluation.

    Returns:
        dict with ``accuracy``, ``response_time_ms``, ``risk_consistency``,
        ``accuracy_by_risk`` and ``date``; or None when the model or test
        data is unavailable or no test case could be evaluated.
    """
    print("\n" + "=" * 50)
    print("RECOMMENDATION ENGINE EVALUATION")
    print("=" * 50)

    if recommendation_model is None or 'recommendation' not in test_data or 'user_profiles' not in test_data:
        print("Cannot evaluate: Recommendation model or test data not available")
        return None

    # Prepare data
    user_profiles = test_data['user_profiles']
    recommendations_test = test_data['recommendation']

    # Index profiles by user_id once (first occurrence wins on duplicates)
    # instead of filtering the whole DataFrame for every test case.
    profiles_by_id = {}
    for profile in user_profiles.to_dict('records'):
        profiles_by_id.setdefault(profile['user_id'], profile)

    case_records = []
    skipped_cases = 0

    test_cases = zip(recommendations_test['user_id'],
                     recommendations_test['expected_recommendation'])
    for user_id, expected_recommendation in tqdm(test_cases, desc="Evaluating recommendations",
                                                 total=len(recommendations_test)):
        stored_profile = profiles_by_id.get(user_id)
        if stored_profile is None:
            skipped_cases += 1
            continue

        # Hand the model its own copy so it cannot alter the shared profile
        user_profile = dict(stored_profile)

        # Measure response time with a monotonic, high-resolution clock
        start_time = time.perf_counter()
        recommendation = recommendation_model.get_recommendation(user_profile)
        response_time = (time.perf_counter() - start_time) * 1000  # in milliseconds

        # Score risk consistency (0-1)
        user_risk_tolerance = user_profile.get('risk_tolerance', 'moderate')
        recommendation_risk = recommendation_model.get_recommendation_risk(recommendation)
        consistency = recommendation_model.calculate_risk_consistency(
            user_risk_tolerance, recommendation_risk)

        case_records.append({
            'user_risk': user_risk_tolerance,
            'correct': bool(recommendation == expected_recommendation),
            'response_time_ms': response_time,
            'risk_consistency': consistency,
        })

    if skipped_cases:
        print(f"Skipped {skipped_cases} test case(s) with no matching user profile")

    if not case_records:
        print("Cannot evaluate: no test case could be matched to a user profile")
        return None

    # Calculate metrics
    evaluated = len(case_records)
    accuracy = sum(r['correct'] for r in case_records) / evaluated
    avg_response_time = sum(r['response_time_ms'] for r in case_records) / evaluated
    avg_risk_consistency = sum(r['risk_consistency'] for r in case_records) / evaluated

    # Print metrics
    print(f"\nRecommendation Engine Results:")
    print(f"Recommendation Accuracy: {accuracy:.4f}")
    print(f"Average Response Time: {avg_response_time:.2f} ms")
    print(f"Risk Profile Consistency: {avg_risk_consistency:.4f}")

    # Calculate accuracy by risk profile from the outcomes recorded above
    accuracy_by_risk = pd.DataFrame(case_records).groupby('user_risk')['correct'].mean()

    fig, ax = plt.subplots(figsize=(10, 6))
    accuracy_by_risk.plot(kind='bar', ax=ax)
    ax.set_title('Recommendation Accuracy by Risk Profile')
    ax.set_xlabel('Risk Tolerance')
    ax.set_ylabel('Accuracy')
    ax.set_ylim(0, 1)
    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, f'recommendation_by_risk_{EVAL_DATE}.png'))

    # Save results to dictionary
    results = {
        'accuracy': accuracy,
        'response_time_ms': avg_response_time,
        'risk_consistency': avg_risk_consistency,
        'accuracy_by_risk': accuracy_by_risk.to_dict(),
        'date': EVAL_DATE
    }

    return results

# Run the recommendation evaluation; the result is read by later cells.
recommendation_results = evaluate_recommendation_accuracy()

In [None]:
def evaluate_chatbot_response_time():
    """Evaluate chatbot response time for common queries.

    Sends a fixed set of sample queries (tagged simple / medium / complex) to
    the project's ChatbotController and measures the time of each
    ``process_query`` call. A bar chart of the timings is saved to
    RESULTS_DIR.

    Any failure (controller import, query processing, plotting) is reported
    on stdout and turns the result into None.

    Returns:
        dict with ``overall_avg_response_time_ms``,
        ``response_time_by_complexity`` and ``date``; or None on failure.
    """
    print("\n" + "=" * 50)
    print("CHATBOT RESPONSE TIME EVALUATION")
    print("=" * 50)

    # Define sample queries of different complexity
    sample_queries = [
        {"query": "What is the current exchange rate for USD to KES?", "complexity": "simple"},
        {"query": "How do I open a stock trading account?", "complexity": "simple"},
        {"query": "What are the best investment options for me?", "complexity": "medium"},
        {"query": "Compare M-Shwari and KCB-M-Pesa loan rates", "complexity": "medium"},
        {"query": "Given my savings of 50,000 KES, what investment portfolio would you recommend for moderate risk?", "complexity": "complex"},
        {"query": "Show me the performance of Safaricom stock over the past 3 months and analyze its trend", "complexity": "complex"}
    ]

    # Dummy user for testing (defined once; each query receives its own copy)
    test_user = {
        "user_id": "test_user_1",
        "name": "Test User",
        "language": "en",
        "risk_tolerance": "moderate"
    }

    # Setup connection to chatbot API
    try:
        # Import the chatbot controller. The path is only added when missing,
        # so re-running the cell does not pile up duplicate sys.path entries.
        server_dir = 'C:/xampp/htdocs/PesaGuru/server'
        if server_dir not in sys.path:
            sys.path.append(server_dir)
        from controllers.chatbotController import ChatbotController

        chatbot = ChatbotController()

        # Process each query
        response_times = []

        for query_data in tqdm(sample_queries, desc="Testing chatbot response time"):
            query = query_data["query"]
            user = dict(test_user)  # copied outside the timed section

            # Measure response time with a monotonic, high-resolution clock.
            # The response itself is not inspected; only the latency matters.
            start_time = time.perf_counter()
            chatbot.process_query(query, user)
            response_time = (time.perf_counter() - start_time) * 1000  # in milliseconds

            response_times.append({
                "query": query,
                "complexity": query_data["complexity"],
                "response_time_ms": response_time
            })

        # Calculate average response time by complexity
        response_df = pd.DataFrame(response_times)
        avg_by_complexity = response_df.groupby('complexity')['response_time_ms'].mean()
        overall_avg = float(response_df['response_time_ms'].mean())  # float(): JSON-friendly

        # Print results
        print(f"\nChatbot Response Time Results:")
        print(f"Overall Average Response Time: {overall_avg:.2f} ms")
        print("\nAverage Response Time by Complexity:")
        for complexity, avg_time in avg_by_complexity.items():
            print(f"{complexity.capitalize()}: {avg_time:.2f} ms")

        # Plot response times by complexity
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x='complexity', y='response_time_ms', data=response_df,
                    order=['simple', 'medium', 'complex'], ax=ax)
        ax.set_title('Average Response Time by Query Complexity')
        ax.set_xlabel('Query Complexity')
        ax.set_ylabel('Response Time (ms)')
        fig.tight_layout()
        fig.savefig(os.path.join(RESULTS_DIR, f'response_time_by_complexity_{EVAL_DATE}.png'))

        # Return results
        results = {
            'overall_avg_response_time_ms': overall_avg,
            'response_time_by_complexity': avg_by_complexity.to_dict(),
            'date': EVAL_DATE
        }

        return results

    except Exception as e:
        print(f"Error evaluating chatbot response time: {e}")
        return None

# Run the chatbot latency evaluation; the result (None on failure) is read by later cells.
chatbot_response_results = evaluate_chatbot_response_time()

In [None]:
def compare_with_benchmarks():
    """Compare evaluation results with target benchmarks.

    Reads the module-level result dicts (``intent_results``,
    ``entity_results``, ``sentiment_results``, ``recommendation_results``,
    ``chatbot_response_results``); components whose result is None/empty are
    left out. Response-time metrics meet their target when they are at or
    below it, all other metrics when they are at or above it.

    The comparison is printed and drawn as a grouped bar chart saved to
    RESULTS_DIR. Each bar is outlined green or red according to its own
    ``met`` flag.

    Returns:
        dict: ``{component: {metric: {'current', 'target', 'met'}}}`` where
        ``current`` is a float and ``met`` a plain bool (JSON-serialisable).
    """
    print("\n" + "=" * 50)
    print("COMPARISON WITH BENCHMARK TARGETS")
    print("=" * 50)

    # Define target benchmarks
    benchmarks = {
        'intent_recognition': {
            'accuracy': 0.90,
            'response_time_ms': 1000  # 1 second
        },
        'entity_extraction': {
            'f1': 0.85,
            'response_time_ms': 1000
        },
        'sentiment_analysis': {
            'accuracy': 0.80,
            'response_time_ms': 1000
        },
        'recommendation': {
            'accuracy': 0.85,
            'risk_consistency': 0.90,
            'response_time_ms': 2000  # 2 seconds
        },
        'chatbot_response': {
            'overall_avg_response_time_ms': 2000,
            'simple_response_time_ms': 1000,
            'medium_response_time_ms': 2000,
            'complex_response_time_ms': 3000
        }
    }

    # Gather current results
    current_results = {
        'intent_recognition': intent_results,
        'entity_extraction': entity_results,
        'sentiment_analysis': sentiment_results,
        'recommendation': recommendation_results,
        'chatbot_response': chatbot_response_results
    }

    # Metrics compared for each component, in display order
    tracked_metrics = {
        'intent_recognition': ['accuracy', 'response_time_ms'],
        'entity_extraction': ['f1', 'response_time_ms'],
        'sentiment_analysis': ['accuracy', 'response_time_ms'],
        'recommendation': ['accuracy', 'risk_consistency', 'response_time_ms'],
        'chatbot_response': ['overall_avg_response_time_ms']
    }

    def _is_response_time(metric):
        # For response time, lower is better; for everything else, higher is better
        return 'response_time' in metric

    def _compare(metric, current, target):
        met = current <= target if _is_response_time(metric) else current >= target
        # float()/bool() strip numpy scalar types, which json.dump cannot handle
        return {'current': float(current), 'target': target, 'met': bool(met)}

    # Create comparison
    comparison = {}
    for component, metric_names in tracked_metrics.items():
        component_results = current_results[component]
        if not component_results:
            continue
        comparison[component] = {
            metric: _compare(metric, component_results[metric], benchmarks[component][metric])
            for metric in metric_names
        }

    # Add complexity-specific chatbot comparisons if available
    if chatbot_response_results:
        by_complexity = chatbot_response_results.get('response_time_by_complexity', {})
        for complexity in ['simple', 'medium', 'complex']:
            if complexity in by_complexity:
                metric = f'{complexity}_response_time_ms'
                comparison['chatbot_response'][metric] = _compare(
                    metric, by_complexity[complexity], benchmarks['chatbot_response'][metric])

    # Print comparison
    print("\nPerformance Comparison with Benchmarks:")

    for model, metrics in comparison.items():
        print(f"\n{model.upper()}")
        for metric, values in metrics.items():
            status = "✅ MET" if values['met'] else "❌ NOT MET"
            print(f"  {metric}: {values['current']:.2f} vs target {values['target']:.2f} - {status}")

    if not comparison:
        # Nothing was evaluated; an empty chart would only raise
        print("No evaluation results available - skipping benchmark chart.")
        return comparison

    # Create visual benchmark comparison.
    # Bars are drawn directly with matplotlib so that each bar is tied to its
    # own (component, metric) entry; relying on the patch order produced by a
    # hue-grouped seaborn barplot pairs bars with the wrong rows.
    all_metrics = list(dict.fromkeys(
        metric for metrics in comparison.values() for metric in metrics))
    metric_colors = dict(zip(all_metrics, sns.color_palette(n_colors=len(all_metrics))))
    labelled_metrics = set()  # each metric appears once in the legend

    fig, ax = plt.subplots(figsize=(12, 8))

    for position, (model, metrics) in enumerate(comparison.items()):
        bar_width = 0.8 / len(metrics)
        for slot, (metric, values) in enumerate(metrics.items()):
            if _is_response_time(metric):
                performance_ratio = values['target'] / values['current'] if values['current'] > 0 else 1
                performance_ratio = min(performance_ratio, 1.5)  # Cap at 150% for visualization
            else:
                performance_ratio = values['current'] / values['target'] if values['target'] > 0 else 0

            ax.bar(
                position - 0.4 + bar_width * (slot + 0.5),
                performance_ratio,
                width=bar_width,
                color=metric_colors[metric],
                edgecolor='green' if values['met'] else 'red',
                linewidth=2,
                label=metric if metric not in labelled_metrics else '_nolegend_'
            )
            labelled_metrics.add(metric)

    ax.set_xticks(range(len(comparison)))
    ax.set_xticklabels(list(comparison.keys()))
    ax.axhline(y=1.0, color='r', linestyle='--', alpha=0.7)
    ax.set_title('Model Performance Relative to Benchmarks')
    ax.set_xlabel('Model Component')
    ax.set_ylabel('Performance Ratio (Current/Target)')
    ax.legend(title='Metric', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, f'benchmark_comparison_{EVAL_DATE}.png'))

    # Return comparison
    return comparison

# Run the benchmark comparison; the result is read by the summary report cell.
benchmark_comparison = compare_with_benchmarks()

In [None]:
def generate_summary_report():
    """Generate a summary report of all evaluations.

    Reads the module-level result dicts and ``benchmark_comparison``, then:

    * averages the headline quality scores that are available,
    * sorts every benchmarked metric into strengths (target met) or areas
      for improvement (target missed),
    * adds one targeted recommendation per component that missed at least
      one target, followed by the general recommendations,
    * writes the detailed results and the summary as JSON files into
      RESULTS_DIR and prints the summary.

    Returns:
        dict with ``overall_performance``, ``areas_of_strength``,
        ``areas_for_improvement`` and ``recommendations``.
    """
    print("\n" + "=" * 50)
    print("GENERATING SUMMARY REPORT")
    print("=" * 50)

    # Combine all results
    all_results = {
        'intent_recognition': intent_results,
        'entity_extraction': entity_results,
        'sentiment_analysis': sentiment_results,
        'recommendation': recommendation_results,
        'chatbot_response': chatbot_response_results,
        'benchmark_comparison': benchmark_comparison,
        'evaluation_date': EVAL_DATE
    }

    # Create summary metrics
    summary = {
        'overall_performance': {},
        'areas_of_strength': [],
        'areas_for_improvement': [],
        'recommendations': []
    }

    # Calculate overall performance score
    scores = []

    if intent_results:
        scores.append(intent_results['accuracy'])

    if entity_results:
        scores.append(entity_results['f1'])

    if sentiment_results:
        scores.append(sentiment_results['accuracy'])

    if recommendation_results:
        scores.append(recommendation_results['accuracy'])
        scores.append(recommendation_results['risk_consistency'])

    if scores:
        summary['overall_performance']['average_score'] = sum(scores) / len(scores)

    # Determine areas of strength / improvement.
    # The components that missed a target are tracked separately: the list
    # entries have the form "<component> - <metric>", so testing the bare
    # component name against that list would never match.
    components_needing_work = set()

    if benchmark_comparison:
        for model, metrics in benchmark_comparison.items():
            for metric, values in metrics.items():
                if values['met']:
                    summary['areas_of_strength'].append(f"{model} - {metric}")
                else:
                    summary['areas_for_improvement'].append(f"{model} - {metric}")
                    components_needing_work.add(model)

    # Generate recommendations (one per component that missed a target)
    targeted_recommendations = {
        'intent_recognition':
            "Improve intent recognition by expanding the training dataset with more diverse user queries.",
        'entity_extraction':
            "Enhance entity extraction by fine-tuning the model on Kenyan financial terminology and adding more entity types.",
        'sentiment_analysis':
            "Improve sentiment analysis accuracy by training on more financial-specific sentiment data and local expressions.",
        'recommendation':
            "Enhance investment recommendation accuracy by improving risk profiling and expanding the product database.",
        'chatbot_response':
            "Optimize response time by implementing caching mechanisms and parallel processing for complex queries."
    }

    for component, advice in targeted_recommendations.items():
        if component in components_needing_work:
            summary['recommendations'].append(advice)

    # Add general recommendations
    summary['recommendations'].extend([
        "Implement continuous learning from user feedback to improve model accuracy over time.",
        "Add support for Swahili language to increase user accessibility.",
        "Expand test datasets to cover more edge cases and real-world scenarios."
    ])

    def _to_builtin(value):
        """json.dump fallback: convert numpy scalars/arrays to built-in types."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    results_path = os.path.join(RESULTS_DIR, f'evaluation_results_{EVAL_DATE}.json')
    summary_path = os.path.join(RESULTS_DIR, f'evaluation_summary_{EVAL_DATE}.json')

    # Save all results to JSON
    with open(results_path, 'w') as f:
        json.dump(all_results, f, indent=4, default=_to_builtin)

    # Save summary to JSON
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=4, default=_to_builtin)

    # Print summary
    print("\nEvaluation Summary:")

    if 'average_score' in summary['overall_performance']:
        print(f"Overall Performance Score: {summary['overall_performance']['average_score']:.2f}")

    print("\nAreas of Strength:")
    for strength in summary['areas_of_strength']:
        print(f"- {strength}")

    print("\nAreas for Improvement:")
    for improvement in summary['areas_for_improvement']:
        print(f"- {improvement}")

    print("\nRecommendations:")
    for recommendation in summary['recommendations']:
        print(f"- {recommendation}")

    print(f"\nDetailed results saved to: {results_path}")
    print(f"Summary saved to: {summary_path}")

    return summary

# Generate summary report
# Writes the results/summary JSON files into RESULTS_DIR and prints the summary.
summary_report = generate_summary_report()

print("\n" + "=" * 50)
print("AI MODEL EVALUATION COMPLETE")
print("=" * 50)

In [None]:
def track_performance_over_time():
    """Plot how key evaluation metrics changed across saved evaluation runs.

    Reads every ``evaluation_results_*.json`` file in ``RESULTS_DIR``, builds one
    data point per file and saves a line chart to
    ``RESULTS_DIR/performance_trends.png``.

    Files that cannot be read or parsed are reported and skipped. A metric that
    is absent from a run is simply left out for that run instead of discarding
    the whole run. When the payload carries no ``evaluation_date`` the date
    embedded in the file name is used. Nothing is plotted (a message is printed
    instead) when fewer than two runs could be loaded or when none of the
    tracked metrics is present.

    Returns:
        None
    """
    prefix, suffix = 'evaluation_results_', '.json'

    # (section in the results file, metric key inside it, column name in the chart)
    tracked_metrics = (
        ('intent_recognition', 'accuracy', 'intent_accuracy'),
        ('entity_extraction', 'f1', 'entity_f1'),
        ('sentiment_analysis', 'accuracy', 'sentiment_accuracy'),
        ('recommendation', 'accuracy', 'recommendation_accuracy'),
    )

    # Find all evaluation result files
    result_files = [
        name for name in os.listdir(RESULTS_DIR)
        if name.startswith(prefix) and name.endswith(suffix)
    ]

    if len(result_files) <= 1:
        print("Not enough historical data to plot performance trends.")
        return

    # Load all evaluation results
    performance_history = []

    for result_file in result_files:
        try:
            with open(os.path.join(RESULTS_DIR, result_file), 'r') as f:
                results = json.load(f)

            # Fall back to the date encoded in the file name if the payload has none
            data_point = {
                'date': results.get('evaluation_date') or result_file[len(prefix):-len(suffix)]
            }

            # Extract key metrics; empty/None sections and missing keys are skipped
            for section_name, metric_key, column in tracked_metrics:
                section = results.get(section_name)
                if section and section.get(metric_key) is not None:
                    data_point[column] = section[metric_key]

            performance_history.append(data_point)
        except Exception as e:
            # Best effort: one unreadable file must not prevent plotting the rest
            print(f"Error loading results from {result_file}: {e}")

    # Re-check after loading: unreadable files may have left too few points
    if len(performance_history) <= 1:
        print("Not enough historical data to plot performance trends.")
        return

    # Sort by date (compared as text so mixed value types cannot break the sort)
    performance_history.sort(key=lambda point: str(point['date']))

    # Convert to DataFrame; metrics missing from a run become NaN
    history_df = pd.DataFrame(performance_history)
    metrics = [col for col in history_df.columns if col != 'date']

    if not metrics:
        print("Not enough historical data to plot performance trends.")
        return

    # Plot performance trends
    fig, ax = plt.subplots(figsize=(12, 6))

    for metric in metrics:
        ax.plot(history_df['date'], history_df[metric], marker='o', label=metric)

    ax.set_title('Model Performance Trends Over Time')
    ax.set_xlabel('Evaluation Date')
    ax.set_ylabel('Performance Score')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    trends_path = os.path.join(RESULTS_DIR, 'performance_trends.png')
    fig.savefig(trends_path)

    print(f"Performance trends plotted and saved to: {trends_path}")

# Plot the historical trend chart. Any failure is reported and swallowed so the
# remaining cells of the notebook can still run.
try:
    track_performance_over_time()
except Exception as trend_error:
    print(f"Error tracking performance over time: {trend_error}")

In [None]:
def _percentage_difference(reference, observed):
    """Return ``|reference - observed| / reference * 100``, or None if undefined.

    The result is None when either value is missing or when the reference is
    zero, because a relative difference cannot be computed in those cases.
    """
    if reference is None or observed is None or reference == 0:
        return None
    return abs(reference - observed) / reference * 100


def evaluate_financial_accuracy():
    """Compare chatbot stock prices and loan EMIs against reference values.

    For a fixed list of stock tickers the value from ``get_nse_price`` is
    compared with the value from ``get_chatbot_stock_price``; for a fixed list
    of loan scenarios the value from ``calculate_loan_emi`` is compared with
    the value from ``get_chatbot_loan_emi``.

    Returns:
        dict: ``{'stock_price_accuracy': [...], 'loan_calculation_accuracy': [...]}``.
        Every entry holds the two compared values and ``diff_pct``, the
        percentage difference relative to the reference value. ``diff_pct`` is
        None when a value is missing or the reference is zero, so one bad
        data point no longer aborts the whole evaluation.
    """
    # Test stock price accuracy
    nse_stocks = ['SCOM', 'KCB', 'EQTY', 'COOP', 'BAT']
    price_accuracy = []

    for stock in nse_stocks:
        # Reference price (per the original notes: fetched from the NSE API)
        actual_price = get_nse_price(stock)
        # Price reported by the chatbot
        chatbot_price = get_chatbot_stock_price(stock)
        price_accuracy.append({
            'stock': stock,
            'actual_price': actual_price,
            'chatbot_price': chatbot_price,
            'diff_pct': _percentage_difference(actual_price, chatbot_price)
        })

    # Test loan calculation accuracy
    # NOTE(review): units are not visible here - presumably term is in months
    # and rate is an annual percentage; confirm against calculate_loan_emi.
    loan_scenarios = [
        {'amount': 50000, 'term': 12, 'rate': 14},
        {'amount': 100000, 'term': 24, 'rate': 13},
        {'amount': 500000, 'term': 36, 'rate': 12.5}
    ]

    loan_accuracy = []
    for scenario in loan_scenarios:
        # Expected EMI from the reference calculation
        expected_emi = calculate_loan_emi(scenario['amount'], scenario['term'], scenario['rate'])
        # EMI reported by the chatbot for the same scenario
        chatbot_emi = get_chatbot_loan_emi(scenario)
        loan_accuracy.append({
            'scenario': scenario,
            'expected_emi': expected_emi,
            'chatbot_emi': chatbot_emi,
            'diff_pct': _percentage_difference(expected_emi, chatbot_emi)
        })

    return {'stock_price_accuracy': price_accuracy, 'loan_calculation_accuracy': loan_accuracy}

In [None]:
def evaluate_user_experience_metrics():
    """Compute engagement and satisfaction metrics from the interaction log.

    Reads ``user_interaction_logs.csv`` from ``DATA_DIR``. The columns used are
    ``session_id``, ``session_completed``, ``satisfaction_score``, ``task_id``
    and ``task_completed``.

    Returns:
        dict with
        - ``completion_rate``: distinct sessions having a completed row divided
          by distinct sessions.
        - ``avg_satisfaction``: mean of the non-null satisfaction scores.
        - ``task_success_rate``: among rows that carry a ``task_id``, the share
          whose ``task_completed`` is True. Rows without a task id are ignored
          on both sides of the ratio, so the rate cannot exceed 1.
        - ``avg_queries_per_session``: mean number of log rows per session.
        Every metric falls back to 0 when there is no data to compute it from.
    """
    # Load user interaction logs
    interaction_logs = pd.read_csv(os.path.join(DATA_DIR, 'user_interaction_logs.csv'))

    # Session completion rate (dropna=False keeps the count equal to len(unique()))
    session_ids = interaction_logs['session_id']
    total_sessions = session_ids.nunique(dropna=False)
    completed_rows = interaction_logs['session_completed'].eq(True)
    completed_sessions = session_ids[completed_rows].nunique(dropna=False)
    completion_rate = completed_sessions / total_sessions if total_sessions > 0 else 0

    # User satisfaction from feedback
    satisfaction_scores = interaction_logs['satisfaction_score'].dropna()
    avg_satisfaction = satisfaction_scores.mean() if len(satisfaction_scores) > 0 else 0

    # Task success rate: numerator and denominator use the same set of rows
    attempted_rows = interaction_logs[interaction_logs['task_id'].notna()]
    tasks_attempted = len(attempted_rows)
    tasks_completed = int(attempted_rows['task_completed'].eq(True).sum())
    task_success_rate = tasks_completed / tasks_attempted if tasks_attempted > 0 else 0

    # Average queries per session (mean of an empty series would be NaN)
    queries_per_session = interaction_logs.groupby('session_id').size()
    avg_queries_per_session = queries_per_session.mean() if len(queries_per_session) > 0 else 0

    return {
        'completion_rate': completion_rate,
        'avg_satisfaction': avg_satisfaction,
        'task_success_rate': task_success_rate,
        'avg_queries_per_session': avg_queries_per_session
    }

In [None]:
def evaluate_multilingual_support():
    """Evaluate intent recognition separately for English and Swahili.

    For each language code the test set ``intent_test_data_<lang>.csv`` is read
    from ``DATA_DIR`` (columns ``text`` and ``intent``), the module-level
    ``intent_model`` is switched to that language, and every query is
    classified. Accuracy and weighted F1 are computed per language and drawn
    as a grouped bar chart saved to
    ``RESULTS_DIR/multilingual_performance_<EVAL_DATE>.png``.

    Returns:
        dict: ``{lang: {'accuracy': ..., 'f1': ...}}`` for ``'en'`` and ``'sw'``.
    """
    languages = ['en', 'sw']
    languages_display = {'en': 'English', 'sw': 'Swahili'}
    metrics = ['accuracy', 'f1']
    results = {}

    for lang in languages:
        # Load language-specific test data
        test_data = pd.read_csv(os.path.join(DATA_DIR, f'intent_test_data_{lang}.csv'))
        queries = test_data['text'].tolist()
        y_true = test_data['intent'].tolist()

        # NOTE(review): this changes state on the shared model and is not undone,
        # so the model presumably stays in the last language ('sw') afterwards -
        # confirm whether later cells depend on the language setting.
        intent_model.set_language(lang)

        # Predict one intent per query
        y_pred = [intent_model.predict(query) for query in queries]

        results[lang] = {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred, average='weighted')
        }

    # Plot comparison on a single figure (no stray empty figure is created)
    fig, ax = plt.subplots(figsize=(10, 6))

    x = np.arange(len(languages))
    width = 0.35

    for i, metric in enumerate(metrics):
        values = [results[lang][metric] for lang in languages]
        ax.bar(x + i*width, values, width, label=metric)

    ax.set_xlabel('Language')
    ax.set_ylabel('Score')
    ax.set_title('Performance Comparison by Language')
    # Centre each tick under its group of bars (x + width/2 for two metrics)
    ax.set_xticks(x + width * (len(metrics) - 1) / 2)
    # Labels follow the order of `languages`, i.e. the order the bars were drawn in
    ax.set_xticklabels([languages_display[lang] for lang in languages])
    ax.legend()
    ax.set_ylim(0, 1)

    fig.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, f'multilingual_performance_{EVAL_DATE}.png'))

    return results

In [None]:
def implement_model_improvements(evaluation_results, intent_threshold=0.90, entity_threshold=0.85):
    """Retrain the models whose evaluation score is below its threshold.

    Args:
        evaluation_results: Mapping of evaluation area to its metrics. The
            entries read are ``['intent_recognition']['accuracy']`` and
            ``['entity_extraction']['f1']``.
        intent_threshold: Intent accuracy below this value triggers retraining
            of the intent model (default 0.90).
        entity_threshold: Entity F1 below this value triggers retraining of the
            entity model (default 0.85).

    Returns:
        list: Names of the areas for which the improvement steps were run, in
        the order they were processed.

    An area whose section or score is missing (for example because that
    evaluation did not run) is reported and skipped instead of raising.
    """
    # (area, metric key, minimum acceptable score) in processing order
    checks = (
        ('intent_recognition', 'accuracy', intent_threshold),
        ('entity_extraction', 'f1', entity_threshold),
    )

    # Identify areas needing improvement before changing anything
    improvement_needed = []
    for area, metric_key, threshold in checks:
        section = evaluation_results.get(area) if evaluation_results else None
        score = section.get(metric_key) if isinstance(section, dict) else None

        if score is None:
            print(f"Skipping {area}: no '{metric_key}' score available.")
            continue

        if score < threshold:
            improvement_needed.append(area)

    # Implement improvements for each area
    for area in improvement_needed:
        if area == 'intent_recognition':
            # Extract misclassified examples
            misclassified = get_misclassified_intents()
            # Add to training data with correct labels
            update_intent_training_data(misclassified)
            # Retrain model
            retrain_intent_model()

        elif area == 'entity_extraction':
            # Extract missed entities
            missed_entities = get_missed_entities()
            # Add to training data
            update_entity_training_data(missed_entities)
            # Retrain model
            retrain_entity_model()

    return improvement_needed

In [None]:
def create_evaluation_dashboard():
    """Write a static HTML dashboard skeleton for the current evaluation run.

    The page is saved as ``RESULTS_DIR/evaluation_dashboard_<EVAL_DATE>.html``.
    It contains the title, the evaluation date and the styling; the metrics,
    charts and recommendations sections currently hold placeholder comments
    only - no evaluation results are inserted by this function.

    The file is written as UTF-8 and the page declares the same charset, so the
    browser decodes it the same way on every platform regardless of the
    system's default encoding.

    Returns:
        None
    """
    dashboard_path = os.path.join(RESULTS_DIR, f'evaluation_dashboard_{EVAL_DATE}.html')

    # Literal CSS braces are doubled because this is an f-string
    dashboard_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>PesaGuru AI Model Evaluation - {EVAL_DATE}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .metric-card {{ border: 1px solid #ddd; border-radius: 8px; padding: 15px; margin: 10px; display: inline-block; width: 200px; }}
            .metric-value {{ font-size: 24px; font-weight: bold; }}
            .metric-name {{ font-size: 14px; color: #666; }}
            .metric-status {{ font-size: 12px; margin-top: 5px; }}
            .met {{ color: green; }}
            .not-met {{ color: red; }}
            .charts {{ display: flex; flex-wrap: wrap; margin-top: 20px; }}
            .chart {{ margin: 10px; border: 1px solid #ddd; border-radius: 8px; padding: 10px; }}
        </style>
    </head>
    <body>
        <h1>PesaGuru AI Model Evaluation Dashboard</h1>
        <p>Evaluation Date: {EVAL_DATE}</p>

        <h2>Performance Metrics</h2>
        <div class="metrics-container">
            <!-- Metrics will be inserted here -->
        </div>

        <h2>Visualizations</h2>
        <div class="charts">
            <!-- Charts will be inserted here -->
        </div>

        <h2>Recommendations</h2>
        <ul>
            <!-- Recommendations will be inserted here -->
        </ul>
    </body>
    </html>
    """

    with open(dashboard_path, 'w', encoding='utf-8') as f:
        f.write(dashboard_html)

    print(f"Dashboard created at: {dashboard_path}")