In [1]:
import json
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Dict, Any, Tuple
import torch
from tqdm import tqdm

In [2]:
# Seed NumPy's global RNG and PyTorch's default generator with one shared value
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2139e617650>

In [3]:
class EmbeddingModel:
    """Uniform ``encode`` interface over two embedding backends.

    Supported ``model_type`` values:
      * ``'sentence_transformers'`` - the model is loaded with ``SentenceTransformer(name)``.
      * ``'transformers'`` - a ``"feature-extraction"`` pipeline is built for ``name``.
    """

    # Backends that __init__ knows how to load.
    SUPPORTED_MODEL_TYPES = ('sentence_transformers', 'transformers')

    def __init__(self, name: str, model_type: str = 'sentence_transformers'):
        """Load the model ``name`` using the backend selected by ``model_type``.

        Args:
            name: Model identifier handed to the backend loader.
            model_type: One of ``SUPPORTED_MODEL_TYPES``.

        Raises:
            ValueError: If ``model_type`` is not a supported backend. Without this check
                the instance would be created with no ``model`` attribute and only fail
                later, inside ``encode``.
        """
        if model_type not in self.SUPPORTED_MODEL_TYPES:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {self.SUPPORTED_MODEL_TYPES}"
            )

        self.name = name
        self.model_type = model_type
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        if model_type == 'sentence_transformers':
            self.model = SentenceTransformer(name)
            self.model.to(self.device)
        else:  # 'transformers'
            self.model = pipeline(
                "feature-extraction",
                model=name,
                device=0 if self.device == 'cuda' else -1  # Use -1 for CPU
            )

    def encode(self, text: str) -> np.ndarray:
        """Return the embedding of ``text`` as a NumPy array.

        Args:
            text: The string to embed.

        Returns:
            The embedding produced by the configured backend.
        """
        if self.model_type == 'sentence_transformers':
            return self.model.encode(text, convert_to_numpy=True)

        # transformers backend: index [0][0] selects the first token vector of the first
        # sequence (intended as the [CLS] embedding of the last hidden state).
        # NOTE(review): assumes the pipeline output is indexed (batch, token, hidden) — confirm.
        # NOTE(review): no truncation is requested here; confirm inputs stay within the
        # model's maximum sequence length.
        embedding = self.model(text, return_tensors="pt")[0][0].detach().cpu().numpy()
        return embedding

def load_test_data(test_dir: str) -> List[Dict[str, Any]]:
    """Load and concatenate the test cases from every ``*_test.json`` file in ``test_dir``.

    Files are read in sorted filename order so that the returned list (and everything
    derived from it) has a stable order across runs and platforms; ``os.listdir`` alone
    returns entries in arbitrary order.

    Args:
        test_dir: Directory containing the test files.

    Returns:
        A flat list with the ``'test_cases'`` entries of all matching files.

    Raises:
        FileNotFoundError: If ``test_dir`` does not exist.
        KeyError: If a matching file has no top-level ``'test_cases'`` key.
    """
    if not os.path.exists(test_dir):
        raise FileNotFoundError(f"Test data directory not found at: {test_dir}")

    test_data: List[Dict[str, Any]] = []
    for file in sorted(os.listdir(test_dir)):
        if not file.endswith('_test.json'):
            continue
        print(f"Loading {file}...")
        with open(os.path.join(test_dir, file), 'r', encoding='utf-8') as f:
            test_data.extend(json.load(f)['test_cases'])
    return test_data

def _cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    """Cosine similarity of two vectors; 0.0 when either vector has zero norm."""
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Avoid a division by zero (which would yield NaN plus a runtime warning).
        return 0.0
    return np.dot(vec_a, vec_b) / denominator


def create_evaluation_df(test_data: List[Dict[str, Any]], qa_service, models: Dict[str, EmbeddingModel],
                         threshold: float = 0.85) -> pd.DataFrame:
    """Run the QA service on every test conversation and score the answers per model.

    For each conversation the generated answer and the reference answer are embedded
    with every model in ``models``; their cosine similarity is compared to ``threshold``
    to decide whether the answer counts as correct for that model.

    Args:
        test_data: Test cases, each with a ``'user_info'`` dict and a ``'conversations'``
            list whose items carry ``'question'`` and ``'answer'``.
        qa_service: Object exposing ``get_answer(user_info, question)``.
        models: Mapping of model name to an object exposing ``encode(text)``.
        threshold: Minimum cosine similarity for an answer to be flagged correct.
            Defaults to 0.85, the value previously hard-coded.

    Returns:
        One row per conversation with the user fields, question, both answers and, for
        each model, ``embedding_<name>``, ``similarity_<name>`` and ``is_correct_<name>``.
    """
    rows = []

    for test_case in tqdm(test_data, desc="Processing test cases"):
        user_info = test_case['user_info']
        for conv in test_case['conversations']:
            # The answer is generated once and then scored by every model.
            generated_answer = qa_service.get_answer(user_info, conv['question'])

            embeddings = {}
            similarities = {}
            is_correct = {}

            for model_name, model in models.items():
                ground_truth_embedding = model.encode(conv['answer'])
                generated_embedding = model.encode(generated_answer)

                cosine_sim = _cosine_similarity(ground_truth_embedding, generated_embedding)

                embeddings[model_name] = {
                    'ground_truth': ground_truth_embedding,
                    'generated': generated_embedding
                }
                similarities[model_name] = cosine_sim
                is_correct[model_name] = bool(cosine_sim >= threshold)

            row = {
                'id_number': user_info['id_number'],
                'gender': user_info['gender'],
                'age': user_info['age'],
                'hmo_name': user_info['hmo_name'],
                'membership_tier': user_info['membership_tier'],
                'question': conv['question'],
                'ground_truth_answer': conv['answer'],
                'generated_answer': generated_answer,
                **{f'embedding_{model_name}': emb for model_name, emb in embeddings.items()},
                **{f'similarity_{model_name}': sim for model_name, sim in similarities.items()},
                **{f'is_correct_{model_name}': corr for model_name, corr in is_correct.items()}
            }
            rows.append(row)

    return pd.DataFrame(rows)

def calculate_metrics(df: pd.DataFrame, models: Dict[str, EmbeddingModel]) -> Dict[str, Dict[str, float]]:
    """Aggregate the ``is_correct_<model>`` flags of ``df`` into per-model metrics.

    Every score compares the model's correctness flags (passed as ``y_true``) against an
    all-``True`` vector (passed as ``y_pred``). Consequently ``precision`` always equals
    ``overall_accuracy`` (the share of rows flagged correct) and ``recall`` is 1.0
    whenever at least one row is flagged correct; they carry no independent information.

    Keys produced for each model:
      * ``overall_accuracy``, ``precision``, ``recall``
      * ``accuracy_<question>`` for every distinct question
      * ``accuracy_hmo_<hmo_name>`` for every distinct HMO
      * ``accuracy_tier_<membership_tier>`` for every distinct tier

    The ``hmo_`` / ``tier_`` prefixes keep the three groupings from overwriting one
    another and let ``generate_report`` (which selects keys containing ``hmo`` / ``tier``)
    tell them apart.

    Args:
        df: Evaluation frame with ``question``, ``hmo_name``, ``membership_tier`` and one
            ``is_correct_<model>`` column per model.
        models: Mapping whose keys are the model names to report on.

    Returns:
        ``{model_name: {metric_name: value}}``.
    """
    # (column to group by, key prefix for the resulting accuracy entries)
    groupings = (
        ('question', 'accuracy_'),
        ('hmo_name', 'accuracy_hmo_'),
        ('membership_tier', 'accuracy_tier_'),
    )

    metrics = {}
    for model_name in models.keys():
        outcomes = df[f'is_correct_{model_name}']
        all_true = [True] * len(df)

        model_metrics = {
            'overall_accuracy': accuracy_score(outcomes, all_true),
            'precision': precision_score(outcomes, all_true),
            'recall': recall_score(outcomes, all_true)
        }

        for column, key_prefix in groupings:
            for value in df[column].unique():
                subset = outcomes[df[column] == value]
                model_metrics[f'{key_prefix}{value}'] = accuracy_score(
                    subset, [True] * len(subset)
                )

        metrics[model_name] = model_metrics

    return metrics

def _save_accuracy_barplot(categories, accuracies, title: str, out_path: str,
                           figsize: Tuple[int, int], rotate_labels: bool = False) -> None:
    """Draw one bar per category and save the figure to ``out_path``."""
    plt.figure(figsize=figsize)
    sns.barplot(x=categories, y=accuracies)
    if rotate_labels:
        plt.xticks(rotation=45)
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()  # release the figure so repeated calls do not accumulate open figures


def create_visualizations(df: pd.DataFrame, models: Dict[str, EmbeddingModel], output_dir: str):
    """Create and save the evaluation plots for every model.

    For each model a sub-directory ``<output_dir>/<model_name>`` receives
    ``accuracy_by_service.png``, ``accuracy_by_hmo.png``, ``accuracy_by_tier.png`` and
    ``confusion_matrix.png``; ``<output_dir>/model_comparison.png`` compares all models.
    A model name containing a path separator yields nested directories.

    Args:
        df: Evaluation frame with ``question``, ``hmo_name``, ``membership_tier`` and one
            ``is_correct_<model>`` column per model.
        models: Mapping whose keys are the model names to plot.
        output_dir: Root directory for the images; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (group-by column, title prefix, file name, figure size, rotate x labels)
    grouped_plots = (
        ('question', 'Accuracy by Service', 'accuracy_by_service.png', (12, 6), True),
        ('hmo_name', 'Accuracy by HMO', 'accuracy_by_hmo.png', (10, 6), False),
        ('membership_tier', 'Accuracy by Membership Tier', 'accuracy_by_tier.png', (10, 6), False),
    )

    for model_name in models.keys():
        model_dir = os.path.join(output_dir, model_name)
        os.makedirs(model_dir, exist_ok=True)
        correct_col = f'is_correct_{model_name}'

        # 1-3. Mean correctness per question / HMO / tier
        for group_col, title_prefix, file_name, figsize, rotate in grouped_plots:
            group_acc = df.groupby(group_col)[correct_col].mean()
            _save_accuracy_barplot(
                group_acc.index,
                group_acc.values,
                f'{title_prefix} - {model_name}',
                os.path.join(model_dir, file_name),
                figsize,
                rotate_labels=rotate,
            )

        # 4. Confusion matrix. Labels are fixed so the matrix is always 2x2, even when
        # every row falls in one class. The "predicted" vector is all True, so only the
        # 'True' column can hold non-zero counts.
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(df[correct_col], [True] * len(df), labels=[False, True])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['False', 'True'], yticklabels=['False', 'True'])
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.tight_layout()
        plt.savefig(os.path.join(model_dir, 'confusion_matrix.png'))
        plt.close()

    # 5. Model comparison plot
    model_names = list(models.keys())
    model_accuracies = [df[f'is_correct_{model_name}'].mean() for model_name in model_names]
    _save_accuracy_barplot(
        model_names,
        model_accuracies,
        'Overall Accuracy by Model',
        os.path.join(output_dir, 'model_comparison.png'),
        (12, 6),
        rotate_labels=True,
    )

def _format_accuracy(value: Any) -> str:
    """Format ``value`` as a percentage with two decimals, or ``'N/A'`` if it is not numeric."""
    try:
        return f"{value:.2%}"
    except (TypeError, ValueError):
        # Missing entries (None) and non-numeric placeholders cannot take the '%' format.
        return 'N/A'


def _accuracy_section(metrics: Dict[str, Dict[str, float]], heading: str,
                      entries: List[Tuple[str, str]]) -> str:
    """Render one report section: ``heading`` followed by a per-model block for each entry.

    ``entries`` holds ``(display label, metrics key)`` pairs.
    """
    parts = [heading]
    for label, key in entries:
        parts.append(f"\n{label}:\n")
        for model_name, model_metrics in metrics.items():
            parts.append(f"{model_name}: {_format_accuracy(model_metrics.get(key))}\n")
    return ''.join(parts)


def generate_report(metrics: Dict[str, Dict[str, float]], output_dir: str):
    """Write ``evaluation_report.txt`` comparing all models into ``output_dir``.

    Metric keys are classified by prefix: ``accuracy_hmo_<name>`` goes to the HMO
    section, ``accuracy_tier_<name>`` to the tier section, and every other
    ``accuracy_<name>`` key to the service section. Entries are listed in sorted key
    order so the report is identical across runs. A model that lacks a given key is
    reported as ``N/A`` instead of raising while formatting.

    Args:
        metrics: ``{model_name: {metric_name: value}}``; every model must provide
            ``overall_accuracy``, ``precision`` and ``recall``.
        output_dir: Directory for the report file; created if missing.

    Raises:
        KeyError: If a model lacks one of the three overall metrics.
    """
    os.makedirs(output_dir, exist_ok=True)

    base_prefix, hmo_prefix, tier_prefix = 'accuracy_', 'accuracy_hmo_', 'accuracy_tier_'
    accuracy_keys = sorted({
        key
        for model_metrics in metrics.values()
        for key in model_metrics
        if key.startswith(base_prefix)
    })
    hmo_entries = [(key[len(hmo_prefix):], key) for key in accuracy_keys if key.startswith(hmo_prefix)]
    tier_entries = [(key[len(tier_prefix):], key) for key in accuracy_keys if key.startswith(tier_prefix)]
    service_entries = [
        (key[len(base_prefix):], key)
        for key in accuracy_keys
        if not key.startswith((hmo_prefix, tier_prefix))
    ]

    parts = ["QA Service Evaluation Report - Model Comparison\n==========================================\n\n"]

    # Overall metrics comparison
    parts.append("Overall Metrics by Model:\n----------------------\n")
    for model_name, model_metrics in metrics.items():
        parts.append(f"\n{model_name}:\n")
        parts.append(f"Accuracy: {model_metrics['overall_accuracy']:.2%}\n")
        parts.append(f"Precision: {model_metrics['precision']:.2%}\n")
        parts.append(f"Recall: {model_metrics['recall']:.2%}\n")

    parts.append(_accuracy_section(metrics, "\nAccuracy by Service:\n------------------\n", service_entries))
    parts.append(_accuracy_section(metrics, "\nAccuracy by HMO:\n--------------\n", hmo_entries))
    parts.append(_accuracy_section(metrics, "\nAccuracy by Tier:\n---------------\n", tier_entries))

    with open(os.path.join(output_dir, 'evaluation_report.txt'), 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

In [18]:
    # (model identifier, backend) pairs; the backend string is passed to EmbeddingModel
    # as model_type when the models are initialised.
    # NOTE(review): the metrics output recorded in this notebook lists 'all-MiniLM-L6-v2'
    # and 'all-mpnet-base-v2', which are not in this list — the saved outputs appear to
    # come from an earlier version of this cell; re-run before relying on them.
    # NOTE(review): confirm the first two identifiers resolve as intended
    # ('...-v32' suffix; second id has no organisation prefix) — TODO verify.
    models = [
        ('jinaai/jina-embeddings-v32', 'sentence_transformers'),
        ('multilingual-e5-large-instruct', 'sentence_transformers'),
        ('ncbi/MedCPT-Query-Encoder', 'transformers'),
        ('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'transformers'),
        ('emilyalsentzer/Bio_ClinicalBERT', 'transformers')
    ]

In [19]:
# Build one EmbeddingModel per configured (name, backend) pair, keyed by model name
embedding_models = dict(
    (name, EmbeddingModel(name, model_type)) for name, model_type in models
)

Device set to use cpu
Device set to use cpu
Device set to use cpu


In [20]:
# Read the test cases from the 'test_data' directory and report how many were loaded
test_data = load_test_data(test_dir='test_data')
print(f"Loaded {len(test_data)} test cases")

Loading alternative_services_english_test.json...
Loading alternative_services_hebrew_test.json...
Loading communication_clinic_services_english_test.json...
Loading communication_clinic_services_hebrew_test.json...
Loading dental_services_english_test.json...
Loading dental_services_hebrew_test.json...
Loading optometry_services_english_test.json...
Loading optometry_services_hebrew_test.json...
Loading pregnancy_services_english_test.json...
Loading pregnancy_services_hebrew_test.json...
Loading workshops_services_english_test.json...
Loading workshops_services_hebrew_test.json...
Loaded 36 test cases


In [18]:
import sys
import os

# Resolve the directory one level above the current working directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put it on the import path once (re-running the cell does not add it again)
if not (parent_dir in sys.path):
    sys.path.append(parent_dir)

# The `core` package is importable only after the path tweak above
from core.qa_service import QAService

qa_service = QAService()

In [21]:
import json

# Pretty-print the service database; this assumes qa_service.services_db is
# JSON-serialisable (e.g. a dict). ensure_ascii=False leaves non-ASCII text unescaped.
services_db_json = json.dumps(qa_service.services_db, indent=2, ensure_ascii=False)
print(services_db_json)

{
  "מכבי": {
    "זהב": {
      "דיקור סיני (אקופונקטורה)": "70% הנחה, עד 20 טיפולים בשנה",
      "שיאצו": "65% הנחה, עד 15 טיפולים בשנה",
      "רפלקסולוגיה": "60% הנחה, עד 12 טיפולים בשנה",
      "נטורופתיה": "70% הנחה, עד 16 טיפולים בשנה",
      "הומאופתיה": "65% הנחה, עד 12 טיפולים בשנה",
      "כירופרקטיקה": "75% הנחה, עד 18 טיפולים בשנה",
      "אבחון הפרעות שפה ודיבור": "90% הנחה, כולל דוח מפורט",
      "טיפול בגמגום": "80% הנחה, עד 30 טיפולים בשנה",
      "טיפול בהפרעות קול": "75% הנחה, עד 20 טיפולים בשנה",
      "אבחון וטיפול בהפרעות בליעה": "85% הנחה, כולל בדיקת וידאופלורוסקופיה",
      "טיפול בעיכוב התפתחותי": "90% הנחה, עד 40 טיפולים בשנה",
      "שיקום שמיעה": "80% הנחה, כולל התאמת מכשירי שמיעה",
      "בדיקות וניקוי שיניים": "חינם פעמיים בשנה, תור תוך 48 שעות",
      "סתימות": "80% הנחה, חומרים מתקדמים",
      "טיפולי שורש": "70% הנחה, כולל צילומי רנטגן",
      "כתרים ושתלים": "60% הנחה, אחריות ל-5 שנים",
      "יישור שיניים": "50% הנחה, כולל רטנציה",
      "טיפולים קוסמ

In [None]:
# Generate an answer for every test conversation and score it with each embedding model
df = create_evaluation_df(test_data=test_data, qa_service=qa_service, models=embedding_models)
print(f"Created DataFrame with {len(df)} rows")

In [25]:
# Number of rows whose generated answer contains the text "I apologize"
# (case-sensitive; missing answers are treated as non-matching)
has_apology = df['generated_answer'].str.contains('I apologize', case=True, na=False)
count = len(df[has_apology])

In [26]:
# Display the number of generated answers that contain "I apologize"
count

0

In [27]:
# Aggregate the per-row correctness flags into per-model metrics
metrics = calculate_metrics(df=df, models=embedding_models)

In [28]:
# Print the three headline metrics for each model, one block per model
for model_name, model_metrics in metrics.items():
    print(
        f"\n{model_name}:",
        f"Accuracy: {model_metrics['overall_accuracy']:.2%}",
        f"Precision: {model_metrics['precision']:.2%}",
        f"Recall: {model_metrics['recall']:.2%}",
        sep="\n",
    )


all-MiniLM-L6-v2:
Accuracy: 32.43%
Precision: 32.43%
Recall: 100.00%

all-mpnet-base-v2:
Accuracy: 36.94%
Precision: 36.94%
Recall: 100.00%

ncbi/MedCPT-Query-Encoder:
Accuracy: 61.26%
Precision: 61.26%
Recall: 100.00%

microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract:
Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%

emilyalsentzer/Bio_ClinicalBERT:
Accuracy: 9.46%
Precision: 9.46%
Recall: 100.00%


In [29]:
# Render and save all plots under ./evaluation_results
create_visualizations(df=df, models=embedding_models, output_dir='evaluation_results')

In [30]:
# Write the text report into ./evaluation_results
generate_report(metrics=metrics, output_dir='evaluation_results')

In [31]:
# Persist the full evaluation DataFrame as a pickle in the working directory
results_pickle_path = 'evaluation_results.pkl'
df.to_pickle(results_pickle_path)
print("Results saved to evaluation_results.pkl")

Results saved to evaluation_results.pkl


In [34]:
# Export every column as a list of JSON records
# (a later cell writes a reduced export to this same path)
df.to_json(path_or_buf='evaluation_results.json', orient='records')

In [35]:
# List the column names of the evaluation DataFrame
df.columns

Index(['id_number', 'gender', 'age', 'hmo_name', 'membership_tier', 'question',
       'ground_truth_answer', 'generated_answer', 'embedding_all-MiniLM-L6-v2',
       'embedding_all-mpnet-base-v2', 'embedding_ncbi/MedCPT-Query-Encoder',
       'embedding_microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
       'embedding_emilyalsentzer/Bio_ClinicalBERT',
       'similarity_all-MiniLM-L6-v2', 'similarity_all-mpnet-base-v2',
       'similarity_ncbi/MedCPT-Query-Encoder',
       'similarity_microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
       'similarity_emilyalsentzer/Bio_ClinicalBERT',
       'is_correct_all-MiniLM-L6-v2', 'is_correct_all-mpnet-base-v2',
       'is_correct_ncbi/MedCPT-Query-Encoder',
       'is_correct_microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
       'is_correct_emilyalsentzer/Bio_ClinicalBERT'],
      dtype='object')

In [38]:
# Columns for the human-readable export: user attributes first, then the Q/A texts
user_info_cols = ['gender', 'age', 'hmo_name', 'membership_tier']
question_col, ground_truth_col, generated_col = 'question', 'ground_truth_answer', 'generated_answer'
export_columns = [*user_info_cols, question_col, ground_truth_col, generated_col]

# force_ascii=False writes Hebrew characters as-is instead of \u escapes
df.loc[:, export_columns].to_json(
    'evaluation_results.json',
    orient='records',
    force_ascii=False,
    indent=4,
)

In [39]:
# Similarity column to rank by (the variable name says "embedding", but it holds a similarity score)
embedding_column = 'similarity_microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract'

# Order all rows from least to most similar
lowest_scores_df = df.sort_values(by=embedding_column, ascending=True)

# Preview the ten least similar question/answer pairs
preview_columns = ['question', 'ground_truth_answer', 'generated_answer', embedding_column]
lowest_scores_df[preview_columns].head(10)

Unnamed: 0,question,ground_truth_answer,generated_answer,similarity_microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract
156,What are the benefits in pregnancy follow-up?,"As a Meuhedet Gold member, you are entitled to...","Hi Michal,\n\nAs a member of Meuhedet Gold, yo...",0.887475
191,What are the benefits in the pregnancy and chi...,"As a Maccabi Gold member, you are entitled to ...","Hi Dana,\n\nBased on your Maccabi Gold members...",0.89606
197,What are the benefits in the pregnancy and chi...,"As a Meuhedet Gold member, you are entitled to...","Hi Michal,\n\nBased on your Meuhedet Gold memb...",0.900637
203,What are the benefits in the pregnancy and chi...,"As a Clalit Gold member, you are entitled to a...","Hi Shira,\n\nBased on your Clalit Gold members...",0.901598
162,What are the benefits in pregnancy follow-up?,"As a Clalit Gold member, you are entitled to f...","Hi Shira,\n\nFor pregnancy follow-up, you have...",0.906203
150,What are the benefits in pregnancy follow-up?,"As a Maccabi Gold member, you are entitled to ...","Hi Dana,\n\nFor pregnancy follow-up, you have ...",0.91865
196,What are the benefits in the diabetes workshop?,"As a Meuhedet Gold member, you are entitled to...","Hi Michal,\n\nAs a member of Meuhedet Gold, yo...",0.926977
180,מה ההטבות במעקב הריון?,"כמבוטחת כללית במסלול זהב, את זכאית למעקב הריון...","שלום שירה,\n\nאני רואה שאת מבוטחת בכללית בתוכנ...",0.92867
73,How long is the wait time for dental check-ups...,"As a Maccabi Gold member, wait time for dental...","Hi Dana,\n\nFor dental check-ups and cleanings...",0.93409
221,מה ההטבות בסדנת הריון ולידה?,"כמבוטחת כללית במסלול זהב, את זכאית לסדנת הריון...","שלום שירה,\n\nאני רואה שאת מבוטחת בכללית במעמד...",0.935069


In [42]:
# Serialise the ten least similar rows to a JSON string (non-ASCII text left unescaped)
lowest_scores_preview = lowest_scores_df[
    ['question', 'ground_truth_answer', 'generated_answer', embedding_column]
].head(10)
lowest_scores_preview.to_json(orient='records', force_ascii=False, indent=2)

'[\n  {\n    "question":"What are the benefits in pregnancy follow-up?",\n    "ground_truth_answer":"As a Meuhedet Gold member, you are entitled to free pregnancy follow-up, including personal digital monitoring",\n    "generated_answer":"Hi Michal,\\n\\nAs a member of Meuhedet Gold, you have several comprehensive benefits for pregnancy follow-up. Here are the details:\\n\\n1. **Pregnancy Follow-Up (מעקב הריון)**: This is provided for free and includes a personal digital follow-up.\\n2. **Genetic Screening Tests (בדיקות סקר גנטיות)**: You receive an 85% discount, which includes advanced tests.\\n3. **System Scans (סקירות מערכות)**: These are free and include a 3D scan.\\n4. **Childbirth Preparation Course (קורס הכנה ללידה)**: This is free and includes a baby CPR course.\\n5. **Nutritional Counseling (ייעוץ תזונתי)**: You are entitled to 6 free sessions, which also include a set of dietary supplements.\\n6. **Complications of Pregnancy Treatment (טיפול בסיבוכי הריון)**: You have 85% cov