In [None]:
pip install transformers datasets huggingface_hub torch evaluate 

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


Evaluation

In [2]:
import pandas as pd
import nltk
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed for sentence/word tokenization and stop-word removal
try:
    for resource in ('punkt', 'stopwords'):
        # nltk.download signals most failures by returning False rather than raising,
        # so check the result; a previously downloaded local copy may still be usable.
        if not nltk.download(resource, quiet=True):
            print(f"Warning: could not download NLTK resource '{resource}'; "
                  "continuing with any locally installed copy.")
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")
    # Raise instead of calling exit(1): inside a notebook, exit() acts on the kernel
    # rather than simply stopping this cell with a readable error.
    raise RuntimeError("NLTK resources are required for tokenization") from e

# Initialize Sentence-BERT model on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
# Initialize stopwords
stop_words = set(stopwords.words('english'))

# Define metric calculation functions
def compute_contextual_precision(answer, rag_response, threshold=0.4):
    """Share of RAG-response sentences that are matched by the reference answer.

    Both texts are split into sentences with NLTK and embedded with the
    module-level ``model``. A response sentence counts as relevant when its
    highest cosine similarity to any answer sentence is strictly greater than
    ``threshold``.

    Args:
        answer: Reference answer text.
        rag_response: Generated response text.
        threshold: Similarity cut-off a sentence must exceed to count.

    Returns:
        Relevant response sentences divided by all response sentences, or 0.0
        when either text yields no sentences.
    """
    reference_sents = nltk.sent_tokenize(answer)
    response_sents = nltk.sent_tokenize(rag_response)

    # Nothing to compare when either side has no sentences.
    if not (response_sents and reference_sents):
        return 0.0

    reference_vecs = model.encode(reference_sents, convert_to_tensor=True)
    response_vecs = model.encode(response_sents, convert_to_tensor=True)

    # One hit per response sentence whose best match clears the threshold.
    hits = sum(
        1
        for response_vec in response_vecs
        if util.cos_sim(response_vec, reference_vecs).max().item() > threshold
    )
    return hits / len(response_sents)

def compute_contextual_recall(answer, rag_response, threshold=0.4):
    """Share of reference-answer sentences that are covered by the RAG response.

    Both texts are split into sentences with NLTK and embedded with the
    module-level ``model``. An answer sentence counts as covered when its
    highest cosine similarity to any response sentence is greater than
    ``threshold``.

    Args:
        answer: Reference answer text.
        rag_response: Generated response text.
        threshold: Similarity cut-off a sentence must exceed to count.

    Returns:
        Covered answer sentences divided by all answer sentences, or 0.0 when
        either text yields no sentences.
    """
    reference_sents = nltk.sent_tokenize(answer)
    response_sents = nltk.sent_tokenize(rag_response)

    # Without sentences on both sides there is nothing to cover.
    if not (reference_sents and response_sents):
        return 0.0

    reference_vecs = model.encode(reference_sents, convert_to_tensor=True)
    response_vecs = model.encode(response_sents, convert_to_tensor=True)

    # One flag per answer sentence: does its best response match clear the threshold?
    covered_flags = [
        bool(util.cos_sim(reference_vec, response_vecs).max() > threshold)
        for reference_vec in reference_vecs
    ]
    return sum(covered_flags) / len(reference_sents)

# Contextual relevancy: response vs. question + reference answer
def compute_contextual_relevancy(question, answer, rag_response):
    """Cosine similarity between the RAG response and the question plus answer.

    The question and the reference answer are joined with a single space,
    embedded as one text with the module-level ``model``, and compared with
    the embedding of the whole response.

    Returns:
        The cosine similarity as a Python float.
    """
    reference_text = " ".join((question, answer))
    reference_vec = model.encode(reference_text, convert_to_tensor=True)
    response_vec = model.encode(rag_response, convert_to_tensor=True)
    return util.cos_sim(reference_vec, response_vec).item()

# Answer relevancy: response vs. question only
def compute_answer_relevancy(question, rag_response):
    """Cosine similarity between the question and the RAG response.

    Each text is embedded as a whole with the module-level ``model``.

    Returns:
        The cosine similarity as a Python float.
    """
    # Encode the question first, then the response.
    query_vec, response_vec = (
        model.encode(text, convert_to_tensor=True)
        for text in (question, rag_response)
    )
    return util.cos_sim(query_vec, response_vec).item()

def compute_faithfulness(answer, rag_response):
    """Score how closely the RAG response agrees with the reference answer.

    The score is the plain mean of two components:

    * the cosine similarity between the embeddings of the two whole texts, and
    * the fraction of the answer's non-stop-word tokens that also occur in
      the response (0.0 when either token set is empty).

    This is a best-effort metric: any exception is printed, appended to
    ``faithfulness_errors.log`` and turned into a score of 0.0 so that one bad
    row does not abort a whole evaluation run.

    Args:
        answer: Reference answer text.
        rag_response: Generated response text.

    Returns:
        The combined score as a float, or 0.0 if scoring failed.
    """
    try:
        # Semantic similarity of the two texts, each embedded as a whole
        answer_embedding = model.encode(answer, convert_to_tensor=True)
        rag_embedding = model.encode(rag_response, convert_to_tensor=True)
        semantic_similarity = util.cos_sim(answer_embedding, rag_embedding).item()

        # Keyword overlap on lower-cased tokens with stop words removed.
        # NOTE(review): tokens are not filtered for punctuation, so punctuation
        # marks presumably count as keywords here -- confirm this is intended.
        answer_tokens = set(word_tokenize(answer.lower())) - stop_words
        rag_tokens = set(word_tokenize(rag_response.lower())) - stop_words
        if not answer_tokens or not rag_tokens:
            keyword_overlap = 0.0
        else:
            keyword_overlap = len(answer_tokens & rag_tokens) / len(answer_tokens)

        # Combine both components with equal weight (unweighted mean)
        return (semantic_similarity + keyword_overlap) / 2
    except Exception as e:
        print(f"Error in compute_faithfulness: {e}")
        # str() keeps the handler from failing on non-string input such as None or NaN.
        snippet = str(rag_response)[:500]
        try:
            # Explicit UTF-8 so non-ASCII responses cannot raise UnicodeEncodeError
            # on platforms whose default encoding is narrower.
            with open('faithfulness_errors.log', 'a', encoding='utf-8') as f:
                f.write(f"Error: {e}\nrag_response: {snippet}...\n\n")
        except OSError as log_error:
            # Logging is diagnostic only; it must not defeat the 0.0 fallback.
            print(f"Could not write faithfulness_errors.log: {log_error}")
        return 0.0

def plot_metric_distributions(results_df):
    """Save a histogram of every metric to 'metric_distributions.png'.

    One 20-bin histogram per metric is drawn into a 2x3 grid (five panels
    used) on a 12x8 inch figure. The figure is written to disk and then
    closed; nothing is returned.

    Args:
        results_df: Table with one column per metric name listed below.
    """
    metric_columns = (
        'Contextual Precision',
        'Contextual Recall',
        'Contextual Relevancy',
        'Answer Relevancy',
        'Faithfulness',
    )

    fig = plt.figure(figsize=(12, 8))
    for position, column in enumerate(metric_columns, start=1):
        ax = fig.add_subplot(2, 3, position)
        ax.hist(results_df[column], bins=20, edgecolor='black')
        ax.set_title(column)
        ax.set_xlabel('Score')
        ax.set_ylabel('Frequency')
    fig.tight_layout()
    fig.savefig('metric_distributions.png')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def main(csv_path='Result_prompt/medqa3_dataset_rag_prompt.csv', low_score_threshold=0.1):
    """Score every row of a RAG results CSV and write metrics, outliers and plots.

    The CSV must contain the columns 'Question', 'Answer' and 'RAG Response'.
    For each row the five metrics are computed and collected. The function
    then writes:

    * 'metrics_results.csv' -- one row of metric values per input row,
    * 'low_score_records.csv' -- rows whose contextual precision is below
      ``low_score_threshold``, with the row index and texts for inspection
      (only written when such rows exist),
    * 'metric_distributions.png' -- via ``plot_metric_distributions``,

    and prints the mean of each metric. Problems with the input (missing file,
    undecodable file, missing columns, no rows) are reported with a message
    and end the run early.

    Args:
        csv_path: Path of the CSV file to evaluate.
        low_score_threshold: Rows with a contextual precision below this value
            are saved as low-score records.

    Returns:
        None.
    """
    metric_names = ['Contextual Precision', 'Contextual Recall',
                    'Contextual Relevancy', 'Answer Relevancy',
                    'Faithfulness']

    def _as_text(value):
        """Return a cell as text, mapping missing values to an empty string."""
        # str(NaN) would be the literal word 'nan', which would then be scored as text.
        return '' if pd.isna(value) else str(value)

    # latin1 decodes every possible byte sequence, so it has to come last;
    # cp1252 goes before it because it can still reject bytes and fall through.
    encodings = ['utf-8', 'cp1252', 'latin1']
    df = None
    for encoding in encodings:
        try:
            df = pd.read_csv(csv_path, encoding=encoding)
            print(f"Successfully read CSV with {encoding} encoding.")
            break
        except UnicodeDecodeError:
            print(f"Failed to read CSV with {encoding} encoding. Trying next encoding...")
        except FileNotFoundError:
            print("Error: CSV file not found.")
            return

    if df is None:
        print("Error: Could not read CSV with any supported encoding.")
        return

    required_columns = ['Question', 'Answer', 'RAG Response']
    if not all(col in df.columns for col in required_columns):
        print("Error: CSV must contain 'Question', 'Answer', 'RAG Response' columns.")
        return

    # Without rows there are no metric columns to save, plot or average.
    if df.empty:
        print("Error: CSV contains no rows to evaluate.")
        return

    rows_with_missing = int(df[required_columns].isna().any(axis=1).sum())
    if rows_with_missing:
        print(f"Warning: {rows_with_missing} row(s) have missing values; "
              "they are scored as empty text.")

    results = []
    low_score_records = []

    for idx, row in df.iterrows():
        question = _as_text(row['Question'])
        answer = _as_text(row['Answer'])
        rag_response = _as_text(row['RAG Response'])

        # Log long RAG responses for debugging
        if len(rag_response) > 5000:
            print(f"Warning: Long RAG response at index {idx}, length: {len(rag_response)}")

        metrics = {
            'Contextual Precision': compute_contextual_precision(answer, rag_response),
            'Contextual Recall': compute_contextual_recall(answer, rag_response),
            'Contextual Relevancy': compute_contextual_relevancy(question, answer, rag_response),
            'Answer Relevancy': compute_answer_relevancy(question, rag_response),
            'Faithfulness': compute_faithfulness(answer, rag_response)
        }

        if metrics['Contextual Precision'] < low_score_threshold:
            # Keep the row index and texts so the record can actually be inspected.
            low_score_records.append({
                'Index': idx,
                'Question': question,
                'Answer': answer,
                'RAG Response': rag_response,
                **metrics,
            })

        results.append(metrics)

    # Build the table once from the collected records.
    results_df = pd.DataFrame(results)
    results_df.to_csv('metrics_results.csv', index=False)
    print("Metrics calculated and saved to 'metrics_results.csv'.")

    # Save low score records for inspection
    if low_score_records:
        low_score_df = pd.DataFrame(low_score_records)
        low_score_df.to_csv('low_score_records.csv', index=False)
        print("Low score records saved to 'low_score_records.csv'.")

    plot_metric_distributions(results_df)

    avg_metrics = results_df[metric_names].mean()
    print("\nAverage Metrics:")
    print(avg_metrics)

if __name__ == "__main__":
    main()

Successfully read CSV with utf-8 encoding.
Metrics calculated and saved to 'metrics_results.csv'.
Low score records saved to 'low_score_records.csv'.

Average Metrics:
Contextual Precision    0.686620
Contextual Recall       0.725277
Contextual Relevancy    0.664052
Answer Relevancy        0.637604
Faithfulness            0.524522
dtype: float64
