# RAG ChatBot Response Evaluation

## Objective
Comprehensively evaluate and compare three different RAG ChatBot responses to the same prompt using multiple evaluation metrics.

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
import numpy as np
import evaluate

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Define the prompt
prompt = "Can you generate an HTML code using TailwindCSS that creates a responsive feature section with a main heading, a short description, and three feature cards? Each card should include an SVG icon, a title, a description, and a list of related topics (like Computers, Health, Reference). The layout should use a grid system and flexbox for alignment, styled entirely with Tailwind utility classes. It should have a clean, modern design and be fully responsive. This component should be static, with no JavaScript interactivity."

# Read responses from a text file
def read_responses_from_text_file(file_path):
    try:
        responses = {}
        current_response = None
        response_content = []
        
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                
                # Check if this is a response identifier line (e.g., "Response A:")
                if line.endswith(':') and line.startswith('Response '):
                    # If we were already processing a response, save it
                    if current_response:
                        responses[current_response] = '\n'.join(response_content)
                    
                    # Start new response
                    current_response = line[:-1]  # Remove the colon
                    response_content = []
                
                # Otherwise add to current response content if we're in a response section
                elif current_response:
                    response_content.append(line)
            
            # Add the last response if there is one
            if current_response and response_content:
                responses[current_response] = '\n'.join(response_content)
                
        return responses
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None

# Example usage
file_path = "responses.txt"
responses = read_responses_from_text_file(file_path)

if responses:
    print(f"Prompt: {prompt}")
    print("\nResponses:")
    for key, value in responses.items():
        print(f"\n{key}:")
        print(value)
        
        
# Define the prompt and three responses
# prompt = "Your specific prompt here"
# responses = {
#     'Response A': "First RAG response",
#     'Response B': "Second RAG response",
#     'Response C': "Third RAG response"
# }

Prompt: Can you generate an HTML code using TailwindCSS that creates a responsive feature section with a main heading, a short description, and three feature cards? Each card should include an SVG icon, a title, a description, and a list of related topics (like Computers, Health, Reference). The layout should use a grid system and flexbox for alignment, styled entirely with Tailwind utility classes. It should have a clean, modern design and be fully responsive. This component should be static, with no JavaScript interactivity.

Responses:

Response A:
Sure! Here's a clean, modern, and fully responsive Feature Section built with HTML and TailwindCSS.
It includes a main heading, a short description, and three feature cards laid out using a grid and flexbox:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Feature Section</title>
<script src="https://cdn.tailwindcss.com"></script>
</head>
<body

## 1. Semantic Similarity Evaluation

In [3]:
def semantic_similarity(text1, text2):
    """
    Calculate semantic similarity using spaCy word embeddings
    """
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2)

# Semantic similarity to the prompt
semantic_scores = {}
for name, response in responses.items():
    semantic_scores[name] = semantic_similarity(prompt, response)

print("Semantic Similarity Scores:")
for name, score in semantic_scores.items():
    print(f"{name}: {score:.4f}")

  return doc1.similarity(doc2)


Semantic Similarity Scores:
Response A: 0.2206
Response B: 0.2029
Response C: 0.2664


## 2. ROUGE Score Evaluation

In [4]:
# ROUGE Score Calculation
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_scores = {}
for name, response in responses.items():
    rouge_scores[name] = scorer.score(prompt, response)

print("ROUGE Scores:")
for name, scores in rouge_scores.items():
    print(f"{name}:")
    print(f"  ROUGE-1: {scores['rouge1'].fmeasure:.4f}")
    print(f"  ROUGE-2: {scores['rouge2'].fmeasure:.4f}")
    print(f"  ROUGE-L: {scores['rougeL'].fmeasure:.4f}")

ROUGE Scores:
Response A:
  ROUGE-1: 0.1298
  ROUGE-2: 0.0441
  ROUGE-L: 0.0718
Response B:
  ROUGE-1: 0.0387
  ROUGE-2: 0.0000
  ROUGE-L: 0.0277
Response C:
  ROUGE-1: 0.1818
  ROUGE-2: 0.1290
  ROUGE-L: 0.1371


## 3. Perplexity Evaluation

In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Function to read responses from a text file
def read_responses_from_file(file_path):
    try:
        responses = {}
        current_response = None
        response_content = []
        
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                
                # Check if this is a response identifier line (e.g., "Response A:")
                if line.endswith(':') and line.startswith('Response '):
                    # If we were already processing a response, save it
                    if current_response:
                        responses[current_response] = '\n'.join(response_content)
                    
                    # Start new response
                    current_response = line[:-1]  # Remove the colon
                    response_content = []
                
                # Otherwise add to current response content if we're in a response section
                elif current_response:
                    response_content.append(line)
            
            # Add the last response if there is one
            if current_response and response_content:
                responses[current_response] = '\n'.join(response_content)
                
        return responses
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None

# Load pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def calculate_perplexity(text, max_length=1024):
    """
    Calculate perplexity of the text with chunking for long texts
    Lower perplexity indicates better language model performance
    """
    # Handle text that's too long by chunking
    if len(text) > 3000:  # An arbitrary threshold
        chunks = []
        words = text.split()
        chunk = []
        chunk_length = 0
        
        for word in words:
            chunk.append(word)
            chunk_length += len(word) + 1  # +1 for space
            
            if chunk_length > 1000:  # Smaller than max_length to be safe
                chunks.append(' '.join(chunk))
                chunk = []
                chunk_length = 0
        
        # Add the last chunk if it exists
        if chunk:
            chunks.append(' '.join(chunk))
        
        # Calculate perplexity for each chunk and average
        perplexities = []
        for chunk_text in chunks:
            # Prepare the input
            encodings = tokenizer(chunk_text, return_tensors='pt', truncation=True, max_length=max_length)
            
            # Calculate loss
            with torch.no_grad():
                outputs = model(**encodings, labels=encodings['input_ids'])
                loss = outputs.loss
            
            # Calculate perplexity
            perplexity = torch.exp(loss).item()
            perplexities.append(perplexity)
        
        return sum(perplexities) / len(perplexities)
    else:
        # Prepare the input
        encodings = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length)
        
        # Calculate loss
        with torch.no_grad():
            outputs = model(**encodings, labels=encodings['input_ids'])
            loss = outputs.loss
        
        # Calculate perplexity
        perplexity = torch.exp(loss).item()
        return perplexity

# Example usage
file_path = "responses.txt"
responses = read_responses_from_file(file_path)

if responses:
    # Calculate perplexity for each response
    perplexity_scores = {}
    for name, response in responses.items():
        perplexity_scores[name] = calculate_perplexity(response)
    
    print("Perplexity Scores:")
    for name, score in perplexity_scores.items():
        print(f"{name}: {score:.4f}")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Perplexity Scores:
Response A: 17.0156
Response B: 15.2050
Response C: 23.2467


## 4. Embedding-based Bias Evaluation

In [6]:
import gensim.downloader as api

# Load pre-trained word vectors
word_vectors = api.load('glove-wiki-gigaword-100')

def calculate_word_embedding_bias(text, bias_terms):
    """
    Calculate bias in word embeddings
    """
    # Tokenize the text
    tokens = nltk.word_tokenize(text.lower())
    
    # Calculate average similarity to bias terms
    bias_scores = []
    for term in bias_terms:
        term_similarities = []
        for token in tokens:
            if token in word_vectors.key_to_index:
                try:
                    similarity = word_vectors.similarity(token, term)
                    term_similarities.append(similarity)
                except KeyError:
                    continue
        
        # Average similarity for this bias term
        if term_similarities:
            bias_scores.append(np.mean(term_similarities))
    
    # Overall bias score
    return np.mean(bias_scores) if bias_scores else 0

# Define bias terms (these should be chosen carefully)
bias_terms = ['man', 'woman', 'he', 'she', 'male', 'female']

# Calculate bias for each response
bias_scores = {}
for name, response in responses.items():
    bias_scores[name] = calculate_word_embedding_bias(response, bias_terms)

print("Bias Scores:")
for name, score in bias_scores.items():
    print(f"{name}: {score:.4f}")

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Admin/nltk_data'
    - 'c:\\ITITIU20316\\THESIS\\ROUGE\\THESIS_EVALUATION\\.venv\\nltk_data'
    - 'c:\\ITITIU20316\\THESIS\\ROUGE\\THESIS_EVALUATION\\.venv\\share\\nltk_data'
    - 'c:\\ITITIU20316\\THESIS\\ROUGE\\THESIS_EVALUATION\\.venv\\lib\\nltk_data'
    - 'C:\\Users\\Admin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## 5. Comprehensive Evaluation Summary

In [None]:
# Combine all evaluation metrics
evaluation_summary = pd.DataFrame({
    'Semantic Similarity': semantic_scores,
    'ROUGE-1 Score': {name: scores['rouge1'].fmeasure for name, scores in rouge_scores.items()},
    'ROUGE-2 Score': {name: scores['rouge2'].fmeasure for name, scores in rouge_scores.items()},
    'Perplexity': perplexity_scores,
    'Embedding Bias': bias_scores
})

print("Comprehensive Evaluation Summary:")
print(evaluation_summary)

# Normalize and weight different metrics
def normalize_column(series):
    return (series - series.min()) / (series.max() - series.min())

# Weights for different metrics (adjust as needed)
weights = {
    'Semantic Similarity': 0.25,
    'ROUGE-1 Score': 0.2,
    'ROUGE-2 Score': 0.2,
    'Perplexity': -0.2,  # Negative weight as lower is better
    'Embedding Bias': -0.15  # Negative weight as less bias is better
}

# Normalize and weight the scores
weighted_scores = {}
for column, weight in weights.items():
    normalized = normalize_column(evaluation_summary[column])
    for name in responses.keys():
        if name not in weighted_scores:
            weighted_scores[name] = 0
        weighted_scores[name] += normalized[name] * weight

# Print final ranking
print("\nFinal Weighted Scores:")
for name, score in sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {score:.4f}")

# Visualize the results
plt.figure(figsize=(10, 6))
plt.bar(weighted_scores.keys(), weighted_scores.values())
plt.title('Comprehensive RAG Response Evaluation')
plt.xlabel('Response')
plt.ylabel('Weighted Score')
plt.tight_layout()
plt.show()

## Conclusion

This notebook provides a comprehensive evaluation of RAG ChatBot responses using multiple metrics:

1. Semantic Similarity
2. ROUGE Scores
3. Perplexity
4. Embedding-based Bias Evaluation

The final weighted score considers the performance across these different dimensions, providing a holistic assessment of the responses.