# Faithfullness

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Simulate retrieved documents (in practice, these would be retrieved from your retrieval system)
retrieved_docs = [
    "The sky is blue and the sun is shining.",
    "Clouds can form when the atmosphere cools and water vapor condenses.",
    "A clear sky indicates good weather."
]

# Simulate a generated response (in practice, this would be generated by your language model)
generated_response = "The sky is clear and blue, and there are no clouds."

# Combine the retrieved documents into a single string
retrieved_text = " ".join(retrieved_docs)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer().fit_transform([retrieved_text, generated_response])
vectors = vectorizer.toarray()

# Calculate cosine similarity between the retrieved text and the generated response
cosine_sim = cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Define a simple threshold for faithfulness (e.g., 0.5 is arbitrary here, adjust as needed)
faithfulness_threshold = 0.5

# Determine faithfulness
if cosine_sim >= faithfulness_threshold:
    print(f"The generated response is faithful (Cosine Similarity: {cosine_sim:.2f})")
else:
    print(f"The generated response is not faithful (Cosine Similarity: {cosine_sim:.2f})")

The generated response is not faithful (Cosine Similarity: 0.50)


# Answer Relavency

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Simulate a query
query = "What causes the sky to appear blue?"

# Simulate a generated answer from your RAG pipeline
generated_answer = "The sky appears blue due to the scattering of sunlight by the atmosphere."

# Vectorize the query and the generated answer using TF-IDF
vectorizer = TfidfVectorizer().fit_transform([query, generated_answer])
vectors = vectorizer.toarray()

# Calculate cosine similarity between the query and the generated answer
cosine_sim = cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Define a simple threshold for relevance (e.g., 0.5 is arbitrary here, adjust as needed)
relevance_threshold = 0.5

# Determine relevance
if cosine_sim >= relevance_threshold:
    print(f"The generated answer is relevant (Cosine Similarity: {cosine_sim:.2f})")
else:
    print(f"The generated answer is not relevant (Cosine Similarity: {cosine_sim:.2f})")

The generated answer is not relevant (Cosine Similarity: 0.37)


# Context Precision

In [7]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Simulate a query
query = "What causes the sky to appear blue?"

# Simulate retrieved contexts from your RAG pipeline
retrieved_contexts = [
    "The sky appears blue due to the scattering of sunlight by the Earth's atmosphere.",
    "Rainbows are formed when light is refracted through water droplets.",
    "The color of the sky is affected by the scattering of light by air molecules."
]

# Combine the query and each retrieved context into a list
texts = [query] + retrieved_contexts

# Vectorize the texts using TF-IDF
vectorizer = TfidfVectorizer().fit_transform(texts)
vectors = vectorizer.toarray()

# Calculate cosine similarity between the query and each retrieved context
similarities = cosine_similarity([vectors[0]], vectors[1:])[0]

# Define a simple threshold for context relevance (e.g., 0.5 is arbitrary here, adjust as needed)
relevance_threshold = 0.5

# Determine the number of relevant contexts
relevant_contexts_count = sum(sim >= relevance_threshold for sim in similarities)

# Calculate precision (ratio of relevant contexts to total retrieved contexts)
context_precision = relevant_contexts_count / len(retrieved_contexts)

print(f"Context Precision: {context_precision:.2f}")

Context Precision: 0.00


# Context Relavency

In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Simulate a query
query = "What causes the sky to appear blue?"
# Simulate retrieved contexts from your RAG pipeline
retrieved_contexts = [
    "The sky appears blue due to the scattering of sunlight by the Earth's atmosphere.",
    "Rainbows are formed when light is refracted through water droplets.",
    "The color of the sky is affected by the scattering of light by air molecules."
]
# Combine the query and each retrieved context into a list
texts = [query] + retrieved_contexts
# Vectorize the texts using TF-IDF
vectorizer = TfidfVectorizer().fit_transform(texts)
vectors = vectorizer.toarray()
# Calculate cosine similarity between the query and each retrieved context
similarities = cosine_similarity([vectors[0]], vectors[1:])[0]
# Define a simple threshold for context relevancy (e.g., 0.5 is arbitrary here, adjust as needed)
relevancy_threshold = 0.5
# Determine relevancy of each context
relevancy_results = [
    f"Context {i+1} is {'relevant' if sim >= relevancy_threshold else 'not relevant'} (Cosine Similarity: {sim:.2f})"
    for i, sim in enumerate(similarities)
]
# Print the relevancy results for each context
for result in relevancy_results:
  print(result)
# Calculate overall context relevancy (e.g., average similarity or percentage of relevant contexts)
average_similarity = np.mean(similarities)
relevant_contexts_count = sum(sim >= relevancy_threshold for sim in similarities)
print(f"\nAverage Context Relevancy: {average_similarity:.2f}")
print(f"Number of Relevant Contexts: {relevant_contexts_count} out of {len(retrieved_contexts)}")

Context 1 is not relevant (Cosine Similarity: 0.37)
Context 2 is not relevant (Cosine Similarity: 0.00)
Context 3 is not relevant (Cosine Similarity: 0.19)

Average Context Relevancy: 0.18
Number of Relevant Contexts: 0 out of 3


#Context recall


In [8]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Simulate a query
query = "What causes the sky to appear blue?"

# Simulate retrieved contexts from your RAG pipeline
retrieved_contexts = [
    "The sky appears blue due to the scattering of sunlight by the Earth's atmosphere.",
    "Rainbows are formed when light is refracted through water droplets.",
    "The color of the sky is affected by the scattering of light by air molecules."
]

# Simulate a list of all possible relevant contexts (for example, from a ground truth or gold standard)
all_relevant_contexts = [
    "The sky appears blue due to the scattering of sunlight by the Earth's atmosphere.",
    "The color of the sky is affected by the scattering of light by air molecules.",
    "The sky appears blue because blue light is scattered more than other colors due to Rayleigh scattering."
]

# Combine the query and all relevant contexts into a list for vectorization
texts = [query] + retrieved_contexts + all_relevant_contexts

# Vectorize the texts using TF-IDF
vectorizer = TfidfVectorizer().fit_transform(texts)
vectors = vectorizer.toarray()

# Calculate cosine similarity between the query and each retrieved context
retrieved_similarities = cosine_similarity([vectors[0]], vectors[1:len(retrieved_contexts)+1])[0]

# Calculate cosine similarity between the query and each relevant context
all_relevant_similarities = cosine_similarity([vectors[0]], vectors[len(retrieved_contexts)+1:])[0]

# Define a simple threshold for relevance (e.g., 0.5 is arbitrary here, adjust as needed)
relevance_threshold = 0.5

# Count relevant retrieved contexts
relevant_retrieved_count = sum(sim >= relevance_threshold for sim in retrieved_similarities)

# Count total relevant contexts
total_relevant_count = sum(sim >= relevance_threshold for sim in all_relevant_similarities)

# Calculate recall (ratio of relevant retrieved contexts to total relevant contexts)
context_recall = relevant_retrieved_count / total_relevant_count if total_relevant_count > 0 else 0

print(f"Context Recall: {context_recall:.2f}")

Context Recall: 0.00


#Context Entities Recall

In [9]:
import spacy

# Load the spaCy model for entity recognition (you can use "en_core_web_sm" or a similar model)
nlp = spacy.load("en_core_web_sm")

# Simulate a query
query = "What causes the sky to appear blue?"



# Simulate retrieved contexts from your RAG pipeline
retrieved_contexts = [
    "The sky appears blue due to the scattering of sunlight by the Earth's atmosphere.",
    "Rainbows are formed when light is refracted through water droplets.",
    "The color of the sky is affected by the scattering of light by air molecules."
]


# Extract entities from the query
query_doc = nlp(query)
query_entities = {ent.text.lower() for ent in query_doc.ents}

# Extract entities from each retrieved context
retrieved_entities = set()
for context in retrieved_contexts:
  context_doc = nlp(context)
  retrieved_entities.update({ent.text.lower() for ent in context_doc.ents})

# Calculate recall: the ratio of relevant entities retrieved
if query_entities:
    entities_recall = len(query_entities & retrieved_entities) / len(query_entities)
else:
    entities_recall = 0.0

print(f"Context Entities Recall: {entities_recall:.2f}")

# Optional: Print entities for verification
print(f"Query Entities: {query_entities}")
print(f"Retrieved Entities: {retrieved_entities}")

Context Entities Recall: 0.00
Query Entities: set()
Retrieved Entities: {'earth'}


# Answer Semantic Similarity

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Simulate a reference answer (ground truth)
reference_answer = "The sky appears blue due to the scattering of sunlight by the Earth's atmosphere."

# Simulate an answer generated by the RAG pipeline
generated_answer = "The reason the sky looks blue is because of the scattering of sunlight by the atmosphere."

# Combine the reference answer and generated answer into a list
texts = [reference_answer, generated_answer]

# Vectorize the texts using TF-IDF
vectorizer = TfidfVectorizer().fit_transform(texts)
vectors = vectorizer.toarray()

# Calculate cosine similarity between the reference answer and the generated answer
semantic_similarity = cosine_similarity([vectors[0]], [vectors[1]])[0][0]

# Print the semantic similarity score
print(f"Answer Semantic Similarity: {semantic_similarity:.2f}")

Answer Semantic Similarity: 0.70


# Answer Correctness

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Simulate a reference answer (ground truth)
reference_answer = "The sky appears blue due to the scattering of sunlight by the Earth's atmosphere."

# Simulate an answer generated by the RAG pipeline
generated_answer = "The sky is blue because of the scattering of sunlight by the atmosphere."

# Convert answers to lowercase for case-insensitive comparison
reference_answer = reference_answer.lower()
generated_answer = generated_answer.lower()

# Method 1: Exact Match
exact_match = reference_answer == generated_answer

# Method 2: Keyword Overlap
# Define a simple function to extract key terms
def extract_keywords(text):
    vectorizer = CountVectorizer().fit([text])
    return set(vectorizer.get_feature_names_out())

# Extract keywords from both reference and generated answers
reference_keywords = extract_keywords(reference_answer)
generated_keywords = extract_keywords(generated_answer)

# Calculate the overlap
keyword_overlap = len(reference_keywords & generated_keywords) / len(reference_keywords)

# Define a threshold for correctness based on keyword overlap (e.g., 0.8 for 80% overlap)
correctness_threshold = 0.8
is_correct = keyword_overlap >= correctness_threshold

# Print the results
print(f"Exact Match Correctness: {'Correct' if exact_match else 'Incorrect'}")
print(f"Keyword Overlap: {keyword_overlap:.2f}")
print(f"Answer Correctness: {'Correct' if is_correct else 'Incorrect'}")

Exact Match Correctness: Incorrect
Keyword Overlap: 0.67
Answer Correctness: Incorrect


# Aspect Critique

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Simulate a reference answer (ground truth) with identified aspects
reference_answer = {
    "aspect_1": "The sky appears blue.",
    "aspect_2": "This is due to the scattering of sunlight.",
    "aspect_3": "The Earth's atmosphere causes this scattering."
}

# Simulate an answer generated by the RAG pipeline
generated_answer = "The sky looks blue because sunlight is scattered by the atmosphere."

# Function to evaluate if each aspect is covered in the generated answer
def evaluate_aspect_coverage(reference_aspects, generated_answer):
    # Vectorize the aspects and generated answer using TF-IDF
    vectorizer = TfidfVectorizer().fit_transform(list(reference_aspects.values()) + [generated_answer])
    vectors = vectorizer.toarray()

    aspect_coverage = {}

    for i, (aspect, text) in enumerate(reference_aspects.items()):
        # Calculate cosine similarity between the aspect and the generated answer
        similarity = cosine_similarity([vectors[i]], [vectors[-1]])[0][0]
        # Define a threshold for considering the aspect sufficiently covered
        coverage_threshold = 0.5
        aspect_coverage[aspect] = similarity >= coverage_threshold

    return aspect_coverage

# Evaluate aspect coverage
aspect_coverage = evaluate_aspect_coverage(reference_answer, generated_answer)

# Print the aspect critique results
for aspect, covered in aspect_coverage.items():
    print(f"{aspect}: {'Covered' if covered else 'Not Covered'}")

# Optional: Calculate overall aspect critique score (percentage of aspects covered)
overall_coverage = sum(covered for covered in aspect_coverage.values()) / len(aspect_coverage)
print(f"\nOverall Aspect Coverage: {overall_coverage * 100:.2f}%")

aspect_1: Not Covered
aspect_2: Not Covered
aspect_3: Not Covered

Overall Aspect Coverage: 0.00%
