Evaluate the LLM responses

In [2]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [27]:
import json
import os
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
from llama_index.llms.openai import OpenAI
from llama_index.core import (
    Settings, Document, VectorStoreIndex, get_response_synthesizer, PromptTemplate
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from dotenv import load_dotenv
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz

### Load environment variables (for API keys)

In [31]:
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)
client = OpenAI(model="gpt-4o-mini", temperature=0)

### Initialize OpenAI client and Pinecone

In [5]:
embedding = OpenAIEmbedding(model="text-embedding-ada-002")
Settings.llm = client
Settings.embed_model = embedding
Settings.chunk_size_limit = 1536

In [11]:
# Initialize Pinecone vector store (if required)
pinecone_index = pinecone_client.Index("chatbot-index")
vector_store = PineconeVectorStore(pinecone_index)
retriever = VectorIndexRetriever(index=VectorStoreIndex.from_vector_store(vector_store=vector_store), similarity_top_k=5)

In [12]:
# Define the chatbot's prompt template
prompt_template = (
    "You are a friendly chatbot specialized in helping beginners use FamilySearch and its tools. 😊 "
    "This includes record hints, source attachments, and other related topics. Provide clear and concise answers, "
    "and try to make the conversation enjoyable! 😄\n\n"
    "Context:\n"
    "#####################################\n"
    "{context_str}\n"
    "Answer the user's question: {query_str}\n\n"
    "If the question is related to FamilySearch or its tools (such as record hints, source attachments, or genealogical research), "
    "provide a detailed answer along with a summary. Also, include the following source metadata as 'Source':\n"
    "- **Title**: {title}\n"
    "- **Publish Date**: {date}\n"
    "- **URL**: {url}\n\n"
    "However, if the question is unrelated to FamilySearch, provide a direct and concise answer without any summary or metadata."
)

# Create the template and response synthesizer
qa_template = PromptTemplate(template=prompt_template)
response_synthesizer = get_response_synthesizer(
    llm=client, text_qa_template=qa_template, response_mode="compact"
)
query_engine = RetrieverQueryEngine(
    retriever=retriever, response_synthesizer=response_synthesizer
)

In [32]:
# Function to query the LLM for an answer based on a given question
def query_llm(question):
    # Query the LLM using llama_index's query engine
    try:
        response = query_engine.query(question)
        return response.response  # Ensure this extracts the correct text part of the response
    except Exception as e:
        return f"Error fetching LLM response: {str(e)}"

### Function to calculate the similarity score between two strings and to evaluate LLM responses against predefined answers

In [45]:
# # Function to calculate the similarity score between two strings
def calculate_similarity(stored_answer, llm_answer):
    return fuzz.partial_ratio(stored_answer, llm_answer)

In [14]:
# Load JSON file with predefined questions and answers
with open('../data/faq_test_data/faq_test_data2.json', 'r') as f:
    test_data = json.load(f)

In [20]:
# Create a DataFrame from JSON
df = pd.DataFrame(test_data)
df.head(5)

Unnamed: 0,question,answer,url
0,What are record hints in Family Tree?,Record hints are historical records that Famil...,https://www.familysearch.org/en/help/helpcente...
1,What is the difference between the left and th...,The left column shows you details from the his...,https://www.familysearch.org/en/help/helpcente...
2,What do the Details and Compare buttons do in ...,The Details button expands both the record det...,https://www.familysearch.org/en/help/helpcente...
3,What is my source box?,Your source box contains a list of sources tha...,https://www.familysearch.org/en/help/helpcente...
4,How do I give feedback to FamilySearch about S...,We want to know what you think about Source Li...,https://www.familysearch.org/en/help/helpcente...


In [28]:
# List to store LLM responses and similarity scores
llm_responses = []
similarity_scores = []
url_scores = []


In [40]:
# Iterate through the DataFrame to compare the LLM responses with the stored answers
for index, row in df.iterrows():
    question = row['question']
    expected_answer = row['answer']
    expected_url = row['url']

In [41]:
# Get the LLM's answer
llm_answer = query_llm(question)

In [42]:
# Calculate the similarity between the LLM answer and the expected answer
answer_similarity = fuzz.ratio(llm_answer, expected_answer)
# Calculate the similarity between the URLs (100 if they match exactly, 0 if not)
url_similarity = 100 if llm_answer.find(expected_url) != -1 else 0

In [43]:
# Append the results to their respective lists
llm_responses.append(llm_answer)
similarity_scores.append(answer_similarity)
url_scores.append(url_similarity)

In [44]:
# Add the LLM responses, similarity scores, and URL scores to the DataFrame
df['LLM Response'] = llm_responses
df['Answer Similarity (%)'] = similarity_scores
df['URL Similarity (%)'] = url_scores

ValueError: Length of values (2) does not match length of index (8)

In [None]:
# Display the DataFrame for visualization
df[['question', 'answer', 'LLM Response', 'Answer Similarity (%)', 'url', 'URL Similarity (%)']]