In [4]:
!pip install deepeval
!python -m pip install -U weaviate-client==4.7.1
!python -m pip install python-dotenv==1.0.0
!python -m pip install openai==1.54.3

Collecting weaviate-client==4.7.1
  Downloading weaviate_client-4.7.1-py3-none-any.whl.metadata (3.3 kB)
Collecting httpx<=0.27.0,>=0.25.0 (from weaviate-client==4.7.1)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting validators==0.33.0 (from weaviate-client==4.7.1)
  Downloading validators-0.33.0-py3-none-any.whl.metadata (3.8 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client==4.7.1)
  Downloading Authlib-1.3.2-py2.py3-none-any.whl.metadata (3.9 kB)
Collecting grpcio-tools<2.0.0,>=1.57.0 (from weaviate-client==4.7.1)
  Downloading grpcio_tools-1.68.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting grpcio-health-checking<2.0.0,>=1.57.0 (from weaviate-client==4.7.1)
  Downloading grpcio_health_checking-1.68.0-py3-none-any.whl.metadata (1.1 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-health-checking<2.0.0,>=1.57.0->weaviate-client==4.7.1)
  Downloading protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl.meta

In [86]:
import os
import json
from typing import List
from langchain_openai import ChatOpenAI
from deepeval import evaluate
from deepeval.metrics import ContextualRelevancyMetric, FaithfulnessMetric, GEval
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from weaviate.classes.query import MetadataQuery
from dotenv import load_dotenv
import weaviate

### Environment Variables & Configuration

In [18]:
from dotenv import load_dotenv
import os

# Load the .env file that sits in the current working directory
dotenv_path = os.path.join(os.getcwd(), ".env")
load_dotenv(dotenv_path)

# Read the credentials from the environment; a missing variable becomes ""
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
WEAVIATE_URL_VERBA = os.environ.get("WEAVIATE_URL_VERBA", "")
WEAVIATE_API_KEY_VERBA = os.environ.get("WEAVIATE_API_KEY_VERBA", "")

# Sanity-check each value by length only, so no secret is echoed to the output.
# One boolean per line, in the order: Weaviate URL, Weaviate key, OpenAI key.
print(
    *(
        len(secret) > 20
        for secret in (WEAVIATE_URL_VERBA, WEAVIATE_API_KEY_VERBA, OPENAI_API_KEY)
    ),
    sep="\n",
)


True
True
True


In [19]:
# Embedding settings, passed to the OpenAI embeddings endpoint as the
# `model` and `encoding_format` arguments when a query is embedded.
EMBEDDING_MODEL = "text-embedding-3-small"
ENCODING_FORMAT = "float"

# Database: name of the Weaviate collection (class) holding the embedded chunks.
# NOTE(review): the name suggests the collection was populated with
# EMBEDDING_MODEL vectors - confirm the two stay in sync if either changes.
COLLECTION_NAME = "VERBA_Embedding_text_embedding_3_small"

### Connections

In [20]:
# VECTOR DATABASE CONNECTION

from weaviate.classes.init import Auth, AdditionalConfig, Timeout
import weaviate

# Timeouts (in seconds) for connection init, queries and inserts
_weaviate_timeouts = Timeout(init=30, query=60, insert=120)

# Open an API-key authenticated connection to the Weaviate Cloud cluster
client_db = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL_VERBA,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY_VERBA),
    additional_config=AdditionalConfig(timeout=_weaviate_timeouts),
)

# Report whether the cluster answers the readiness probe
print(client_db.is_ready())

True


In [40]:
from weaviate import Client
from weaviate.auth import AuthApiKey

# Re-read the connection settings straight from the environment
# (no "" fallback here: a missing variable is None and fails the check below).
WEAVIATE_URL_VERBA = os.getenv("WEAVIATE_URL_VERBA")
WEAVIATE_API_KEY_VERBA = os.getenv("WEAVIATE_API_KEY_VERBA")

# Both settings are required; stop early when either one is missing or empty
if not (WEAVIATE_URL_VERBA and WEAVIATE_API_KEY_VERBA):
    raise ValueError("Weaviate URL or API key not set. Check your environment variables.")

# Build the `Client`-style handle. This rebinds `client_db`; the retrieval
# helper below relies on this client's `.schema.get()` and `.query.raw()`.
client_db = Client(
    url=WEAVIATE_URL_VERBA,
    auth_client_secret=AuthApiKey(api_key=WEAVIATE_API_KEY_VERBA),
)

# Confirm the server is reachable before continuing
if client_db.is_ready():
    print("Weaviate client initialized successfully!")
else:
    raise ConnectionError("Weaviate is not ready. Check your connection or configuration.")


Weaviate client initialized successfully!


In [21]:
# Pull the embedding vector out of an OpenAI embeddings response
# https://platform.openai.com/docs/api-reference/embeddings/create
def openai_extract_vector(response) -> list[float]:
    """Return the embedding of the first item in ``response.data``.

    Args:
        response: Embeddings response object exposing a ``data`` sequence
            whose items carry an ``embedding`` attribute.

    Returns:
        The ``embedding`` attribute of ``response.data[0]``.
    """
    first_item = response.data[0]
    return first_item.embedding

In [22]:
# EMBEDDING CONNECTION - OPENAI
# https://platform.openai.com/docs/api-reference/authentication

import openai
from openai import OpenAI

# `openai.api_key` only configures the library's module-level default client.
# An explicitly constructed `OpenAI()` does not read it and falls back to the
# OPENAI_API_KEY environment variable, so the key is passed to the constructor.
# The module attribute is still set so code using the module-level client
# keeps working.
openai.api_key = OPENAI_API_KEY

# `or None` lets an empty key fall back to the SDK's own environment lookup
client_embedding = OpenAI(api_key=OPENAI_API_KEY or None)

In [23]:
# GENERATION CONNECTION - OPENAI
# https://platform.openai.com/docs/api-reference/authentication

import openai
from openai import OpenAI

# `openai.api_key` only configures the library's module-level default client.
# An explicitly constructed `OpenAI()` does not read it and falls back to the
# OPENAI_API_KEY environment variable, so the key is passed to the constructor.
# The module attribute is still set so code using the module-level client
# keeps working.
openai.api_key = OPENAI_API_KEY

# `or None` lets an empty key fall back to the SDK's own environment lookup
generation_client = OpenAI(api_key=OPENAI_API_KEY or None)

### Metrics for Evaluation

In [87]:
# Correctness: an LLM judge compares the generated answer (ACTUAL_OUTPUT)
# with the ground-truth answer (EXPECTED_OUTPUT) using the single step below.
correctness_metric = GEval(
    name="Correctness",
    model="gpt-4o",
    evaluation_steps=[
        "Determine whether the actual output is factually correct based on the expected output."
    ],
    evaluation_params=[
        LLMTestCaseParams.EXPECTED_OUTPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
)

# Faithfulness: passes at a score of 0.7 or above; the judge's reasoning is not requested
faithfulness_metric = FaithfulnessMetric(
    model="gpt-4",
    threshold=0.7,
    include_reason=False,
)

# Contextual relevancy: threshold of 1 means only a perfect score passes;
# the judge's reasoning is included in the report
relevance_metric = ContextualRelevancyMetric(
    model="gpt-4",
    threshold=1,
    include_reason=True,
)

### Helper Functions

Function: Retrieve Context from Weaviate

In [62]:
def retrieve_context_from_weaviate(query_text, class_name, limit=5):
    """
    Retrieve context chunks from Weaviate using a vector query.

    The query text is embedded with the configured OpenAI embedding model and
    the resulting vector is sent to Weaviate in a raw GraphQL ``nearVector``
    query against ``class_name``.

    Args:
        query_text (str): The input query text.
        class_name (str): The class name in Weaviate.
        limit (int): The number of results to retrieve.

    Returns:
        List[str]: The ``content`` property of each returned object that has
        one (an empty list when the query matches nothing).

    Raises:
        ValueError: If the class is not found in the schema, or if the
            response does not contain a result list for ``class_name``
            (including a null entry, as returned alongside GraphQL errors).
    """
    # Step 1: Verify the class name exists in the schema
    schema = client_db.schema.get()
    available_classes = [cls["class"] for cls in schema["classes"]]
    if class_name not in available_classes:
        raise ValueError(f"Class '{class_name}' not found in schema. Available classes: {available_classes}")

    # Step 2: Generate embedding for the query text
    response_embedding = client_embedding.embeddings.create(
        model=EMBEDDING_MODEL, input=query_text, encoding_format=ENCODING_FORMAT
    )
    query_vector = openai_extract_vector(response_embedding)

    # Step 3: Build and execute the GraphQL query
    vector_literal = ", ".join(map(str, query_vector))
    graphql_query = f"""
    {{
      Get {{
        {class_name}(
          nearVector: {{vector: [{vector_literal}]}}
          limit: {limit}
        ) {{
          content
          chunk_id
        }}
      }}
    }}
    """

    # Execute the query
    response = client_db.query.raw(graphql_query)

    # Step 4: Parse and validate the response.
    # Walk data -> Get -> class_name defensively: any level may be missing or
    # null (e.g. when the server reports GraphQL errors), and all of those
    # cases must surface as the documented ValueError, not a TypeError.
    data = response.get("data") if isinstance(response, dict) else None
    get_section = data.get("Get") if isinstance(data, dict) else None
    results = get_section.get(class_name) if isinstance(get_section, dict) else None
    if results is None:
        raise ValueError(f"Invalid response or no results found: {response}")

    return [result["content"] for result in results if "content" in result]



In [57]:
from langchain import PromptTemplate

def create_question_answer_from_context_chain(llm):
    """
    Build a prompt-plus-model chain that answers a question from given context.

    Args:
        llm: A language model instance (e.g., OpenAI's ChatGPT) that can be
            piped after a prompt template.

    Returns:
        The prompt template piped into ``llm``; it expects the input
        variables ``context`` and ``question``.
    """
    # Prompt that restricts the model to the supplied context
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
    For the question below, provide a concise but sufficient answer based ONLY on the provided context:
    {context}
    Question:
    {question}
    """,
    )

    # Feed the rendered prompt straight into the language model
    return prompt | llm

In [90]:
def retrieve_context_per_question(
    question,
    chunks_query_retriever,
    class_name="CreditComplaints",
    limit=5,
):
    """
    Retrieves the relevant context for a given question using the specified retriever.

    Retrieval is best-effort: any exception raised by the retriever is
    printed and an empty list is returned instead of propagating the error.

    Args:
        question (str): The question to retrieve context for.
        chunks_query_retriever (function): A function to retrieve context chunks.
            It is called as ``chunks_query_retriever(question, class_name=..., limit=...)``.
        class_name (str): Collection/class to query. Defaults to
            "CreditComplaints", the value that was previously hard-coded.
            NOTE(review): other cells in this notebook query
            "VERBA_Embedding_text_embedding_3_small" - confirm which one is intended.
        limit (int): Maximum number of chunks to request (default: 5).

    Returns:
        List[str]: A list of retrieved context strings, or ``[]`` on error.
    """
    try:
        return chunks_query_retriever(question, class_name=class_name, limit=limit)
    except Exception as e:
        # Deliberate best-effort: report the failure and fall back to no context
        print(f"Error retrieving context for question: {question}")
        print(f"Error: {e}")
        return []


Function: Generate Answer from Context

In [58]:
def answer_question_from_context(question, context, chain):
    """
    Answer a question using the given context and chain.

    Args:
        question (str): The question to be answered.
        context (str): The context to be used for answering the question.
        chain: The chain for generating the answer. It must expose
            ``invoke(dict)`` and is called with the keys ``context`` and
            ``question``.

    Returns:
        dict: A dictionary containing the answer, context, and question.
        The answer is the output's ``content`` attribute when present, the
        output itself when the chain returns a plain string, and
        "No answer provided." otherwise.
    """
    input_data = {"context": context, "question": question}
    print("Answering the question from the retrieved context...")

    # Invoke the chain to get the result
    output = chain.invoke(input_data)

    if hasattr(output, "content"):
        # Message-like outputs carry the text in `.content`
        answer = output.content
    elif isinstance(output, str):
        # Some chains return the text directly; keep it rather than discard it
        answer = output
    else:
        answer = "No answer provided."

    return {"answer": answer, "context": context, "question": question}


Function: Create Test Cases

In [83]:
from typing import List
from deepeval.test_case import LLMTestCase

def create_deep_eval_test_cases(
    questions: List[str],
    gt_answers: List[str],
    generated_answers: List[str],
    retrieved_documents: List[str],
) -> List[LLMTestCase]:
    """
    Create a list of LLMTestCase objects for evaluation.

    The four lists are paired element by element: item ``i`` of each list
    describes the same test case.

    Args:
        questions (List[str]): List of input questions.
        gt_answers (List[str]): List of ground truth answers.
        generated_answers (List[str]): List of generated answers.
        retrieved_documents (List[str]): List of retrieved documents; each
            entry becomes the single element of that case's retrieval context.

    Returns:
        List[LLMTestCase]: List of LLMTestCase objects.

    Raises:
        ValueError: If the four lists do not all have the same length.
    """
    # zip() would silently truncate to the shortest list and drop cases,
    # so reject mismatched inputs up front.
    lengths = {
        "questions": len(questions),
        "gt_answers": len(gt_answers),
        "generated_answers": len(generated_answers),
        "retrieved_documents": len(retrieved_documents),
    }
    if len(set(lengths.values())) > 1:
        raise ValueError(f"All inputs must have the same length, got: {lengths}")

    return [
        LLMTestCase(
            input=question,
            expected_output=gt_answer,
            actual_output=generated_answer,
            retrieval_context=[retrieved_document],
        )
        for question, gt_answer, generated_answer, retrieved_document in zip(
            questions, gt_answers, generated_answers, retrieved_documents
        )
    ]


Function: Evaluate RAG System

In [94]:
def evaluate_rag(
    chunks_query_retriever,
    q_a_file,
    num_questions=5,
    class_name="VERBA_Embedding_text_embedding_3_small",
    limit=5,
):
    """
    Evaluate the RAG system using predefined metrics.

    For each of the first ``num_questions`` entries in ``q_a_file`` the
    function retrieves context, generates an answer with gpt-4o, and then
    scores all cases with the correctness, faithfulness and contextual
    relevancy metrics.

    Args:
        chunks_query_retriever: Function to retrieve context chunks for a given query.
            Called with the keyword arguments ``query_text``, ``class_name``
            and ``limit``; expected to return a list of strings.
        q_a_file (str): Path to the JSON file containing questions and answers.
            Each item is read as ``item["properties"]["question"]`` and
            ``item["properties"]["answer"]``.
        num_questions (int): Number of questions to evaluate (default: 5).
        class_name (str): Weaviate class/collection to query. Defaults to the
            collection name that was previously hard-coded here.
        limit (int): Number of chunks to retrieve per question (default: 5).

    Raises:
        TypeError: If the retriever returns anything other than strings.
    """
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=2000)
    question_answer_from_context_chain = create_question_answer_from_context_chain(llm)

    # Load questions and answers from JSON file
    with open(q_a_file, "r", encoding="utf-8") as json_file:
        q_a_data = json.load(json_file)

    # Slice first so only the items actually evaluated are parsed
    selected_items = q_a_data[:num_questions]
    questions = [item["properties"]["question"] for item in selected_items]
    ground_truth_answers = [item["properties"]["answer"] for item in selected_items]
    generated_answers = []
    retrieved_documents = []

    # Generate answers and retrieve documents for each question
    for question in questions:
        print(f"Processing question: {question}")

        # Retrieve context
        context = chunks_query_retriever(
            query_text=question,
            class_name=class_name,
            limit=limit,
        )

        if not context:
            context = ["No relevant context found"]  # Fallback for empty context

        # Validate the context is a list of strings
        if not all(isinstance(doc, str) for doc in context):
            raise TypeError("Retrieved context must be a list of strings.")

        # Join the chunks once; the same string is recorded for evaluation
        # and handed to the model as its context.
        context_string = " ".join(context)
        retrieved_documents.append(context_string)

        # Answer the question using the retrieved context
        result = answer_question_from_context(
            question, context_string, question_answer_from_context_chain
        )
        generated_answers.append(result["answer"])

    # Create test cases and evaluate
    test_cases = create_deep_eval_test_cases(
        questions, ground_truth_answers, generated_answers, retrieved_documents
    )
    evaluate(
        test_cases=test_cases,
        metrics=[correctness_metric, faithfulness_metric, relevance_metric],
    )

In [96]:
# Run the evaluation over the first three Q/A pairs in q_a.json,
# using the Weaviate vector retriever to fetch context.
evaluate_rag(retrieve_context_from_weaviate, "q_a.json", num_questions=3)


Processing question: What are the top complaint categories for Wells Fargo?
Answering the question from the retrieved context...
Processing question: What percentage of complaints about Bank of America were resolved?
Answering the question from the retrieved context...
Processing question: How many complaints were filed in June 2024 for Wells Fargo?
Answering the question from the retrieved context...


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 3 test case(s) in parallel: |          |  0% (0/3) [Time Taken: 00:00, ?test case/s]ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
Evaluating 3 test case(s) in parallel: |███▎      | 33% (1/3) [Time Taken: 00:35, 35.79s/test case]ERROR:root:OpenAI rate limit exceeded. Retrying: 2 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 2 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 3 time(s)...
Evaluating 3 test case(s) in parallel: |██████████|100% (3/3) [Time Taken: 02:18, 46.11s/test case]



Metrics Summary

  - ✅ Correctness (GEval) (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output matches the expected output in terms of content and factual accuracy., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4, reason: None, error: None)
  - ❌ Contextual Relevancy (score: 0.875, threshold: 1.0, strict: False, evaluation model: gpt-4, reason: The score is 0.88 because although there were statements in the retrieval context that were irrelevant due to being about JP Morgan Chase, there were numerous relevant statements concerning various problems with Wells Fargo such as issues with 'Checking or savings account', 'Money transfer, virtual currency, or money service', and 'Vehicle loan or lease'., error: None)

For test case:

  - input: What are the top complaint categories for Wells Fargo?
  - actual output: The top complaint categories for Wells Fargo include Checking account, General-p


