In [2]:
import boto3
from crewai import LLM
import pandas as pd
# Bedrock runtime client bound to the us-east-1 region.
bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-east-1")
# Claude 3.5 Sonnet, addressed through Bedrock; this is the model the judge
# prompts are sent to.
llm = LLM(model="bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0")

# Load the evaluation spreadsheet and pull the four columns the judge needs
# into plain Python lists: question, ground answer, generated answer and the
# context text, in that order.
clean = pd.read_excel("nova_pro_optimized_generated_data.xlsx")
questions, ground_answers, generated_answers, contexts = (
    clean[column].tolist()
    for column in ("user_input", "ground_truth", "ai_response", "cleaned_output")
)
l = len(questions)

# Results are appended to this file one question at a time, so start each run
# from an empty file.
filename = "nova_pro_optimized_results.txt"
with open(filename, "w") as f:
    pass  # opening in "w" mode is what creates/empties the file


# Judge prompt template
def create_judge_prompt(question, answer_a, answer_b, context):
    """Build the evaluation prompt that is sent to the judge model.

    Args:
        question: The question that was asked.
        answer_a: The ground (reference) answer.
        answer_b: The AI-generated answer being evaluated.
        context: The tool context that was provided to the agent.

    Returns:
        The prompt text with the four values substituted in, followed by the
        JSON response format the judge is asked to use.
    """
    # Doubled braces are format-string escapes: they come out as the single
    # literal braces of the JSON skeleton.
    template = """
You are an expert in validating the generated answers based on ground answer and contexts provided to the agent based on the data from the vector database. 
Given the following question and their answers, validate the answer and its completeness.

Question: {question}

Ground Answer: {answer_a}

AI Generated Answer: {answer_b}

Context from tool provided to the agent: {context}

Your task is to evaluate the AI-generated answer based on the ground answer and the context provided.
Please respond with reasoning. Give some metrics to evaluate the answer and its completeness.
Return your response in the following JSON format:
{{
    "is_correct": true/false,
    "reasoning": "Your reasoning here",
    "metrics": {{
        "relevance": 0-1,
        "completeness": 0-1,
        "accuracy": 0-1
    }}
}}
"""
    return template.format(
        question=question,
        answer_a=answer_a,
        answer_b=answer_b,
        context=context,
    )
# Evaluate each question: build the judge prompt for a row, send it to the
# LLM, echo the reply and append it to the results file.
# The four lists come from columns of the same spreadsheet, so they are walked
# in lockstep instead of being indexed by position.
rows = zip(questions, ground_answers, generated_answers, contexts)
for i, (question, ground_answer, generated_answer, context) in enumerate(rows):
    prompt = create_judge_prompt(
        question,
        ground_answer,
        generated_answer,
        context,
    )
    print(f"\nEvaluating Question {i}...\n")
    response = llm.call(prompt)
    print(response)
    # Append after every question so results gathered so far are kept if a
    # later call fails. The encoding is explicit because model output can
    # contain non-ASCII characters, which would raise UnicodeEncodeError
    # where the platform default encoding is not UTF-8.
    with open(filename, "a", encoding="utf-8") as file:
        file.write(f"Question {question}:\n")
        file.write(f"Response: {response}\n\n")
    # Report only once the file has been closed, i.e. the data is flushed.
    print(f"Response for Question {i} written to {filename}")



Evaluating Question 0...

{
    "is_correct": true,
    "reasoning": "The AI-generated answer not only covers the core information from the ground answer but significantly expands upon it with valuable additional details. It correctly identifies SendGrid as the email service offered by Cloud Engineering and mentions the need for an API key (through the sub-user request process). The answer goes beyond the basic response by providing:
    1. Clear organization with sections
    2. Availability information
    3. Specific features of the service
    4. Detailed steps for getting started
    5. Relevant links to documentation and the request form
    All information provided aligns with the context and enhances the basic ground answer without contradicting it.",
    "metrics": {
        "relevance": 1.0,
        "completeness": 0.95,
        "accuracy": 1.0
    },
    "explanation_of_metrics": {
        "relevance": "Perfect score as all provided information directly addresses the questi