In [14]:
import traceback
import json
import tomllib as tomlib
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm

# from huggingface_hub import login
from llama_index.core import Settings, VectorStoreIndex, get_response_synthesizer, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
# from llama_index.core.postprocessor import SimilarityPostprocessor, NERPIINodePostprocessor, PrevNextNodePostprocessor
# from llama_index.core.tools import QueryEngineTool
# from llama_index.core.agent import AgentRunner, ReActAgentWorker
# from llama_index.core.base.llms.types import ChatMessage
# from llama_index.llms.huggingface import HuggingFaceLLM
# from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore
# from llama_index.agent.openai import OpenAIAgentWorker
from llama_index.llms.google_genai import GoogleGenAI
# from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval import evaluate
from neo4j import GraphDatabase

In [2]:
load_dotenv()

True

In [3]:
DATASET_SIZE = 3

In [4]:
i = 9
hs = True
llm_name = 'gemini-1.5-pro'
top_k = 3
chunk_size = 256
preparation = 'TokenTextSplitter'

In [None]:
###### parse configuration
with open('./config.toml', "rb") as file:
    cfg = tomlib.load(file)

cfg['config']['rag']['setup']['hybrid_search'] = hs
# cfg['config']['rag']['models']['llm_openai'] = "llama-3.2-11B-vision-instruct"
cfg['config']['rag']['setup']['similarity_top_k'] = top_k
cfg['config']['rag']['setup']['chunk_size'] = chunk_size
cfg['config']['rag']['setup']['preparation'] = preparation

print(f'{cfg=}')

###### load data
dataset = json.load(open('./data/q&a.json', 'r'))

In [6]:
llm = GoogleGenAI(
    model="gemini-1.5-pro",
    api_key=cfg['config']['env']['GEMINI_API_KEY'],
    temperature=cfg['config']['rag']['models']['temperature']
)
embed_model = OpenAIEmbedding(
    model=cfg['config']['rag']['models']['em_openai'],
    api_key=cfg['config']['env']['OPENAI_API_KEY'],
    dimensions=cfg['config']['rag']['setup']['embedding_dimension']
)

Settings.llm = llm
Settings.embed_model = embed_model 

## Predict

In [7]:
 # docstore = SimpleDocumentStore()
vector_store = Neo4jVectorStore(
    username=cfg['config']['env']['NEO4J_USER'],
    password=cfg['config']['env']['NEO4J_PASSWORD'],
    url=cfg['config']['env']['NEO4J_URI'],
    embedding_dimension=cfg['config']['rag']['setup']['embedding_dimension'],
    distance_strategy=cfg['config']['rag']['setup']['distance_strategy'],
    index_name=cfg['config']['rag']['setup']['index_name'],
    text_node_property=cfg['config']['rag']['setup']['text_node_property'],
    hybrid_search=cfg['config']['rag']['setup']['hybrid_search'],
)

db_index = VectorStoreIndex.from_vector_store(vector_store)

In [7]:
retriever = VectorIndexRetriever(
    index=db_index,
    similarity_top_k=cfg['config']['rag']['setup']['similarity_top_k'],
    vector_store_query_mode=cfg['config']['rag']['setup']['vector_store_query_mode']
)

# custom response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode=cfg['config']['rag']['setup']['response_mode']
)

# combine custom query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

###### configure setup
setup = query_engine

In [8]:
###### predict
predictions = []
for item in tqdm(dataset[:DATASET_SIZE]):
    try:
        response = setup.query(item.get('question'))
        predictions.append(response)
    except Exception as e:
        print(e)
        traceback.format_exc()

print(f'{predictions=}')

100%|██████████| 1/1 [00:02<00:00,  2.45s/it]

predictions=[Response(response='Returns are accepted within 30 days of purchase.\n', source_nodes=[NodeWithScore(node=TextNode(id_='argo-team-9', embedding=None, metadata={'file_path': 'data/companies/argo-team.txt', 'file_name': 'argo-team.txt', 'file_type': 'text/plain', 'file_size': 6400, 'creation_date': '2025-03-24', 'last_modified_date': '2025-03-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='69335b38-18c9-4061-a9b3-9cd02d9b2138', node_type='4', metadata={'file_path': 'data/companies/argo-team.txt', 'file_name': 'argo-team.txt', 'file_type': 'text/plain', 'file_size': 6400, 'creation_date': '2025-03-24', 'last_modified_date': '2025-03-24'}, hash='2e292f2e0a2ffd3061d1861b1a0b9a614e72e04883d82143




In [11]:
output = predictions[0].response

context = [node.get_content() for node in predictions[0].source_nodes]
output, context

('Returns are accepted within 30 days of purchase.\n',
 ['500.00 |\n| ATE-403 | GPS-Guided Fertilizer Spreader FlexSpread 400 | Machinery | Precision spreading, GPS control | $12,500.00 |\n\n---\n\n## Policy Summaries:\n\n### Returns & Refunds Policy:\nAgro Team Inc. accepts returns within 30 days of purchase. Products returned must be in original packaging and accompanied by an original purchase receipt. Agro Team Inc. does not accept opened, used, or damaged products. Refunds will be processed within 5-7 business days.\n\n### Delivery & Shipping Policy:\nAgro Team Inc. provides nationwide delivery within 10 business days. Expedited shipping (3-5 business days) is available at additional charges. Orders exceeding $15,000.00 qualify for free standard shipping.\n\n### Customer Support Guidelines:\nAgro Team Inc. provides customer support Monday–Saturday between 08:00 AM–06:00 PM CST. Technical issues should be communicated to the assigned technical support supervisor (Monica L. Reyes), 

In [16]:
predictions[0].source_nodes[3].score

IndexError: list index out of range

## Evaluate

In [7]:
predictions = json.load(open(f'./predictions/i_{i}_hs_{hs}_llm_{llm_name}_topk_{top_k}_cs_{chunk_size}_prep_{preparation}.json', 'r'))

In [15]:
evaluations = []
test_cases = []
for j, item in enumerate(tqdm(dataset[:DATASET_SIZE])):
    # define test case
    test_case = LLMTestCase(
        input=item.get('question'),
        actual_output=predictions[j]['response'],
        expected_output=item.get('answer'),
        retrieval_context=predictions[j]['source_nodes']
    )
    test_cases.append(test_case)

answer_relevancy_metric = AnswerRelevancyMetric(model=cfg['config']['rag']['models']['llm_openai'])
faithfulness_metric = FaithfulnessMetric(model=cfg['config']['rag']['models']['llm_openai'])
contextual_relevancy_metric = ContextualRelevancyMetric(model=cfg['config']['rag']['models']['llm_openai'])

# eval_dataset = EvaluationDataset(test_cases)
# results = eval_dataset.evaluate([answer_relevancy_metric, faithfulness_metric, contextual_relevancy_metric])

results = evaluate(
    test_cases=test_cases, 
    metrics=[answer_relevancy_metric, faithfulness_metric, contextual_relevancy_metric],
    max_concurrent=50
)

# answer_relevancy_metric.measure(test_case)
# faithfulness_metric.measure(test_case)
# contextual_relevancy_metric.measure(test_case)
    
# evaluations.append({
#     "answer_relevancy": {
#         "reason": answer_relevancy_metric.reason,
#         "score": answer_relevancy_metric.score
#     },
#     "faithfulness": {
#         "reason": faithfulness_metric.reason,
#         "score": faithfulness_metric.score
#     },
#     "contextual_relevancy": {
#         "reason": contextual_relevancy_metric.reason,
#         "score": contextual_relevancy_metric.score
#     }
# })


100%|██████████| 3/3 [00:00<00:00, 18504.28it/s]


Evaluating 3 test case(s) in parallel: |██████▋   | 67% (2/3) [Time Taken: 00:15,  7.73s/test case]ERROR:root:OpenAI Error: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=16384, prompt_tokens=1056, total_tokens=17440, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)) Retrying: 1 time(s)...
Evaluating 3 test case(s) in parallel: |██████████|100% (3/3) [Time Taken: 03:38, 72.82s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because the response directly addressed the mission of Blue Horizon Energy without including any irrelevant statements., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because there are no contradictions between the actual output and the retrieval context, indicating perfect alignment., error: None)
  - ❌ Contextual Relevancy (score: 0.07692307692307693, threshold: 0.5, strict: False, evaluation model: gpt-4o-mini, reason: The score is 0.08 because the retrieval context lacks specificity about the mission of Blue Horizon Energy, with multiple statements noting that they do not define it directly, such as 'The statement does not specifically state the mission of Blue Horizon Energy.' The relevant statement about providing clean, sustainable energy solutions touc




In [18]:
results

EvaluationResult(test_results=[TestResult(name='test_case_2', success=False, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the response directly addressed the mission of Blue Horizon Energy without including any irrelevant statements.', strict_mode=False, evaluation_model='gpt-4o-mini', error=None, evaluation_cost=0.00025065, verbose_logs='Statements:\n[\n    "To innovate and provide clean energy solutions.",\n    "To provide sustainable energy solutions.",\n    "Support a greener future."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "idk",\n        "reason": null\n    }\n]'), MetricData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions between the actual output and the retrieval context, indicating perfe

In [23]:
results.test_results[0].metrics_data

[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the response directly addressed the mission of Blue Horizon Energy without including any irrelevant statements.', strict_mode=False, evaluation_model='gpt-4o-mini', error=None, evaluation_cost=0.00025065, verbose_logs='Statements:\n[\n    "To innovate and provide clean energy solutions.",\n    "To provide sustainable energy solutions.",\n    "Support a greener future."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "idk",\n        "reason": null\n    }\n]'),
 MetricData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions between the actual output and the retrieval context, indicating perfect alignment.', strict_mode=False, evaluation_model='gpt-4o-mini', error=None, evaluation

In [None]:

{
    "answer_relevancy": {
        "reason": answer_relevancy_metric.reason,
        "score": answer_relevancy_metric.score
    },
    "faithfulness": {
        "reason": faithfulness_metric.reason,
        "score": faithfulness_metric.score
    },
    "contextual_relevancy": {
        "reason": contextual_relevancy_metric.reason,
        "score": contextual_relevancy_metric.score
    }
}

In [None]:
###### save the results
with open(f'./evaluations/i_{j}_hs_{hs}_llm_{llm_name}_topk_{top_k}_cs_{chunk_size}_prep_{preparation}.json', 'w') as f:
    json.dump(evaluations, f)