In [1]:
# -*- coding: utf-8 -*-
import ast
import os

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

from metrics.hallucination.metric import HallucinationMetric
from metrics.llm_test_case import LLMTestCase

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where OPENAI_API_KEY (read later via
# os.getenv) comes from — confirm the .env file is present before running.
load_dotenv()

True

In [2]:
# Load the filtered question set; the first CSV column is used as the index.
data = pd.read_csv("OKG/filtered_questions_50_v3.csv", index_col=0)

# The entity columns are stored in the CSV as stringified Python literals
# (dbpedia_entities is later consumed as a dict). Parse them with
# ast.literal_eval rather than eval so that a malformed or malicious cell
# raises an error instead of executing arbitrary code.
for column in ("dbpedia_entities", "dbpedia_entities_re"):
    data[column] = data[column].apply(ast.literal_eval)

# Load the generated answers and keep the zero-shot column as a plain list.
df_res = pd.read_csv("OKG/generation/50_v3.csv", index_col=0)
res_list = df_res["zero_shot"].to_list()

In [39]:
# Build one LLMTestCase per question, using the text files of the linked
# DBpedia entities as retrieval context.
test_cases = []
for index, sample in data.iterrows():
    # The last path segment of each entity URI doubles as the article file name.
    entities = [uri.split("/")[-1] for uri in sample["dbpedia_entities"].values()]

    # Read every article inside a context manager so the file handle is closed
    # immediately instead of leaking until garbage collection.
    # NOTE(review): files are read with the platform default encoding, as
    # before — consider passing encoding="utf-8" if the dumps are UTF-8.
    articles = []
    for entity in entities:
        with open(f"OKG/wikipedia/{entity}.txt", "r") as article_file:
            articles.append(article_file.read())

    # Articles are concatenated without a separator (unchanged behavior) and
    # then split into lines; each line becomes one retrieval-context entry.
    retrieved_context = "".join(articles).split("\n")

    question = sample["question"]

    # NOTE(review): `index` is the DataFrame index label but is used here as a
    # list position; this assumes the index is 0..len-1 and aligned with
    # res_list — confirm against the CSV files.
    test_cases.append(
        LLMTestCase(
            input=question,
            actual_output=res_list[index],
            retrieval_context=retrieved_context,
            expected_output="",
        )
    )

In [40]:
# Configure the hallucination metric; the API key is read from the environment.
metric = HallucinationMetric(
    threshold=0.5,
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    include_reason=True,
    strict_mode=False,
)

# Collect every score/reason in addition to printing it, so results are not
# lost when the (slow) loop is interrupted and can be aggregated afterwards,
# e.g. with pd.DataFrame(results). Entries are in the same order as test_cases.
results = []
for test_case in tqdm(test_cases):
    metric.measure(test_case)
    results.append({"score": metric.score, "reason": metric.reason})
    print(metric.score)
    print(metric.reason)

  0%|          | 0/50 [00:00<?, ?it/s]

Event loop is already running. Applying nest_asyncio patch to allow async execution...


  2%|▏         | 1/50 [00:36<29:57, 36.68s/it]

**************************************************
Hallucination Verbose Logs
**************************************************

Truths:
[
    "Socrates was a Greek philosopher from Athens.",
    "Socrates is credited as the founder of Western philosophy.",
    "Socrates is among the first moral philosophers of the ethical tradition of thought.",
    "Socrates authored no texts.",
    "Socrates is known mainly through the posthumous accounts of classical writers.",
    "His students Plato and Xenophon wrote accounts of Socrates.",
    "These accounts are written as dialogues.",
    "Socrates was a polarizing figure in Athenian society.",
    "In 399 BC, Socrates was accused of impiety and corrupting the youth.",
    "Socrates was sentenced to death after a trial that lasted a day.",
    "Socrates spent his last day in prison.",
    "Socrates refused offers to help him escape from prison.",
    "Plato's dialogues are among the most comprehensive accounts of Socrates to survive from ant

  4%|▍         | 2/50 [01:03<24:53, 31.11s/it]

**************************************************
Hallucination Verbose Logs
**************************************************

Truths:
[
    "The Roman Empire was the state ruled by the Romans following Octavian's assumption of sole rule under the Principate in 27 BC.",
    "The Roman Empire included territories in Europe, North Africa, and Western Asia.",
    "The rulers of the Roman Empire were known as emperors.",
    "The fall of the Western Roman Empire in 476 AD conventionally marks the end of classical antiquity and the beginning of the Middle Ages.",
    "By 100 BC, Rome had expanded its rule to most of the Mediterranean and beyond.",
    "The Roman Empire was severely destabilized by civil wars and political conflicts, which culminated in the victory of Octavian over Mark Antony and Cleopatra at the Battle of Actium in 31 BC.",
    "In 27 BC, the Roman Senate granted Octavian overarching military power (imperium) and the new title of Augustus.",
    "The first two centuries

  6%|▌         | 3/50 [01:46<28:19, 36.16s/it]

**************************************************
Hallucination Verbose Logs
**************************************************

Truths:
[
    "John Fitzgerald Kennedy was born on May 29, 1917.",
    "John Fitzgerald Kennedy died on November 22, 1963.",
    "John Fitzgerald Kennedy is often referred to as JFK.",
    "JFK was an American politician.",
    "JFK served as the 35th president of the United States.",
    "JFK served as the president from 1961 until his assassination in 1963.",
    "JFK was the youngest person elected president.",
    "JFK served during the Cold War.",
    "Kennedy's foreign policy largely concerned relations with the Soviet Union and Cuba.",
    "Kennedy was a Democrat.",
    "Kennedy represented Massachusetts in both houses of the United States Congress prior to his presidency.",
    "Kennedy was born in Brookline, Massachusetts.",
    "Kennedy graduated from Harvard University in 1940.",
    "Kennedy joined the U.S. Naval Reserve in 1941.",
    "During Wo

  6%|▌         | 3/50 [02:13<34:55, 44.58s/it]


KeyboardInterrupt: 

**************************************************
Hallucination Verbose Logs
**************************************************

Truths:
[
    "The New Deal was a series of programs, public work projects, financial reforms, and regulations enacted by President Franklin D. Roosevelt in the United States between 1933 and 1938 to rescue the U.S. from the Great Depression.",
    "The New Deal included new constraints and safeguards on the banking industry and efforts to re-inflate the economy after prices had fallen sharply.",
    "New Deal programs included both laws passed by Congress as well as presidential executive orders during the first term of the presidency of Franklin D. Roosevelt.",
    "The programs focused on what historians refer to as the '3 R's': relief for the unemployed and for the poor, recovery of the economy back to normal levels, and reform of the financial system to prevent a repeat depression.",
    "The New Deal produced a political realignment, making the Democra

In [1]:
import os
from metrics.hallucination.metric import HallucinationMetric
from metrics.llm_test_case import LLMTestCase
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Minimal single-example check of the metric: the output restates the context,
# so no contradiction is expected.
test_case = LLMTestCase(
    input="",
    actual_output="We offer a 30-day full refund at no extra cost. We have a pair of shoes",  # extract claims
    # Passed as a list of context strings rather than a bare string, matching
    # the batch evaluation in this file, which also passes a list.
    retrieval_context=[
        "All customers are eligible for a 30 day full refund at no extra cost. We have a pair of shoes."
    ],
)

# Configure the hallucination metric; the API key is read from the environment.
metric = HallucinationMetric(
    threshold=0.5,
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    include_reason=True,
    strict_mode=False,
)

# Run the metric once and show the resulting score and explanation.
metric.measure(test_case)
print(metric.score)
print(metric.reason)

Event loop is already running. Applying nest_asyncio patch to allow async execution...
**************************************************
Hallucination Verbose Logs
**************************************************

Truths:
[
    "All customers are eligible for a 30 day full refund at no extra cost.",
    "The entity has a pair of shoes."
]

Claims:
[
    "We offer a 30-day full refund at no extra cost.",
    "We have a pair of shoes."
]

Verdicts:
[
    {
        "verdict": "yes",
        "reason": "The actual output claims a 30-day full refund at no extra cost, which is correct as the retrieval context states it."
    },
    {
        "verdict": "yes",
        "reason": "The actual output claims to have a pair of shoes, which is correct as the retrieval context states it."
    }
]

Score: 1.0
Reason: The score is 1.00 because there are no contradictions between the actual output and the retrieval context. Great job maintaining perfect faithfulness!

1.0
The score is 1.00 because the