<a target="_blank" href="https://colab.research.google.com/github/UpstageAI/cookbook/blob/main/Solar-Fullstack-LLM-101/18_fact_check_with_kg.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Fact Checking with Knowledge Graph


In [1]:
! pip3 install -qU langchain langchain-upstage langchain_community python-dotenv duckduckgo-search


## UPSTAGE_API_KEY
To obtain your Upstage API key, follow these steps:

1. Visit the Upstage AI console at <https://console.upstage.ai>.
2. Sign up for an account if you don't already have one.
3. Log in to your account.
4. Navigate to the API key section.
5. Generate your API key.
6. Copy the key and save it securely.

![Console](./figures/console.upstage.ai.jpg)

In [2]:
# @title set API key
import os
import warnings
from pprint import pprint

# Suppress every warning so the cell outputs below stay readable.
warnings.filterwarnings("ignore")

if "google.colab" not in str(get_ipython()):
    # Local run: the key is expected in a .env file, which load_dotenv() reads
    # into the process environment.
    from dotenv import load_dotenv

    load_dotenv()
else:
    # Google Colab run: the key is expected in the Colab Secrets panel and is
    # copied into the process environment from there.
    from google.colab import userdata

    os.environ["UPSTAGE_API_KEY"] = userdata.get("UPSTAGE_API_KEY")

# Stop here with a clear message if neither source provided the key.
assert "UPSTAGE_API_KEY" in os.environ, "Please set the UPSTAGE_API_KEY environment variable"

In [3]:
from typing import List, Dict, Optional, Any, Union
import json

from langchain_upstage import ChatUpstage as Chat
from langchain_community.tools import DuckDuckGoSearchResults

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from typing import Optional, Dict, Union, List, Any

# Shared Upstage chat model; every pipeline function in this notebook uses it as its default `llm`.
solar_pro = Chat(model="solar-pro")


In [4]:
# Sample input for the pipeline: the text whose claims are extracted and verified in the steps that follow.
text_to_check = """
Sung Kim is CEO of UpstageAI and it is founded in 1995.
"""

In [5]:
def extracted_claimed_facts(
    text: str, llm: Optional[Chat] = solar_pro
) -> List[Dict[str, Any]]:
    """
    Ask the language model to list the factual claims made in a piece of text.

    The prompt shows the model two worked examples and requests a JSON array of
    dictionaries with the keys 'entity', 'relation' and 'value'. The model reply
    is parsed with JsonOutputParser and returned as-is.

    Args:
        text (str): The text to extract claims from.
        llm (Optional[Chat]): Chat model that performs the extraction.

    Returns:
        List[Dict[str, Any]]: The parsed model reply, expected to be one dictionary per claimed fact.
    """

    # Role of the model for this task.
    system_message = (
        "system",
        "You are an expert fact extractor. Your task is to analyze the given text and extract a list of claimed facts, focusing on entities and their relationships. Extract precise and specific relations without categorizing them into predefined types.",
    )

    # Task description with few-shot examples; {input_text} is filled in at invoke time.
    extraction_request = (
        "human",
        """Extract the claimed facts from the following text, providing a list of dictionaries. Each dictionary should represent a fact and include keys for 'entity', 'relation', and 'value'. Be specific and precise with the relations.

Examples:
Input: "Albert Einstein developed the theory of relativity in 1915."
Output: [
    {{"entity": "Albert Einstein", "relation": "developed", "value": "theory of relativity"}},
    {{"entity": "theory of relativity", "relation": "developed in", "value": "1915"}}
]

Input: "The Eiffel Tower, completed in 1889, stands at a height of 324 meters."
Output: [
    {{"entity": "Eiffel Tower", "relation": "completed in", "value": "1889"}},
    {{"entity": "Eiffel Tower", "relation": "height", "value": "324 meters"}}
]

Now, extract facts from the following text:
{input_text}""",
    )

    # Final reminder that constrains the reply to bare JSON.
    format_reminder = (
        "human",
        "Respond with a JSON array of fact dictionaries only, without any additional text.",
    )

    extraction_prompt = ChatPromptTemplate.from_messages(
        [system_message, extraction_request, format_reminder]
    )

    # prompt -> model -> JSON parsing, executed in one call.
    extraction_chain = extraction_prompt | llm | JsonOutputParser()
    return extraction_chain.invoke({"input_text": text})

In [6]:
# Step 1 of the pipeline: turn the sample text into entity/relation/value facts.
print("\nStep 1: Extracting claimed facts")
claimed_facts = extracted_claimed_facts(text_to_check)
print(f"Extracted {len(claimed_facts)} claimed facts:")
for fact_number, fact in enumerate(claimed_facts, start=1):
    # Show the raw dictionary first, then the same fact as a numbered sentence.
    print(fact)
    print(f"  {fact_number}. {fact['entity']} {fact['relation']} {fact['value']}")
    print()



Step 1: Extracting claimed facts
Extracted 2 claimed facts:
{'entity': 'Sung Kim', 'relation': 'CEO of', 'value': 'UpstageAI'}
  1. Sung Kim CEO of UpstageAI

{'entity': 'UpstageAI', 'relation': 'founded in', 'value': '1995'}
  2. UpstageAI founded in 1995



In [7]:
def search_context(
    text: str,
    claimed_facts: List[Dict[str, Any]],
    search_tool: DuckDuckGoSearchResults = DuckDuckGoSearchResults(),
    llm: Optional[Chat] = solar_pro,
) -> str:
    """
    Search for relevant information using claimed facts.

    The language model first condenses the text and its claimed facts into a
    short list of search keywords; those keywords are then joined into a single
    query and passed to the search tool.

    Args:
        text (str): The original input text.
        claimed_facts (List[Dict[str, Any]]): The list of extracted claimed facts.
            Each fact must provide the keys 'entity', 'relation' and 'value'.
        search_tool (DuckDuckGoSearchResults): The search tool whose ``run`` method
            is called with the query. The default instance is created once, when
            the function is defined, and is shared between calls.
        llm (Optional[Chat]): The language model used to generate the search keywords.

    Returns:
        str: The relevant context information found from the search, exactly as
        returned by ``search_tool.run``.
    """

    # Step 1: Generate search keywords
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are an expert at generating concise and relevant search keywords. Your task is to analyze the given text and extracted facts, then produce a list of 3-5 search keywords or short phrases that would be most effective for finding additional context and verification information.",
            ),
            (
                "human",
                """Given the following text and extracted facts, generate a list of 3-5 search keywords or short phrases:

Text: {text}

Extracted Facts:
{facts}

Provide only the keywords or short phrases, separated by commas.""",
            ),
        ]
    )

    facts_str = "\n".join(
        f"- {fact['entity']} {fact['relation']} {fact['value']}"
        for fact in claimed_facts
    )

    # Pipe the prompt into the model so the system and human messages keep their
    # roles. `prompt.format(...)` would flatten the whole conversation into one
    # string, which the chat model then receives as a single human message.
    keywords_response = (prompt | llm).invoke({"text": text, "facts": facts_str})

    # Parse the comma-separated keywords from the response, dropping empty entries
    keywords = [kw.strip() for kw in keywords_response.content.split(",") if kw.strip()]

    # Step 2: Perform search using the generated keywords.
    # If the model produced no usable keyword, fall back to the original text
    # rather than sending an empty query to the search tool.
    search_query = " ".join(keywords) or text.strip()

    # Step 3: Return the search results
    return search_tool.run(search_query)

In [8]:
# Step 2 of the pipeline: collect search results to check the claims against.
print("\nStep 2: Searching for relevant context")
relevant_context = search_context(text=text_to_check, claimed_facts=claimed_facts)
pprint(relevant_context)


Step 2: Searching for relevant context
('snippet: SAN JOSE, California, Apr. 16, 2024 - Upstage, a pioneering AI '
 'company specializing in large language models (LLMs) and Document AI, today '
 'announced that it has raised $72 million in a Series B.With this latest '
 'investment, Upstage has now raised over $100 million since its founding in '
 'October 2020, making it the most-funded South Korean AI software company in '
 'history., title: Upstage Raises $72 Million in Series B Funding — Upstage, '
 'link: https://www.upstage.ai/feed/press/upstage-series-b-funding, snippet: '
 '"This SCA highlights our strong partnership with AWS and our shared vision '
 'to help enterprises globally adopt cutting-edge generative AI," said Sung '
 'Kim, CEO and co-founder of Upstage."With the ..., title: Upstage Signs '
 'Multi-Year Strategic Collaboration Agreement with AWS to ..., link: '
 'https://finance.yahoo.com/news/upstage-signs-multi-strategic-collaboration-120000054.html, '
 'snippet: W

In [9]:
def build_kg(
    claimed_facts: List[Dict[str, Any]],
    context: str,
    llm: Optional[Chat] = solar_pro,
) -> Dict[str, Any]:
    """
    Have the language model build a knowledge graph from the search context.

    The claimed facts are rendered as a bullet list and handed to the model only
    as hints about which relations to look for; the prompt instructs the model to
    take every piece of information from the context and to attach a source quote
    to each relation. The model reply is parsed with JsonOutputParser.

    Args:
        claimed_facts (List[Dict[str, Any]]): Extracted facts, each with the keys
            'entity', 'relation' and 'value'.
        context (str): The context information retrieved from the search.
        llm (Optional[Chat]): Chat model that constructs the graph.

    Returns:
        Dict[str, Any]: The parsed model reply, requested in the form
        {entity: {relation: {"value": ..., "source": ...}}}.
    """

    # Role of the model for this task.
    system_message = (
        "system",
        "You are an expert in building knowledge graphs. Your task is to analyze the given context and construct a knowledge graph, using the claimed facts only as inspiration for the schema without assuming their truth. Include source information for each fact.",
    )

    # Task description; {context} and {claimed_facts} are filled in at invoke time.
    construction_request = (
        "human",
        """Given the following context and claimed facts, construct a knowledge graph. Assume all information in the context is true, but use the claimed facts only as hints for the types of relations to look for.

Context:
{context}

Claimed Facts (use only as schema hints):
{claimed_facts}

Construct the knowledge graph as a JSON object where keys are entities and values are dictionaries of relations. Each relation should have a "value" and a "source" (a relevant quote from the context).

Example format:
{{
  "Entity1": {{
    "relation1": {{
      "value": "Value1",
      "source": "Relevant quote from context"
    }},
    "relation2": {{
      "value": "Value2",
      "source": "Another relevant quote"
    }}
  }},
  "Entity2": {{
    ...
  }}
}}

Ensure that:
1. All information comes from the context, not the claimed facts.
2. Each fact has a source quote from the context.
3. The schema is inspired by, but not limited to, the relations in the claimed facts.

Construct the knowledge graph:""",
    )

    kg_prompt = ChatPromptTemplate.from_messages([system_message, construction_request])

    # prompt -> model -> JSON parsing
    kg_chain = kg_prompt | llm | JsonOutputParser()

    # One "- entity relation value" bullet per claimed fact.
    fact_bullets = []
    for fact in claimed_facts:
        fact_bullets.append(f"- {fact['entity']} {fact['relation']} {fact['value']}")
    schema_hints = "\n".join(fact_bullets)

    return kg_chain.invoke({"context": context, "claimed_facts": schema_hints})


In [10]:
# Step 3 of the pipeline: build a knowledge graph from the search context.
print("\nStep 3: Building knowledge graph")
kg = build_kg(claimed_facts=claimed_facts, context=relevant_context)
print(f"Built knowledge graph with {len(kg)} entities")
pprint(kg)


Step 3: Building knowledge graph
Built knowledge graph with 3 entities
{'AWS': {'partnership': {'source': 'This SCA highlights our strong partnership '
                                   'with AWS and our shared vision to help '
                                   'enterprises globally adopt cutting-edge '
                                   'generative AI.',
                         'value': 'AWS has a multi-year strategic '
                                  'collaboration agreement with UpstageAI.'}},
 'Sung Kim': {'position': {'source': 'Claimed Facts (use only as schema '
                                     'hints): - Sung Kim CEO of UpstageAI',
                           'value': 'CEO and co-founder of UpstageAI'}},
 'UpstageAI': {'founding_year': {'source': 'With this latest investment, '
                                           'Upstage has now raised over $100 '
                                           'million since its founding in '
                                       

In [11]:
def verify_facts(
    claimed_facts: List[Dict[str, Any]],
    context: str,
    kg: Dict[str, Any],
    confidence_threshold: float,
    llm: Optional[Chat] = solar_pro,
) -> Dict[str, Dict[str, Union[str, float, bool]]]:
    """
    Verify the claimed facts against the knowledge graph and context.

    Each claimed fact is sent to the language model in its own request, together
    with the serialized knowledge graph and the raw context. The model answers
    with a JSON object holding "verified", "confidence" and "explanation".

    A fact only stays verified when the model's confidence reaches
    ``confidence_threshold``. A positive verdict whose confidence is missing,
    not numeric, or below the threshold is downgraded to ``"verified": False``
    and a note describing the downgrade is appended to the explanation.
    Negative verdicts are returned unchanged.

    Args:
        claimed_facts (List[Dict[str, Any]]): The list of extracted claimed facts.
            Each fact must provide the keys 'entity', 'relation' and 'value'.
        context (str): The context information retrieved from the search.
        kg (Dict[str, Any]): The constructed knowledge graph (must be JSON-serializable).
        confidence_threshold (float): Minimum confidence a positive verdict needs
            in order to be kept as verified.
        llm (Optional[Chat]): The language model to use for verification.

    Returns:
        Dict[str, Dict[str, Union[str, float, bool]]]: Verified facts with confidence scores.
        The structure is: {fact_id: {"claimed": str, "verified": bool, "confidence": float, "explanation": str}}
        where fact_id is the zero-based position of the fact in ``claimed_facts``, as a string.
    """

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are an expert fact-checker. Your task is to verify a claimed fact against a knowledge graph and context information. Provide a verification result, confidence score, and explanation for the fact.",
            ),
            (
                "human",
                """Verify the following claimed fact using the provided knowledge graph and context. Determine if it's verified, assign a confidence score (0.0 to 1.0), and provide a brief explanation.

Claimed Fact: {entity} {relation} {value}

Knowledge Graph:
{kg}

Context:
{context}

Provide a JSON object with the following structure:
{{
  "verified": bool,
  "confidence": float,
  "explanation": string
}}

Ensure that:
1. The verification is based on the information in the knowledge graph and context.
2. The confidence score reflects the certainty of the verification (1.0 for absolute certainty, lower for less certainty).
3. The explanation briefly justifies the verification decision and confidence score.

Provide the verification result:""",
            ),
        ]
    )

    output_parser = JsonOutputParser()
    chain = prompt | llm | output_parser

    # Serialize the graph once for all requests. ensure_ascii=False keeps
    # non-ASCII text readable in the prompt instead of turning it into \uXXXX escapes.
    kg_str = json.dumps(kg, indent=2, ensure_ascii=False)
    verified_facts = {}

    for i, fact in enumerate(claimed_facts):
        claim = f"{fact['entity']} {fact['relation']} {fact['value']}"

        verification_result = chain.invoke(
            {
                "entity": fact["entity"],
                "relation": fact["relation"],
                "value": fact["value"],
                "kg": kg_str,
                "context": context,
            }
        )

        entry = {"claimed": claim, **verification_result}
        # Keep the claim text authoritative even if the model echoed a
        # "claimed" key of its own in its JSON answer.
        entry["claimed"] = claim

        # The model may omit the confidence or return something non-numeric;
        # treat that as "confidence unknown".
        try:
            confidence = float(entry.get("confidence"))
        except (TypeError, ValueError):
            confidence = None

        # Apply the threshold: a positive verdict without sufficient confidence
        # does not count as verified.
        if entry.get("verified") and (
            confidence is None or confidence < confidence_threshold
        ):
            entry["verified"] = False
            entry["explanation"] = (
                f"{entry.get('explanation', '')} "
                f"[Downgraded to unverified: confidence {entry.get('confidence')} "
                f"is below the threshold {confidence_threshold}.]"
            ).strip()

        verified_facts[str(i)] = entry

    return verified_facts

In [12]:
# Step 4 of the pipeline: check every claimed fact against the knowledge graph and context.
print("\nStep 4: Verifying facts")
verified_facts = verify_facts(
    claimed_facts, relevant_context, kg, confidence_threshold=0.7
)
print(f"Verified {len(verified_facts)} facts:")

for fact_id, result in verified_facts.items():
    print(f"  Fact {fact_id}:")
    # Print the report fields in a fixed order; values are shown in full.
    for label, field in (
        ("Claimed", "claimed"),
        ("Verified", "verified"),
        ("Confidence", "confidence"),
        ("Explanation", "explanation"),
    ):
        print(f"    {label}: {result[field]}")


Step 4: Verifying facts
Verified 2 facts:
  Fact 0:
    Claimed: Sung Kim CEO of UpstageAI
    Verified: True
    Confidence: 1.0
    Explanation: The knowledge graph clearly states that Sung Kim holds the position of 'CEO and co-founder of UpstageAI'. Additionally, multiple sources in the context confirm this information, including a direct quote from Sung Kim referring to himself as the CEO of Upstage.
  Fact 1:
    Claimed: UpstageAI founded in 1995
    Verified: False
    Confidence: 1.0
    Explanation: The claimed fact is not verified. According to the knowledge graph, UpstageAI was founded in 2020, not in 1995. This information is also consistent with the context information provided.


In [13]:
def add_fact_check_comments_to_text(text, verified_facts, llm=solar_pro):
    """
    Ask the language model to annotate a text inline with fact-check verdicts.

    Args:
        text: The original text to annotate.
        verified_facts: Mapping of fact id to a verification dictionary; each
            dictionary must contain a "claimed" key, which is used to index the
            verdicts handed to the model.
        llm: Chat model that writes the annotated text.

    Returns:
        The model reply as a plain string (the text with
        "[Fact: <STATUS> (Confidence: <CONFIDENCE>) - <BRIEF_EXPLANATION>]" annotations requested).
    """
    # Index the verdicts by the claim they refer to; a later verdict for the
    # same claim text replaces an earlier one.
    fact_map = {}
    for verdict in verified_facts.values():
        fact_map[verdict["claimed"]] = verdict

    # Annotation format the model has to follow.
    system_instructions = """You are an AI assistant tasked with adding fact-check annotations to a given text.
    For each fact in the text that has been verified, add an inline annotation 
    right after the fact, using the following format:
    [Fact: <STATUS> (Confidence: <CONFIDENCE>) - <BRIEF_EXPLANATION>]
    Where <STATUS> is True, False, or Unsure, <CONFIDENCE> is the confidence score,
    and <BRIEF_EXPLANATION> is a very short explanation.
    """

    # The text and the verdicts are substituted into this request at invoke time.
    annotation_request = """Original text:
    {text}

    Verified facts:
    {fact_map}

    Please add fact-check annotations to the original text based on the verified facts.
    """

    annotation_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_instructions),
            ("human", annotation_request),
        ]
    )

    # prompt -> model -> plain string
    annotation_chain = annotation_prompt | llm | StrOutputParser()
    return annotation_chain.invoke({"text": text, "fact_map": fact_map})

In [14]:
# Step 5 (final) of the pipeline: write the verdicts back into the original text as inline annotations.
print("\nStep 5: Adding fact-check annotations to the original text")
fact_checked_text = add_fact_check_comments_to_text(
    text=text_to_check, verified_facts=verified_facts
)
print("Fact-checked text generated")
pprint(fact_checked_text)


Step 5: Adding fact-check annotations to the original text
Fact-checked text generated
('Sung Kim is CEO of UpstageAI [Fact: True (Confidence: 1.0) - The knowledge '
 'graph and multiple sources confirm this information] and it is founded in '
 '1995 [Fact: False (Confidence: 1.0) - UpstageAI was founded in 2020, not in '
 '1995].')
