### Fact Checker
- Receives a JSONL file of facts (one JSON object per line) and loops through them
- For each fact, queries the knowledge base tool first to check it
- If the knowledge base is insufficient, falls back to the web search tool

In [15]:
#import dependencies
import requests
from langchain.agents import create_agent
from langchain.messages import HumanMessage
from langchain.tools import tool

import os
import json
from typing import List, Literal, TypedDict
from pydantic import BaseModel, Field

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage

import os
from dotenv import load_dotenv

In [16]:
# LangChain has its own callback system, and Langfuse listens
# to those callbacks to record what your chains and LLMs are doing.
from langfuse.langchain import CallbackHandler

# Shared handler instance; run_fact_checker passes it through
# config={"callbacks": [...]} on its agent and structured-output calls.
# NOTE(review): this is created before load_dotenv() runs in the next cell —
# if the Langfuse credentials live only in .env, confirm they are already
# present in the environment at this point.
langfuse_handler = CallbackHandler()

In [17]:
# Read the .env file so its entries become visible through os.environ.
load_dotenv()

# Provider credentials and endpoint; each is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SILICON_FLOW_BASE_URL = os.environ.get("SILICON_FLOW_BASE_URL")

In [18]:
# Locations of the input facts, the fact-checking output and the source PDFs.
# All of them sit under the same data directory of the repository checkout.
_REPO_DATA_DIR = r"C:\Users\HP\Documents\repos\essay-checker-agentic-rag\data"

FACTS_JSONL_PATH = _REPO_DATA_DIR + r"\processed\extracted_facts.jsonl"
ALLOWED_DOMAINS = ""  # can set to wikipedia domain if want to limit
FACT_CHECKING_OUTPUT_PATH = _REPO_DATA_DIR + r"\processed\fact_checking_output.json"
DOCUMENT_FOLDER_PATH = _REPO_DATA_DIR + r"\knowledge_base"

In [19]:
# Load every PDF that sits directly in the knowledge-base folder,
# parsing each file with PyPDFLoader.
loader = DirectoryLoader(
    DOCUMENT_FOLDER_PATH,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
docs = loader.load()
docs[:2]  # preview first two documents

[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2025-12-20T19:55:41+08:00', 'author': 'hp404sk7@outlook.com', 'moddate': '2025-12-20T19:55:41+08:00', 'source': 'C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\knowledge_base\\AI_and_assessment_in_higher_education.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='AI and assessment in higher education \nhttps://www.timeshighereducation.com/campus/ai-and-assessment-higher-\neducation  \nThreat or opportunity? Advice for using, managing and embedding artificial intelligence \nin university assessment, skills development and task design \nNo sooner had generative AI (GenAI) tools, such as ChatGPT, ignited fears in universities \nabout risk to assessment practices and academic integrity, than academics began \nworking out how to embrace it to save time and enrich student skills such as critical \nthinking and analysis. This has required considerat

In [20]:
# Cut the loaded documents into chunks of at most 500 characters,
# with 50 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
splits = text_splitter.split_documents(docs)
splits[:2]  # show first two splits

[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2025-12-20T19:55:41+08:00', 'author': 'hp404sk7@outlook.com', 'moddate': '2025-12-20T19:55:41+08:00', 'source': 'C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\knowledge_base\\AI_and_assessment_in_higher_education.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='AI and assessment in higher education \nhttps://www.timeshighereducation.com/campus/ai-and-assessment-higher-\neducation  \nThreat or opportunity? Advice for using, managing and embedding artificial intelligence \nin university assessment, skills development and task design \nNo sooner had generative AI (GenAI) tools, such as ChatGPT, ignited fears in universities \nabout risk to assessment practices and academic integrity, than academics began'),
 Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2025-12-20T19:55:41+08:00', 'a

In [21]:
#define embeddings model
#SiliconFlow hosts open-source embedding models that can be used with LangChain
# The OpenAI-compatible embeddings client is pointed at the SiliconFlow
# endpoint through base_url and requests the BAAI/bge-m3 model.
embeddings = OpenAIEmbeddings(
    model="BAAI/bge-m3",
    api_key=OPENAI_API_KEY,
    base_url=SILICON_FLOW_BASE_URL,
    # Crucial for SiliconFlow/Local providers to avoid dimension errors
    # NOTE(review): this flag presumably disables the client-side
    # tokenisation / context-length handling that non-OpenAI backends do not
    # accept — confirm against the langchain_openai documentation.
    check_embedding_ctx_length=False,
    # NOTE(review): presumably the number of texts sent per embedding request;
    # verify the provider's batch limit.
    chunk_size=64
)

In [22]:
# Open (or build) the persisted Chroma collection and expose it as a retriever.
VECTOR_DATABASE_PATH = "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\chroma_db"

# Settings that are identical whichever way the store is obtained.
_chroma_settings = {
    "collection_name": "essay_kb",
    "persist_directory": VECTOR_DATABASE_PATH,
}

if not os.path.exists(VECTOR_DATABASE_PATH):
    # Nothing on disk yet: embed the splits and persist a new collection.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        **_chroma_settings,
    )
else:
    # The directory already exists: reopen it instead of re-embedding.
    vectorstore = Chroma(
        embedding_function=embeddings,
        **_chroma_settings,
    )

# Each query returns the two closest chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [23]:
#define tool used to search knowledge base
# NOTE: the docstring below is what the agent sees as the tool description,
# so it is kept short and mentions the output format the agent can rely on.
@tool
def search_knowledge_base(query: str) -> str:
    """Search the internal knowledge base of relevant essay documents.

    Each returned excerpt starts with a "[Source: <document_name>, page <page_number>]" line."""
    retrieved = retriever.invoke(query)
    if not retrieved:
        # Give the agent an explicit signal rather than an empty tool message.
        return "No relevant passages were found in the knowledge base."

    sections = []
    for doc in retrieved:
        meta = doc.metadata or {}

        # The stored "source" is a full file path; keep only the file name.
        # Both separators are handled so Windows paths also work elsewhere.
        source = str(meta.get("source", "unknown document"))
        document_name = source.replace("\\", "/").rsplit("/", 1)[-1]

        # Prefer the human-readable page label; otherwise convert the
        # zero-based "page" index written by the PDF loader to a 1-based number.
        page = meta.get("page_label")
        if page is None:
            page_index = meta.get("page")
            page = page_index + 1 if isinstance(page_index, int) else "unknown"

        # The system prompt requires "<document_name>, page <page_number>" as
        # the source, so that information must be part of the tool output.
        sections.append(
            f"[Source: {document_name}, page {page}]\n{doc.page_content}"
        )

    return "\n\n".join(sections)

In [27]:
# Structured verdict produced for a single checked statement.
class FactEvaluation(BaseModel):
    # The claim exactly as it appeared in the essay.
    statement: str = Field(
        description="The verbatim excerpt of the statement",
    )
    # One of the three allowed verdict labels.
    correctness_score: Literal["correct", "wrong", "undetermined"] = Field(
        description="The final verdict",
    )
    # Short justification for the verdict.
    summary_description: str = Field(
        description="Summary of why this verdict was reached",
    )
    #search_results: str = Field(description="A short verbatim of the search results")
    # Where the deciding evidence came from (knowledge-base document or URL).
    source_document: str = Field(
        description="One source document - if it is from knowledge base, output the *document name* and *page number*, if it is from web, output the *URL*",
    )

In [25]:
# System prompt for the fact-checking agent.
# The output schema is embedded as real JSON (json.dumps) rather than the
# Python dict repr, so the "valid JSON ... matches this schema" instruction
# shows the model double-quoted JSON instead of single-quoted Python syntax.
# NOTE: the schema is captured when this f-string is evaluated; re-run this
# cell after changing FactEvaluation.
system_prompt = f"""
You are a fact-checking assistant that verifies a single factual statement.

You MUST follow the tool usage and decision rules exactly.

====================
TOOL USAGE RULES
====================

1. You MUST call `search_knowledge_base` exactly ONCE as your first step.
2. After reviewing the retrieved content:
   - If the content clearly SUPPORTS the statement → verdict = "correct"
   - If the content clearly CONTRADICTS the statement → verdict = "wrong"
   - If the content is unrelated, neutral, ambiguous, or missing key information → verdict is NOT decided yet
3. ONLY if the knowledge base is insufficient as defined above, you MAY call `search_web` at most ONCE.
4. After web search:
   - If web content clearly supports → "correct"
   - If web content clearly contradicts → "wrong"
   - If still unclear or conflicting → "undetermined"
5. You MUST NOT call any tool more than once.

====================
DEFINITIONS
====================

- "Supports": Explicitly states the same fact without contradiction.
- "Contradicts": Explicitly states the opposite of the fact.
- "Insufficient": Mentions the topic but does not confirm or deny the exact claim.

====================
OUTPUT FORMAT
====================

Return ONLY a valid JSON object that strictly matches this schema:

{json.dumps(FactEvaluation.model_json_schema(), indent=2)}

====================
SOURCE DOCUMENT RULES
====================

- If the verdict is based on the knowledge base:
  - source_document MUST be: "<document_name>, page <page_number>"
- If the verdict is based on web search:
  - source_document MUST be a single URL
- If verdict is "undetermined":
  - source_document MUST be an empty string ""

Do NOT include markdown, explanations outside JSON, or multiple sources.
"""


In [26]:
from pydantic import BaseModel, Field
from typing import List, Optional, Literal, Dict, Any
#DONT RUN THIS

class EvidenceMetadata(BaseModel):
    # Provenance record for one piece of evidence, covering both sources.

    # --- Fields shared by every source ---
    source_type: Literal["vectorstore", "web"] = Field(
        description="Where this evidence was retrieved from",
    )
    relevance_score: Optional[float] = Field(
        default=None,
        description="Similarity or relevance score from retriever",
    )

    # --- Vectorstore-only fields (Chroma + PyPDF) ---
    #document_id: Optional[str] = Field(description="Unique ID of the document in Chroma", default=None)
    file_name: Optional[str] = Field(
        default=None,
        description="Original PDF or document filename",
    )
    page_number: Optional[int] = Field(
        default=None,
        description="Page number in the PDF (if available)",
    )
    #chunk_id: Optional[str] = Field(description="Chunk identifier in vectorstore", default=None)

    # --- Web-only fields (Jina MCP) ---
    url: Optional[str] = Field(
        default=None,
        description="Source URL for web evidence",
    )
    domain: Optional[str] = Field(
        default=None,
        description="Domain name of the web source",
    )
    #published_date: Optional[str] = Field(description="Publication date if available", default=None)

    # --- Untouched retriever metadata, kept for later use ---
    raw_metadata: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Unmodified metadata returned by the retriever",
    )


class Evidence(BaseModel):
    # One retrieved passage, where it came from, and how it bears on the claim.
    excerpt: str = Field(
        description="Verbatim excerpt used as supporting or contradicting evidence",
    )
    metadata: EvidenceMetadata = Field(
        description="Provenance and retrieval details for this evidence",
    )
    supports_statement: Literal["supports", "contradicts", "neutral"] = Field(
        description="How this evidence relates to the statement",
    )


class FactEvaluation(BaseModel):
    # Extended verdict for one statement, carrying a structured evidence item
    # and an optional confidence value.
    statement: str = Field(description="The verbatim statement being evaluated")
    correctness_score: Literal["correct", "wrong", "undetermined"] = Field(description="Final verdict after evaluating all evidence")
    summary_description: str = Field(description="Concise reasoning explaining how the verdict was reached or not reached")
    evidence: Evidence = Field(description="One evidence item used to reach the verdict if it is reached")
    # The default is None, so the annotation must allow None; with a plain
    # `float` the schema is misleading and an explicit None fails validation.
    confidence: Optional[float] = Field(description="Confidence score (0-1) for the verdict", default=None)

In [28]:
#call mcp for web search tool and invoke agent
import asyncio
from langchain_mcp_adapters.client import MultiServerMCPClient
from langchain.agents import create_agent
#import nest_asyncio

# This allows async to run inside your notebook's existing event loop
#nest_asyncio.apply()

JINA_API_KEY = os.getenv("JINA_API_KEY")

async def run_fact_checker(max_facts: int | None = 2) -> list[dict]:
    """Fact-check statements from FACTS_JSONL_PATH with a tool-using agent.

    Each non-blank line of the file is parsed as a JSON object and its
    "statement" value is sent to an agent that can call
    `search_knowledge_base` and the tools served by the Jina MCP server.
    The agent's final message is then parsed into a `FactEvaluation` by a
    second, structured-output LLM call.

    Args:
        max_facts: Maximum number of statements to check. The default of 2
            keeps the original "first two facts" behaviour; pass None to
            process the whole file.

    Returns:
        One `FactEvaluation.model_dump()` dict per checked statement, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "statement" key.
    """
    # 1. Set up the client pointing to the REMOTE Jina server.
    # Note: only the 'search' and 'read' tags are requested to save tokens.
    client = MultiServerMCPClient({
        "jina": {
            "transport": "streamable_http",
            "url": "https://mcp.jina.ai/v1?include_tags=search,read",
            "headers": {
                "Authorization": f"Bearer {JINA_API_KEY}"
            }
        }
    })

    # 2. Retrieve the web search and read tools from the MCP server
    mcp_tools = await client.get_tools()
    all_tools = [search_knowledge_base] + mcp_tools

    # 3. Initialize the LLM
    llm_model = ChatOpenAI(
        model="deepseek-ai/DeepSeek-V3",
        openai_api_key=OPENAI_API_KEY,
        openai_api_base="https://api.siliconflow.cn/v1",
        temperature=0.2,
        max_tokens=800
    )

    # 4. Create the agent
    agent = create_agent(
        tools=all_tools,
        system_prompt=system_prompt,
        model=llm_model,
        name = "FactCheckerAgent",
    )

    # 5. Build the structured-output wrapper once; it does not depend on the
    # statement, so there is no reason to rebuild it for every fact.
    structured_llm = llm_model.with_structured_output(FactEvaluation)

    results = []
    # NOTE(review): the file is opened with the platform default encoding, as
    # before; confirm how extracted_facts.jsonl is written before pinning one.
    with open(FACTS_JSONL_PATH, 'r') as f:
        for raw_line in f:
            # Stop once the requested number of facts has been checked.
            if max_facts is not None and len(results) >= max_facts:
                break

            # Blank lines (e.g. a trailing newline) are not records; skipping
            # them avoids a JSONDecodeError on empty input.
            line = raw_line.strip()
            if not line:
                continue

            fact_data = json.loads(line)
            statement = fact_data["statement"]

            # Invoke agent for each fact
            inputs = {"messages": [HumanMessage(content=f"Evaluate this fact: {statement}")]}
            response = await agent.ainvoke(inputs, config={"callbacks":[langfuse_handler]})

            # The last message contains the agent's answer.
            final_answer = response["messages"][-1].content
            print(final_answer)

            # Force the answer into the FactEvaluation schema.
            final_eval = await structured_llm.ainvoke(final_answer, config={"callbacks":[langfuse_handler]})

            results.append(final_eval.model_dump())
            print(f"Validated fact: {len(results) - 1}")

    return results

#output = asyncio.run(run_fact_checker())
output = await run_fact_checker()
for fact in output:
    print(fact)



```json
{
  "statement": "Generative Artificial Intelligence (GenAI) tools, such as ChatGPT and Claude, have fundamentally disrupted the landscape of higher education.",
  "correctness_score": "correct",
  "summary_description": "The knowledge base confirms that GenAI tools like ChatGPT have disrupted higher education, particularly in assessment practices and academic integrity.",
  "source_document": "AI and assessment in higher education, page 1"
}
```
Validated fact: 0
```json
{
  "statement": "GenAI’s ability to synthesise complex information and generate human-like text presents unprecedented challenges to established educational norms.",
  "correctness_score": "correct",
  "summary_description": "Multiple sources confirm that GenAI's capabilities disrupt traditional educational practices, raising concerns about academic integrity, ethical challenges, and the need for new pedagogical approaches.",
  "source_document": "https://www.mdpi.com/2227-7102/13/9/856"
}
```
Validated fact:

In [25]:
# export to JSON file
import json
from pathlib import Path

# Serialise the collected evaluations and write them to the output file.
output_file = Path(FACT_CHECKING_OUTPUT_PATH)
serialized_output = json.dumps(output, indent=2)
output_file.write_text(serialized_output)
print(f"Output saved to {FACT_CHECKING_OUTPUT_PATH}")

Output saved to C:\Users\HP\Documents\repos\essay-checker-agentic-rag\data\processed\fact_checking_output.json


# Irrelevant

In [10]:
# Minimal verdict structure: statement, verdict label and justification.
class FactEvaluation(BaseModel):
    # The claim that was checked.
    statement: str = Field(
        description="The original fact statement",
    )
    # One of the three allowed verdict labels.
    correctness_score: Literal["correct", "wrong", "undetermined"] = Field(
        description="The final verdict",
    )
    # Short justification for the verdict.
    summary_description: str = Field(
        description="Summary of why this verdict was reached",
    )

In [None]:
# System prompt enforcing the logic
# Shorter prompt: knowledge base first, then a single web search, then
# "undetermined". It names the web tool 'web_search' and refers to the
# FactEvaluation schema by name only (the schema itself is not embedded).
system_prompt = """You are a fact-checking assistant. 
Logic Flow:
1. For every fact, first use 'search_knowledge_base'.
2. If the retrieved info supports the fact, mark as 'correct'.
3. If it contradicts, mark as 'wrong'.
4. If the knowledge base is insufficient (neutral/unknown), you MUST call 'web_search' ONCE to check online.
5. If web results still don't clarify, mark as 'undetermined'.

Return the result strictly as a JSON object matching the FactEvaluation schema."""

In [15]:
#define the tools
# NOTE(review): `web_search` is not defined in the visible part of this
# notebook — confirm it exists before running this cell.
tools = [search_knowledge_base, web_search]

In [16]:
#define model used
# Chat model served through the SiliconFlow OpenAI-compatible endpoint.
# No temperature or max_tokens are passed here.
llm_model = ChatOpenAI(
    model="deepseek-ai/DeepSeek-V3",
    openai_api_key=OPENAI_API_KEY,
    openai_api_base="https://api.siliconflow.cn/v1", 
)

In [17]:
#create the agent
# Combines the module-level tools list, system prompt and chat model into one
# agent named "FactCheckerAgent".
agent = create_agent(
    tools=tools,    
    system_prompt=system_prompt,
    model=llm_model,
    name = "FactCheckerAgent",
)

In [None]:
# Synchronous fact-checking loop over every record in the facts file.
results = []

# Build the structured-output wrapper once; it does not depend on the
# statement being checked.
structured_llm = llm_model.with_structured_output(FactEvaluation)

with open(FACTS_JSONL_PATH, 'r') as f:
    for line in f:
        # Blank lines (e.g. a trailing newline) are not records; skipping
        # them avoids a JSONDecodeError.
        line = line.strip()
        if not line:
            continue

        fact_data = json.loads(line)
        statement = fact_data["statement"]

        # Invoke agent for each fact
        inputs = {"messages": [HumanMessage(content=f"Evaluate this fact: {statement}")]}
        response = agent.invoke(inputs)

        # The last message contains the result; force it into the
        # FactEvaluation schema with the structured-output model.
        final_eval = structured_llm.invoke(response["messages"][-1].content)

        # model_dump() is the pydantic v2 spelling used elsewhere in this
        # notebook; .dict() is deprecated there.
        results.append(final_eval.model_dump())

results[:5]