### Fact Checker
- Receives a JSONL file of facts and loops through them.
- For each fact, queries the knowledge-base tool to check it.
- If the knowledge base is insufficient, falls back to the web-search tool.

In [1]:
#import dependencies
import requests
from langchain.agents import create_agent
from langchain.messages import HumanMessage
from langchain.tools import tool

import os
import json
from typing import List, Literal, TypedDict
from pydantic import BaseModel, Field

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage

import os
from dotenv import load_dotenv

In [2]:
# Load settings from a .env file (if one is found) into the process environment.
load_dotenv()

# API key and endpoint for the OpenAI-compatible provider; each is None when unset.
SILICON_FLOW_BASE_URL = os.environ.get("SILICON_FLOW_BASE_URL")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
# Absolute, machine-specific locations of the notebook's inputs and outputs.
DOCUMENT_FOLDER_PATH = (
    "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag"
    "\\data\\knowledge_base"
)
FACTS_JSONL_PATH = (
    "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag"
    "\\data\\processed\\extracted_facts.jsonl"
)
OUTPUT_PATH = (
    "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag"
    "\\data\\processed\\fact_checking_output.json"
)

# Domain filter; may be set to e.g. the Wikipedia domain to limit results (left empty here).
ALLOWED_DOMAINS = ""

In [4]:
# Read the PDFs matched by "./*.pdf" in the knowledge-base folder, parsing each with PyPDFLoader.
loader = DirectoryLoader(
    DOCUMENT_FOLDER_PATH,
    loader_cls=PyPDFLoader,
    glob="./*.pdf",
)
docs = loader.load()
docs[:2]  # notebook preview: first two loaded documents

[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2025-12-20T19:55:41+08:00', 'author': 'hp404sk7@outlook.com', 'moddate': '2025-12-20T19:55:41+08:00', 'source': 'C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\knowledge_base\\AI_and_assessment_in_higher_education.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='AI and assessment in higher education \nhttps://www.timeshighereducation.com/campus/ai-and-assessment-higher-\neducation  \nThreat or opportunity? Advice for using, managing and embedding artificial intelligence \nin university assessment, skills development and task design \nNo sooner had generative AI (GenAI) tools, such as ChatGPT, ignited fears in universities \nabout risk to assessment practices and academic integrity, than academics began \nworking out how to embrace it to save time and enrich student skills such as critical \nthinking and analysis. This has required considerat

In [5]:
# Chunk the loaded documents: up to 1000 characters per chunk, with a
# 200-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)
splits[:2]  # notebook preview: first two chunks

[Document(metadata={'producer': 'Microsoft® Word 2024', 'creator': 'Microsoft® Word 2024', 'creationdate': '2025-12-20T19:55:41+08:00', 'author': 'hp404sk7@outlook.com', 'moddate': '2025-12-20T19:55:41+08:00', 'source': 'C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\knowledge_base\\AI_and_assessment_in_higher_education.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='AI and assessment in higher education \nhttps://www.timeshighereducation.com/campus/ai-and-assessment-higher-\neducation  \nThreat or opportunity? Advice for using, managing and embedding artificial intelligence \nin university assessment, skills development and task design \nNo sooner had generative AI (GenAI) tools, such as ChatGPT, ignited fears in universities \nabout risk to assessment practices and academic integrity, than academics began \nworking out how to embrace it to save time and enrich student skills such as critical \nthinking and analysis. This has required considerat

In [6]:
# Embedding client.
# SiliconFlow serves open-source embedding models behind an OpenAI-compatible
# endpoint, so LangChain's OpenAIEmbeddings class can talk to it directly.
embeddings = OpenAIEmbeddings(
    base_url=SILICON_FLOW_BASE_URL,
    api_key=OPENAI_API_KEY,
    model="BAAI/bge-m3",
    chunk_size=64,
    # Kept off on purpose: noted as crucial for SiliconFlow/local providers
    # to avoid dimension errors.
    check_embedding_ctx_length=False,
)

In [8]:
#create vectorstore and retriever
VECTOR_DATABASE_PATH = "C:\\Users\\HP\\Documents\\repos\\essay-checker-agentic-rag\\data\\chroma_db"

# Reuse the persisted index only when the directory actually contains files.
# A bare existence check would also accept an empty (e.g. freshly created or
# half-initialised) directory, silently yielding a store with no documents.
if os.path.isdir(VECTOR_DATABASE_PATH) and os.listdir(VECTOR_DATABASE_PATH):
    vectorstore = Chroma(
        persist_directory=VECTOR_DATABASE_PATH,
        embedding_function=embeddings,
        collection_name="essay_kb",
    )
else:
    # First run (or empty directory): embed the chunks and persist them.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        collection_name="essay_kb",
        persist_directory=VECTOR_DATABASE_PATH,
    )

# Retriever returning the 5 closest chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [9]:
# Knowledge-base lookup exposed to the agent as a tool.
# Returns the text of the retrieved chunks separated by blank lines.
@tool
def search_knowledge_base(query: str) -> str:
    """Search the internal knowledge base of relevant essay documents."""
    hits = retriever.invoke(query)
    passages = (hit.page_content for hit in hits)
    return "\n\n".join(passages)

In [14]:
#define tool used for web search
# Backed by Tavily (TavilySearchResults); expects TAVILY_API_KEY in the environment.
# ALLOWED_DOMAINS may hold one domain or a comma-separated list; when it is
# empty no domain restriction is applied.
@tool
def web_search(query: str) -> str:
    """Search the web for relevant information."""
    # The docstring above is what the model sees as the tool description,
    # so implementation notes live in comments instead.
    domains = [d.strip() for d in ALLOWED_DOMAINS.split(",") if d.strip()]
    search = TavilySearchResults(max_results=5, include_domains=domains)
    hits = search.invoke(query)

    # Tavily reports failures as a plain string rather than a result list;
    # pass that through so the agent can see what went wrong.
    if not isinstance(hits, list):
        return str(hits)
    if not hits:
        return "No web results found."

    return "\n\n".join(
        f"{hit.get('url', '')}\n{hit.get('content', '')}" for hit in hits
    )

In [10]:
# Structured verdict produced for a single fact statement.
class FactEvaluation(BaseModel):
    # The statement exactly as it was given to the checker.
    statement: str = Field(
        description="The original fact statement",
    )
    # One of the three allowed verdict labels.
    correctness_score: Literal["correct", "wrong", "undetermined"] = Field(
        description="The final verdict",
    )
    # Short justification for the verdict.
    summary_description: str = Field(
        description="Summary of why this verdict was reached",
    )

In [None]:
# System prompt enforcing the logic: knowledge base first, a single web search
# only when the knowledge base is inconclusive, and 'undetermined' as the
# last resort.
# NOTE(review): the prompt refers to the FactEvaluation schema by name but does
# not spell out its fields — confirm the model receives the schema elsewhere.
system_prompt = """You are a fact-checking assistant. 
Logic Flow:
1. For every fact, first use 'search_knowledge_base'.
2. If the retrieved info supports the fact, mark as 'correct'.
3. If it contradicts, mark as 'wrong'.
4. If the knowledge base is insufficient (neutral/unknown), you MUST call 'web_search' ONCE to check online.
5. If web results still don't clarify, mark as 'undetermined'.

Return the result strictly as a JSON object matching the FactEvaluation schema."""

In [15]:
#define the tools
# Tools handed to the agent: the knowledge-base search and the web search.
tools = [search_knowledge_base, web_search]

In [16]:
#define model used
# Chat model served through SiliconFlow's OpenAI-compatible API.
llm_model = ChatOpenAI(
    model="deepseek-ai/DeepSeek-V3",
    openai_api_key=OPENAI_API_KEY,
    # Use the endpoint configured in the environment (the same one the
    # embeddings client uses); fall back to the previous hard-coded URL
    # when the variable is unset.
    openai_api_base=SILICON_FLOW_BASE_URL or "https://api.siliconflow.cn/v1",
)

In [17]:
# Assemble the fact-checking agent from the chat model, the two tools and
# the system prompt.
agent = create_agent(
    model=llm_model,
    tools=tools,
    system_prompt=system_prompt,
    name="FactCheckerAgent",
)

In [None]:
# Run the agent over every fact in the JSONL file and collect one
# FactEvaluation (as a plain dict) per fact.

# The structured-output wrapper does not depend on the fact being checked,
# so build it once instead of on every iteration.
structured_llm = llm_model.with_structured_output(FactEvaluation)

results = []

# Explicit UTF-8 so decoding does not depend on the platform's locale encoding.
with open(FACTS_JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        fact_data = json.loads(line)
        statement = fact_data["statement"]

        # Invoke agent for each fact
        inputs = {"messages": [HumanMessage(content=f"Evaluate this fact: {statement}")]}
        response = agent.invoke(inputs)

        # The last message holds the agent's answer; a second structured call
        # coerces it into the FactEvaluation schema.
        final_eval = structured_llm.invoke(response["messages"][-1].content)

        # model_dump() replaces the deprecated Pydantic v1-style .dict().
        results.append(final_eval.model_dump())

results[:5]