In [58]:
from dotenv import load_dotenv

load_dotenv()

True

In [92]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Person(BaseModel):
    name: str = Field(..., description="The full name of the person.")

class Place(BaseModel):
    name: str = Field(..., description="The name of the place.")
    latitude: float = Field(..., description="The latitude of the place.")
    longitude: float = Field(..., description="The longitude of the place.")

class Summary(BaseModel):
    summary: str = Field(..., description="A brief summary of the content.")
    people: List[Person] = Field(..., description="A list of people mentioned in the summary.")
    places: List[Place] = Field(..., description="A list of places mentioned in the summary.")
    timeline: List[str] = Field(..., description="A chronological timeline of events mentioned of the format YYYY-MM-DD.'")

class Event(BaseModel):
    date: str = Field(..., description="The date of the event in the format YYYY-MM-DD.")
    people: List[Person] = Field(..., description="A list of people involved in the event.")
    places: List[Place] = Field(..., description="A list of places associated with the event.")

class Page(BaseModel):
    page_number: int = Field(..., description="The page number.")
    events: List[Event] = Field(..., description="A list of events on the page.")

In [93]:
class HistorianState(BaseModel):
    filename: Optional[str] = Field(None, description="The name of the file being analyzed.")
    text: Optional[str] = Field(None, description="The text content to be summarized.")
    summary: Optional[Summary] = Field(None, description="The summary of the content.")
    pages: Optional[List[Page]] = Field(None, description="A list of pages with their events.")

In [104]:
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import Callable, Any
from langgraph.graph import StateGraph, END

PAGE_SIZE = 2000  # Number of characters per page

def _default_filename(state: HistorianState, llm: BaseChatModel) -> dict:
    return {
        "filename": "../Library/data_patents.txt",
    }

def _load_text(state: HistorianState, llm: BaseChatModel) -> dict:
    if state.filename is None:
        raise ValueError("Filename is not set in the state.")
    filename = str(state.filename)
    with open(filename, 'rt') as file:
        text = file.read()
    return {
        "text": text
    }

def _extract_summary(state: HistorianState, llm: BaseChatModel) -> dict:
    # Placeholder implementation for extracting summary using LLM
    # In a real implementation, you would use the llm to process the state
    if state.text is None:
        raise ValueError("Text is not set in the state.")
    llm_summary = llm.with_structured_output(Summary)
    summary = llm_summary.invoke(state.text)
    return {
        "summary": summary
    }

def _extract_page(state: HistorianState, llm: BaseChatModel) -> dict:
    # Placeholder implementation for extracting page content
    if state.text is None:
        raise ValueError("Text is not set in the state.")
    pages = state.pages or []
    page_number = len(pages) + 1
    page_content = state.text[:PAGE_SIZE]
    llm_page = llm.with_structured_output(Page)
    context = """Please extract events, with people, and places from the following page content.\n\n""" + page_content
    context = context + f"\n\nThe page is part of the larger document with the following information: \n{state.summary.dict()}"
    result = llm_page.invoke(context)
    result.page_number = page_number
    return {
        "text": state.text[PAGE_SIZE:],
        "pages": pages + [result],
    }

def _text_available(state: HistorianState) -> bool:
    return state.text is not None and len(state.text) > 0

def build_react_agent(
    llm: BaseChatModel,
):  # -> CompiledGraph[AgentState, Any, Any]
    """Create the LangGraph ReAct agent with the three requested states."""

    graph = StateGraph(HistorianState)
    graph.add_node("default_filename", lambda state: _default_filename(state, llm=llm))
    graph.add_node("load_text", lambda state: _load_text(state, llm=llm))
    graph.add_node("extract_summary", lambda state: _extract_summary(state, llm=llm))
    graph.add_node("extract_page", lambda state: _extract_page(state, llm=llm))
    # graph.add_node("extract_page", lambda state: _extract_page(state, page_loader=page_loader))
    # graph.add_node("summarize_page", lambda state: _summarize_page(state, llm=llm))

    graph.set_entry_point("default_filename")
    graph.add_edge("default_filename", "load_text")
    graph.add_edge("load_text", "extract_summary")
    graph.add_edge("extract_summary", "extract_page")
    graph.add_conditional_edges("extract_page", _text_available, {
        True: "extract_page",
        False: END,
    })
    # graph.add_edge("extract_summary", "extract_page")
    # graph.add_edge("extract_page", "summarize_page")
    # graph.add_edge("summarize_page", END)

    return graph.compile()


In [105]:
llm = ChatGoogleGenerativeAI(
    model= "gemini-2.5-flash",
    temperature=0.0,
    max_retries=2,
    # google_api_key=api_key,
)

In [106]:
agent = build_react_agent(llm=llm)
state = HistorianState()
result = agent.invoke(state)

/var/folders/hp/jf4fkqss71lgzkp8m5rxb9vw0000gn/T/ipykernel_12235/698915019.py:43: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  context = context + f"\n\nThe page is part of the larger document with the following information: \n{state.summary.dict()}"


In [97]:
from pprint import pprint

pprint(result)

{'filename': '../books/excerpt.txt',
 'pages': [Page(page_number=1, events=[Event(date='2006-04-22', people=[Person(name='Dagny'), Person(name='John Bickers'), Person(name='David Widger')], places=[]), Event(date='1919-01-01', people=[Person(name='Theodore Roosevelt'), Person(name='Joseph Bucklin Bishop')], places=[])]),
           Page(page_number=2, events=[Event(date='1903-XX-XX', people=[Person(name='Theodore Roosevelt'), Person(name='children')], places=[Place(name='Sagamore Hill', latitude=40.8726, longitude=-73.5432)]), Event(date='1905-01-XX', people=[Person(name='Theodore Roosevelt'), Person(name='nine boys')], places=[Place(name='Rock Creek Park, Washington', latitude=38.94, longitude=-77.05)])]),
           Page(page_number=3, events=[Event(date='1903-01-01', people=[Person(name='Theodore Roosevelt')], places=[])])],
 'summary': Summary(summary="An introduction to Theodore Roosevelt's 'Letters to His Children,' edited by Joseph Bucklin Bishop. The introduction highlights Roo

In [103]:
import json

json.dump({
    "summary": result["summary"].dict(),
    "pages": [page.dict() for page in result["pages"]] if result["pages"] else [],
}, open("example.json", "wt"), indent=2)

/var/folders/hp/jf4fkqss71lgzkp8m5rxb9vw0000gn/T/ipykernel_12235/2283981434.py:4: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  "summary": result["summary"].dict(),
/var/folders/hp/jf4fkqss71lgzkp8m5rxb9vw0000gn/T/ipykernel_12235/2283981434.py:5: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  "pages": [page.dict() for page in result["pages"]] if result["pages"] else [],


In [101]:
result

{'filename': '../books/excerpt.txt',
 'text': '',
 'summary': Summary(summary="An introduction to Theodore Roosevelt's 'Letters to His Children,' edited by Joseph Bucklin Bishop. The introduction highlights Roosevelt's practice of writing to his children for over twenty years, treating them as equals, and actively participating in their games and activities, whether at Sagamore Hill or the White House.", people=[Person(name='Theodore Roosevelt'), Person(name='Joseph Bucklin Bishop'), Person(name='Dagny'), Person(name='John Bickers'), Person(name='David Widger')], places=[Place(name='United States', latitude=39.8283, longitude=-98.5795), Place(name='Sagamore Hill', latitude=40.8726, longitude=-73.5432), Place(name='White House', latitude=38.8977, longitude=-77.0365), Place(name='Rock Creek Park, Washington', latitude=38.94, longitude=-77.05)], timeline=['1919-XX-XX', '2006-04-22', '1903-XX-XX', '1905-01-XX']),
 'pages': [Page(page_number=1, events=[Event(date='2006-04-22', people=[Perso