[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain-academy/blob/main/module-1/agent-memory.ipynb) [![Open in LangChain Academy](https://cdn.prod.website-files.com/65b8cd72835ceeacd4449a53/66e9eba12c7b7688aa3dbb5e_LCA-badge-green.svg)](https://academy.langchain.com/courses/take/intro-to-langgraph/lessons/58239417-lesson-7-agent-with-memory)

In [6]:
import os, getpass
import fitz
from pathlib import Path
from dotenv import dotenv_values

env_path = Path().absolute()/ '.env'
config = dotenv_values(env_path)

for key in config:
    print(f"{key}: {len(str(config[key]))} characters")
    os.environ[key] = config[key]
    
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "langchain-academy"

LANGCHAIN_API_KEY: 51 characters
OPENAI_API_KEY: 164 characters
TAVILY_API_KEY: 37 characters


In [7]:
from langchain_openai import ChatOpenAI
import json 
import PyPDF2
import os
import json
import logging
from pathlib import Path
import google.generativeai as genai
import sqlite3

def extraction(pdf_path: str, prompt_path:str, output_path:str) -> json:
    """Extracts information from a CP575 pdf, and save to local JSON file
    Args: pdf_path: The path (str) to the PDF file.
          prompt_path: The path (str) to the prompt file.
          output_path: The path (str) to the output file.
    """

    # Create outputs directory if it doesn't exist
    outputs_dir = Path.cwd() / "outputs"
    outputs_dir.mkdir(exist_ok=True)

    # Set up logging to file
    log_filename = outputs_dir / "output.log"
    json_filename = outputs_dir / output_path

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ])

    # Set up your API key
    os.environ["GOOGLE_API_KEY"] = "AIzaSyAjewE7NRSlejNDAppxoS3kkLEpacy74Ag"
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

    # Load prompt from external Markdown file
    with open(prompt_path, "r") as file:
        prompt = file.read()

    # Load JSON data
    with open("prompt_3.json", "r") as file:
        prompt_data = json.load(file)

    media = Path.cwd() / pdf_path
    myfile = genai.upload_file(media)

    model = genai.GenerativeModel("gemini-1.5-flash")

    # Combine prompt and JSON data
    final_prompt = f"{prompt}\n{json.dumps(prompt_data)}"

    result = model.generate_content([final_prompt, myfile])

    json_response = json.loads(result.text.strip("```json").strip())
    with open(json_filename, "w") as json_file:
        json.dump(json_response, json_file, indent=2)

    logging.info(f"Full log saved to {log_filename}")
    logging.info(f"Result saved to {json_filename}")
    insert_data('data.db', json_response)
    return json_response







# This will be a tool
def filling(database_path:str, pdf_path:str, output_pdf_path:str) -> None:
    """filling a pdf with json data 

    Args:
        database_path: The path (str) to the database.
        pdf_path: The path (str) to the PDF file. 
        output_pdf_path: The path (str) to the output file. 
    """
    
    form_field_data = update_json_with_db_values('prompt_3.json', database_path)


    doc = fitz.open(pdf_path)

    for field_data in form_field_data:
        label = field_data["label"]
        value = field_data["value"]

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for field in page.widgets():
                if field.field_name == field_data["name"]:
                    value = field_data["value"]
                    
                    #handle checkboxes
                    #will only get checked if value in JSON file is true
                    if field.field_type == 2:
                        field.field_value = value
                              # Mark checkbox as unchecked

                    # Handle text fields (field_type == 7)
                    elif field.field_type == 7 or field.field_type == 1:  # Text field
                        field.field_value = value
                          # Ensure the field gets updated
                         # Ensure the checkbox gets updated
                    
                    # Handle radio boxes (field_type == 5)
                    #in order to fill them in, the JSON file would need to have the proper
                    #label for each radio box
                    elif field.field_type == 5:
                        #if field.on_state()
                        #print(field.field_value)
                        if value:
                            field.field_value = field.on_state()
                            #print(f"Radio button '{field.field_name}' on_state: {field.on_state()} {field.field_value}" )
                        #if field.on_state == "Choice1":
                        
                        
                    
                    field.update()  # Ensure the checkbox gets updated

    doc.save(output_pdf_path)



def insert_data(db_name, data):
    """filling a pdf with json data 

    Args:
        db_name: The path (str) to the database. 
        data: The JSON data. 
    """
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    
    for entry in data:
        if entry['label'] and entry['value']:
            # Check if the label already exists
            cursor.execute('''
                SELECT COUNT(*) FROM corporation_info WHERE field = ?
            ''', (entry['label'],))
            count = cursor.fetchone()[0]
            
            if count > 0:
                # Update the existing record
                cursor.execute('''
                    UPDATE corporation_info
                    SET value = ?, source = ?, status = ?, originalValue = ?
                    WHERE field = ?
                ''', (
                    entry['value'],
                    'Form Analysis',
                    'Not Verified',
                    entry['value'],
                    entry['label']
                ))
            else:
                # Insert a new record
                cursor.execute('''
                    INSERT INTO corporation_info (field, value, source, status, originalValue)
                    VALUES (?, ?, ?, ?, ?)
                ''', (
                    entry['label'],
                    entry['value'],
                    'Form Analysis',
                    'Not Verified',
                    entry['value']
                ))
    
    conn.commit()
    conn.close()


def fetch_value_from_db(db_name, label):
    """filling a pdf with json data 

    Args:
        db_name: The path (str) to the database. 
        label: The label in JSON file. 
    """
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('''
        SELECT value FROM corporation_info WHERE field = ?
    ''', (label,))
    result = cursor.fetchone()
    conn.close()
    return result[0] if result else None

# Function to update JSON data with values from SQLite database
def update_json_with_db_values(json_file, db_name):
    """filling a pdf with json data 

    Args:
        json_file: The path (str) to the JSON template. 
        db_name: the path(str) to database.
    """
    # Use relative path for the JSON file

    
    with open(json_file, 'r') as file:
        data = json.load(file)
    
    for entry in data:
        if 'label' in entry:
            value = fetch_value_from_db(db_name, entry['label'])
            if value:
                entry['value'] = value
    
    return data




tools = [filling, extraction]
llm = ChatOpenAI(model="gpt-4o")
llm_with_tools = llm.bind_tools(tools)

In [8]:
from langgraph.graph import START, StateGraph
from langgraph.prebuilt import tools_condition, ToolNode
from IPython.display import Image, display
from langgraph.graph import MessagesState
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

# System message
sys_msg = SystemMessage(content="You are a helpful assistant tasked with extracting information from pdf")

# Node
def assistant(state: MessagesState):
   return {"messages": [llm_with_tools.invoke([sys_msg] + state["messages"])]}
# Graph
builder = StateGraph(MessagesState)

# Define nodes: these do the work
builder.add_node("assistant", assistant)
builder.add_node("tools", ToolNode(tools))

# Define edges: these determine how the control flow moves
builder.add_edge(START, "assistant")
builder.add_conditional_edges(
    "assistant",
    # If the latest message (result) from assistant is a tool call -> tools_condition routes to tools
    # If the latest message (result) from assistant is a not a tool call -> tools_condition routes to END
    tools_condition,
)
builder.add_edge("tools", "assistant")
react_graph = builder.compile()

# Show
#display(Image(react_graph.get_graph(xray=True).draw_mermaid_png()))

Extract information from CP575/Article of Incorporation, save to database

In [9]:
#messages = [HumanMessage(content="extract information from CP575.pdf using prompt.md, save to output.json")] #CP575 
messages = [HumanMessage(content="extract information from AoI.pdf using prompt2.md, save to output2.json")] #Articles of Incorporation
messages = react_graph.invoke({"messages": messages})
for m in messages['messages']:
    m.pretty_print()

2024-11-27 11:08:46,307 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-27 11:09:11,246 - INFO - Full log saved to e:\starcycle\LangGraph\outputs\output.log
2024-11-27 11:09:11,249 - INFO - Result saved to e:\starcycle\LangGraph\outputs\output2.json
2024-11-27 11:09:14,872 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



extract information from AoI.pdf using prompt2.md, save to output2.json
Tool Calls:
  extraction (call_fJhi5HY80lNCscQzgIqIZk1P)
 Call ID: call_fJhi5HY80lNCscQzgIqIZk1P
  Args:
    pdf_path: AoI.pdf
    prompt_path: prompt2.md
    output_path: output2.json
Name: extraction

[{"name": "topmostSubform[0].Page1[0].f1_01[0]", "type": 7, "rect": [50.400001525878906, 93.9990234375, 446.3999938964844, 108.0], "options": 8388608, "label": "Name of Corporation", "value": "American Gene Engineer Corp."}, {"name": "topmostSubform[0].Page1[0].f1_02[0]", "type": 7, "rect": [50.400001525878906, 118.0, 446.3999938964844, 132.00103759765625], "options": 8388608, "label": "Address of Corporation", "value": "341 Raven Circle, Wyoming, 19934"}, {"name": "topmostSubform[0].Page1[0].f1_03[0]", "type": 7, "rect": [50.400001525878906, 142.00103759765625, 446.3999938964844, 156.00201416015625], "options": 8388608, "label": "City", "value": "Wyoming"}, {"name": "topmostSubform[0].Page1[0].f1_04[0]", "type": 7

Human-in-the-loop Implementaion: User Interaction with data through frontend app (verification, etc., changes wrote back to database)

Fill in form 966 from database

In [10]:
messages = [HumanMessage(content="Fill the f966.pdf form from data.db and save the result to filled_form_966_3.pdf")]
messages = react_graph.invoke({"messages": messages})
for m in messages['messages']:
    m.pretty_print()

2024-11-27 11:14:14,611 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-27 11:14:16,083 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Fill the f966.pdf form from data.db and save the result to filled_form_966_3.pdf
Tool Calls:
  filling (call_iTIK6B2UIdHFs9bNF9a1c22h)
 Call ID: call_iTIK6B2UIdHFs9bNF9a1c22h
  Args:
    database_path: data.db
    pdf_path: f966.pdf
    output_pdf_path: filled_form_966_3.pdf
Name: filling

null

The form `f966.pdf` has been successfully filled using the data from `data.db`, and the resulting document has been saved as `filled_form_966_3.pdf`.
