START - Retrieve - generate - END

In [None]:
# %pip install -qU pypdf langchain-community langchain-text-splitters

In [None]:
from langchain_community.document_loaders import PyPDFLoader

pdf_file_path = './documents/income_tax.pdf'
loader = PyPDFLoader(pdf_file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [None]:
# Inspect one loaded page (index 35) to see what the PDF loader extracted.
pages[35]

#1

LangChain의 PDF 로더(PyPDFLoader)는 문서 안의 이미지를 파싱하지 못한다.
따라서 py-zerox를 설치해 사용한다: `%pip install -q py-zerox`

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the API keys the later cells rely on).
load_dotenv()

In [None]:
# %pip install -q nest_asyncio
import nest_asyncio

# Allow nested event loops so that asyncio.run() can be called from inside
# the notebook, which already has an event loop running.
nest_asyncio.apply()

In [None]:
from pyzerox import zerox
import os
import json
import asyncio

# Model configuration

# Extra keyword arguments forwarded to zerox() for models that require
# additional settings.
kwargs = {}

# System prompt handed to zerox(); None is passed through unchanged.
custom_system_prompt = None

model = "gpt-4o"

# The API key must come from the environment (e.g. a .env file loaded with
# load_dotenv()); never hard-code it in the notebook. Only warn here so the
# cell still runs and the real failure surfaces where the key is used.
if not os.environ.get("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set; the zerox call will likely fail.")

# Main async entry point
async def main(file_path="./documents/income_tax.pdf",
               output_dir="./documents",
               select_pages=None):
    """Run zerox on a document and return its result.

    Args:
        file_path: Path of the document handed to zerox.
        output_dir: Directory passed to zerox as ``output_dir``.
        select_pages: Passed through to zerox unchanged. None presumably
            means "all pages" -- confirm against the py-zerox docs.

    Returns:
        Whatever ``zerox(...)`` returns.

    The model name, system prompt and extra keyword arguments are read from
    the module-level ``model``, ``custom_system_prompt`` and ``kwargs``.
    """
    return await zerox(
        file_path=file_path,
        model=model,
        output_dir=output_dir,
        custom_system_prompt=custom_system_prompt,
        select_pages=select_pages,
        **kwargs,
    )

# Run the main coroutine to completion and print its result
result = asyncio.run(main())
print(result)

#2

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Shared splitter: chunks of up to 1500 characters with a 100-character
# overlap, splitting on blank lines first and single newlines second.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n'],
    chunk_overlap=100,
    chunk_size=1500,
)

In [None]:
# %pip install -q "unstructured[md]" nltk

In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

# Load the markdown document, then split it with the shared text splitter.
# NOTE: the misspelt name `documnet_list` is kept because a later cell
# reads it.
markdown_path = "./documents/income_tax.md"
loader = UnstructuredMarkdownLoader(markdown_path)
documnet_list = text_splitter.split_documents(loader.load())

In [None]:
# Inspect one chunk (index 39) produced from the markdown file.
documnet_list[39]

Markdown 파일도 이미지 파싱이 안 될 수 있음
이럴 땐 Markdown → txt → load → split 순으로 진행해야 함

Markdown → txt

In [None]:
import markdown
from bs4 import BeautifulSoup

# Destination of the plain-text copy of the markdown document.
text_path = './documents/income_tax.txt'

# Markdown -> HTML: render the markdown source.
with open(markdown_path, encoding='utf-8') as md_file:
    html_content = markdown.markdown(md_file.read())

# HTML -> plain text: drop the tags and keep only the text nodes.
text_content = BeautifulSoup(html_content, 'html.parser').get_text()

with open(text_path, mode='w', encoding='utf-8') as txt_file:
    txt_file.write(text_content)

print("OK")

txt → load

In [None]:
from langchain_community.document_loaders import TextLoader

text_path = './documents/income_tax.txt'

# The text file is written as UTF-8, so read it back as UTF-8 explicitly.
# Without this the loader uses the platform default encoding, which can
# fail or garble the Korean text on non-UTF-8 locales (e.g. Windows).
loader = TextLoader(text_path, encoding='utf-8')
document_list = loader.load_and_split(text_splitter)

In [None]:
# Inspect one chunk (index 39) produced from the plain-text file.
document_list[39]

DB(Chroma) 적재

In [None]:
# %pip install -q langchain-chroma

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model passed to the vector store when indexing the chunks.
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

In [None]:
from langchain_chroma import Chroma

# Embed the chunks and store them in a Chroma collection.
vector_store = Chroma.from_documents(
    collection_name='income_tax_coll',
    # This argument must be set for the collection to be kept on local disk.
    persist_directory='./income_tax_coll',
    embedding=embeddings,
    documents=document_list,
)

In [None]:
# Retriever over the vector store; search_kwargs asks for k=3 results per query.
retriever = vector_store.as_retriever(search_kwargs={'k': 3})

In [None]:
# Sample question (Korean): "What is the income tax for an employee with an
# annual salary of 50 million won?"
query = '연봉 5천만원 직장인의 소득세는?'

In [None]:
# Try the retriever on the sample query and show the returned documents.
retriever.invoke(query)

- State 선언 및 Agent 생성

In [None]:
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class AgentState(TypedDict):
    """State dictionary used as the schema of the graph."""

    # The user's question
    query: str
    # Documents set by the retrieve node
    context: List[Document]
    # Value set by the generate node
    answer: str
    

In [None]:
from langgraph.graph import StateGraph

# Graph builder whose state schema is AgentState.
graph_builder = StateGraph(AgentState)

In [None]:
def retrieve(state: AgentState):
    """Look up documents for the query held in the graph state.

    Args:
        state: Graph state; only ``query`` is read.

    Returns:
        A partial state update ``{'context': docs}``, where ``docs`` is
        whatever the module-level ``retriever`` returns for the query.
    """
    return {'context': retriever.invoke(state['query'])}
    

generate → llm.invoke()에 사용자의 쿼리와 context가 함께 들어가야 함
이때, 효율적인 RAG를 위한 프롬프트를 작성해야 함

In [None]:
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith import Client

# Chat model used by the generate step.
llm = ChatOpenAI(model='gpt-4o')

# Pull the "rlm/rag-prompt" prompt through the LangSmith client.
client = Client()
prompt = client.pull_prompt("rlm/rag-prompt")

In [None]:
def generate(state: AgentState):
    """Answer the user's query with the LLM, using the retrieved context.

    Args:
        state: Graph state; ``query`` and ``context`` are read.

    Returns:
        A partial state update ``{'answer': text}``, where ``text`` is the
        content of the model's reply.
    """
    context = state['context']
    query = state['query']
    rag_chain = prompt | llm    # LCEL: pipe the filled-in prompt into the chat model
    response = rag_chain.invoke({'question': query, 'context': context})
    # The chain returns a chat message object, but AgentState declares
    # `answer` as str, so store only the message text.
    return {'answer': response.content}

START → Retrieve → generate → END
하단 셀에서 노드 생성

In [None]:
# Register the two node functions under the names the edges refer to.
graph_builder.add_node('retrieve', retrieve)
graph_builder.add_node('generate', generate)

In [None]:
from langgraph.graph import START, END

# Wire the linear flow: START -> retrieve -> generate -> END.
graph_builder.add_edge(START, 'retrieve')
graph_builder.add_edge('retrieve', 'generate')
graph_builder.add_edge('generate', END)

In [None]:
# Compile the builder into the graph object that is invoked below.
graph = graph_builder.compile()

In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
display(Image(graph.get_graph().draw_mermaid_png()))

In [None]:
# Start the graph with only `query` set; the retrieve and generate nodes
# return the `context` and `answer` updates.
initial_state = {'query': query}
graph.invoke(initial_state)