In [None]:
# -----------------------------
# 1) Import
# -----------------------------
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from langgraph.graph import StateGraph, END
import re
import pandas as pd

# -----------------------------
# 2) Model and path configuration
# -----------------------------
# Hugging Face model identifiers. `emdeding_id` (a misspelling of "embedding")
# is kept as-is because embed_documents() refers to it by this name.
emdeding_id = "jhgan/ko-sbert-sts"
qa_model_id = "monologg/koelectra-base-v3-finetuned-korquad"

# Word document that gets indexed, and the directory handed to Chroma.
filepath = "./NewBie_개발환경가이드.docx"
persist_directory = "./chromaDB_Chatbot"

# Options passed to the pipeline as `tokenizer_kwargs`.
# NOTE(review): confirm that the question-answering pipeline actually applies
# `tokenizer_kwargs`; if it does not, these limits have no effect.
tokenizer_opt = dict(max_length=512, truncation=True)

# Load tokenizer and model weights once at module level so every graph run
# reuses the same pipeline object.
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_id)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_id)

# device=-1 runs the pipeline on CPU.
qa_pipeline = pipeline(
    task="question-answering",
    tokenizer=qa_tokenizer,
    model=qa_model,
    device=-1,
    tokenizer_kwargs=tokenizer_opt,
)

# -----------------------------
# 3) Node 함수 정의
# -----------------------------
def load_documents(state):
    """Load the Word document at ``filepath`` and add its text to the state.

    Args:
        state: Current graph state (a dict).

    Returns:
        A new dict holding every entry of ``state`` plus ``"texts"``, the list
        of ``page_content`` values of the documents returned by the loader.
    """
    loaded = UnstructuredWordDocumentLoader(filepath).load()
    contents = [item.page_content for item in loaded]
    return {**state, "texts": contents}

def embed_documents(state):
    """Wrap the raw texts as ``Document`` objects and create the embedding model.

    Args:
        state: Current graph state (a dict); must contain ``"texts"``, an
            iterable of strings.

    Returns:
        A new dict holding every entry of ``state`` plus ``"docs"`` (one
        ``Document`` per text) and ``"embeddings"`` (a ``HuggingFaceEmbeddings``
        built from ``emdeding_id``). The incoming dict is not modified, the
        same way ``load_documents`` and ``build_vector_db`` return a fresh
        dict instead of mutating their argument.

    Raises:
        KeyError: If ``state`` has no ``"texts"`` entry.
    """
    docs = [Document(page_content=text) for text in state["texts"]]
    emb = HuggingFaceEmbeddings(model_name=emdeding_id)
    return {**state, "docs": docs, "embeddings": emb}

def build_vector_db(state):
    """Index the documents in Chroma and add a retriever to the state.

    Args:
        state: Current graph state (a dict); must contain ``"docs"`` and
            ``"embeddings"``.

    Returns:
        A new dict holding every entry of ``state`` plus ``"retriever"``, a
        retriever over the vector store configured with ``k=3``.

    Raises:
        KeyError: If ``"docs"`` or ``"embeddings"`` is missing from ``state``.
    """
    # NOTE(review): `persist_directory` is reused on every run; verify whether
    # repeated runs add duplicate copies of the documents to the stored
    # collection.
    store = Chroma.from_documents(
        documents=state["docs"],
        embedding=state["embeddings"],
        persist_directory=persist_directory,
    )
    top_k_retriever = store.as_retriever(search_kwargs={"k": 3})

    updated = dict(state)
    updated["retriever"] = top_k_retriever
    return updated

def ask_question(state):
    """Prompt the user on stdin for a question and add it to the state.

    Args:
        state: Current graph state (a dict).

    Returns:
        A new dict holding every entry of ``state`` plus ``"question"``, the
        line typed by the user. The incoming dict is not modified, the same
        way ``load_documents`` and ``build_vector_db`` return a fresh dict
        instead of mutating their argument.
    """
    question = input("질문을 입력하세요: ")
    return {**state, "question": question}

def retrieve_docs(state):
    """Fetch documents for the question and join them into one context string.

    Args:
        state: Current graph state (a dict); must contain ``"retriever"``
            (an object with ``invoke(query)``) and ``"question"``.

    Returns:
        A new dict holding every entry of ``state`` plus ``"context"``: the
        ``page_content`` of each retrieved document, joined with newlines.

    Raises:
        KeyError: If ``"retriever"`` or ``"question"`` is missing.
    """
    hits = state["retriever"].invoke(state["question"])
    merged = "\n".join(hit.page_content for hit in hits)

    updated = dict(state)
    updated["context"] = merged
    return updated

def qa_answer(state):
    """Run the QA pipeline on the question and context and store the answer.

    Args:
        state: Current graph state (a dict); must contain ``"question"`` and
            ``"context"``.

    Returns:
        A new dict holding every entry of ``state`` plus ``"answer"``, taken
        from the ``"answer"`` field of the pipeline result.

    Raises:
        KeyError: If ``"question"`` or ``"context"`` is missing.
    """
    prediction = qa_pipeline(
        question=state["question"],
        context=state["context"],
    )
    extracted = prediction["answer"]
    return {**state, "answer": extracted}

def respond(state):
    """Print the final answer to stdout and pass the state through unchanged.

    Args:
        state: Current graph state (a dict); must contain ``"answer"``.

    Returns:
        The same ``state`` object that was passed in.

    Raises:
        KeyError: If ``"answer"`` is missing.
    """
    final_answer = state["answer"]
    print("\n 최종 답변:", final_answer)
    return state

Device set to use cpu


In [4]:
# Assemble the workflow as a straight line: each node runs once, in the order
# listed here, and the last one hands over to END.
graph = StateGraph(dict)

pipeline_steps = [
    ("load_documents", load_documents),
    ("embed_documents", embed_documents),
    ("build_vector_db", build_vector_db),
    ("ask_question", ask_question),
    ("retrieve_docs", retrieve_docs),
    ("qa_answer", qa_answer),
    ("respond", respond),
]
for step_name, step_fn in pipeline_steps:
    graph.add_node(step_name, step_fn)

step_names = [step_name for step_name, _ in pipeline_steps]

# The first step is the entry point; every step feeds the one after it.
graph.set_entry_point(step_names[0])
for src, dst in zip(step_names, step_names[1:]):
    graph.add_edge(src, dst)
graph.add_edge(step_names[-1], END)

# Compile and run once, starting from an empty state.
app = graph.compile()
app.invoke({})

FileNotFoundError: no such file or directory: './data/*.docx'