In [56]:
#pip install langchain-huggingface

In [57]:
import os
import re
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# Directory the FAISS index is saved to (create_vectorstore) and loaded from (load_vectorstore).
VECTOR_STORE_PATH = "./vectorstore"
# Hugging Face model id handed to HuggingFaceEmbeddings below.
EMBEDDINGS_MODEL_NAME = "intfloat/multilingual-e5-small"
# Shared embedding object, created once and passed to FAISS both when the index
# is built and when it is loaded.
# NOTE(review): the E5 model card describes "query: " / "passage: " input
# prefixes; nothing in this notebook adds them — confirm whether they are needed.
EMBEDDINGS = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)





In [None]:
def clean_text(text):
    """Normalize raw text extracted from a PDF page.

    Steps, in order:
      1. Convert CRLF / CR line endings to LF so the newline-based rules
         below apply to them as well.
      2. Re-join words that were hyphenated across a line break.
      3. Turn single newlines (line wraps inside a paragraph) into spaces.
      4. Collapse runs of newlines into a single newline.
      5. Collapse runs of spaces/tabs into a single space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Normalize line endings first; otherwise "-\r\n" is not recognised as a
    # hyphenated line break and a stray "\r" survives in the output.
    text = re.sub(r'\r\n?', '\n', text)
    # Remove hyphenation at line breaks
    text = re.sub(r'-\n', '', text)
    # Replace newlines within paragraphs with spaces (a newline that has no
    # newline directly before or after it)
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Remove multiple newlines
    text = re.sub(r'\n+', '\n', text)
    # Normalize whitespace
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()

In [None]:
def create_vectorstore(pdf_paths=None):
    """Build a FAISS vector store from PDF files and save it to disk.

    Each PDF is loaded with ``PyPDFLoader``, the text of every loaded document
    is passed through ``clean_text``, the documents are split into chunks,
    and the chunks are indexed with ``EMBEDDINGS``. The resulting index is
    written to ``VECTOR_STORE_PATH``.

    Args:
        pdf_paths: Optional iterable of PDF file paths. When omitted, the two
            fairy-tale PDFs under ``pdfs/`` are used.

    Returns:
        The newly created FAISS vector store.

    Raises:
        ValueError: If no text chunks were produced from the given PDFs.
    """
    if pdf_paths is None:
        pdf_paths = [
            "pdfs/FairyTale1.pdf",
            "pdfs/FairyTale2.pdf",
        ]
    # Materialize once so the paths can be iterated and reported in errors.
    pdf_paths = list(pdf_paths)

    # clean_text leaves a newline only where the source had consecutive
    # newlines, so splitting on "\n" splits at those boundaries.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=3000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    documents = []
    for pdf in pdf_paths:
        # Clean the text of each loaded document in place before chunking.
        for doc in PyPDFLoader(pdf).load():
            doc.page_content = clean_text(doc.page_content)
            documents.append(doc)

    chunked_documents = text_splitter.split_documents(documents)
    if not chunked_documents:
        # Fail with a clear message instead of an opaque error from the
        # index construction on an empty input.
        raise ValueError(f"No text chunks could be produced from: {pdf_paths}")

    vectorstore = FAISS.from_documents(chunked_documents, EMBEDDINGS)
    vectorstore.save_local(VECTOR_STORE_PATH)

    return vectorstore
# Build the index immediately: every run of this cell re-reads the PDFs and
# rewrites VECTOR_STORE_PATH.
# NOTE(review): load_vectorstore() (defined in a later cell) reuses a saved
# index when one exists — consider using it here once it is defined earlier.
vectorstore = create_vectorstore()


In [None]:
def load_vectorstore():
    """Return the FAISS vector store, loading it from disk when available.

    If a saved index (``index.faiss`` and ``index.pkl``, the default file
    names written by ``save_local``) exists under ``VECTOR_STORE_PATH`` it is
    loaded with ``EMBEDDINGS``; otherwise the store is built from scratch with
    ``create_vectorstore``.

    Returns:
        The loaded or newly created FAISS vector store.
    """
    # Check for the index files themselves: the directory alone may exist but
    # be empty or hold a partially written index.
    index_files = ("index.faiss", "index.pkl")
    has_saved_index = all(
        os.path.exists(os.path.join(VECTOR_STORE_PATH, file_name))
        for file_name in index_files
    )
    if not has_saved_index:
        return create_vectorstore()

    # SECURITY: the docstore is stored as a pickle, and recent
    # langchain_community releases refuse to load it unless this flag is set.
    # Only load indexes written by this notebook; never point
    # VECTOR_STORE_PATH at untrusted files.
    return FAISS.load_local(
        VECTOR_STORE_PATH,
        EMBEDDINGS,
        allow_dangerous_deserialization=True,
    )

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the documents' texts joined by two newlines.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

In [58]:
from langchain import hub

# Pull the "rlm/rag-prompt" prompt via langchain's hub module; it is the prompt
# stage of the RAG chain built in a later cell.
# NOTE(review): presumably expects "context" and "question" inputs (the keys
# the chain supplies) — verify against the pulled prompt.
prompt = hub.pull("rlm/rag-prompt")



In [59]:
from langchain_community.chat_models import ChatOllama

# Chat model used as the generation stage of the RAG chain.
# NOTE(review): presumably needs a local Ollama server with "llama3.1:8b"
# already pulled — confirm before running.
llm = ChatOllama(model="llama3.1:8b")

In [60]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# Retriever over the FAISS index, using the library's default search settings.
retriever = vectorstore.as_retriever()

# Answer-generation sub-chain. It receives a mapping with "context" (the
# retrieved documents) and "question", replaces "context" with the joined
# document text, then runs prompt -> LLM -> plain-string parser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=lambda inputs: format_docs(inputs["context"]))
    | prompt
    | llm
    | StrOutputParser()
)

# Top-level chain: the raw query goes to the retriever ("context") and is also
# forwarded unchanged ("question"); "answer" is then added alongside them so
# callers keep access to the source documents.
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [62]:
def main(query="단테 신곡 동화 작성해주세요"):
    """Run one question through the RAG chain and print the answer and its sources.

    Args:
        query: The question passed to ``rag_chain_with_source``. Defaults to
            the notebook's original sample prompt, so ``main()`` behaves as
            before.
    """
    response = rag_chain_with_source.invoke(query)

    print("Answer:\n", response["answer"] + "\n")
    print("Sources:")
    # Print the metadata of each retrieved document, one per line.
    for doc in response["context"]:
        print(doc.metadata)

# Run the demo when this code is the entry point. The recorded output below
# this cell shows that the guard passes when the cell is run in the notebook.
if __name__ == "__main__":
    main()

Answer:
 이 이야기의 주제는 복수와 그에 대한 결과입니다.蒙特克리스토 백작은 자신을 배신한 사람들에게 정의를 실현하기 위해 계획을 세웠지만, 결국 자신이 외롭고 슬플 것이라는 사실을 깨달았습니다.

Sources:
{'source': 'pdfs/FairyTale1.pdf', 'page': 9}
{'source': 'pdfs/FairyTale1.pdf', 'page': 8}
{'source': 'pdfs/FairyTale2.pdf', 'page': 35}
{'source': 'pdfs/FairyTale1.pdf', 'page': 53}
