# 1. RAG chain 구현 구문

In [2]:
# RAG chain 설계 및 LLM 연동을 위한 모듈
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate, ChatMessagePromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough, RunnableWithMessageHistory
from langchain_community.tools import TavilySearchResults

# Evaluation algorithm modules
from langchain_core.output_parsers import JsonOutputParser,StrOutputParser
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.metrics import LLMContextRecall, Faithfulness, LLMContextPrecisionWithReference, AnswerRelevancy

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

from pydantic import BaseModel, Field


# 메모리 관련 모듈
from langchain_core.chat_history import InMemoryChatMessageHistory



from textwrap import dedent
from operator import itemgetter
from pprint import pprint
import random
from dotenv import load_dotenv
load_dotenv()


# Configuration for the persisted Chroma collection and the embedding model.
COLLECTION_NAME = "bluer_db_openai"  # passed to Chroma as collection_name
PERSIST_DIRECTORY = "vector_store/chroma/bluer_db"  # passed to Chroma as persist_directory
EMBEDDING_MODEL_NAME = "text-embedding-3-small"
# Shared embeddings instance; handed to Chroma as its embedding function.
EMBEDDING_MODEL = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

In [19]:
########################################################
# Load data from the vector DB
########################################################

# Connect to the Chroma collection configured above.
vector_store = Chroma(
    embedding_function=EMBEDDING_MODEL,
    collection_name=COLLECTION_NAME,
    persist_directory=PERSIST_DIRECTORY,
)

# Inspect the stored contents. The collection is fetched once through the
# public `get` API; the previous code called the private `_collection.get()`
# twice, retrieving the whole collection two times.
stored = vector_store.get(include=["documents", "metadatas"])
documents = stored["documents"]
metadatas = stored["metadatas"]

print(f"Documents: {documents[:5]}")
print(f"Metadatas: {metadatas[:5]}")

Documents: ['foodDetailTypes: 스시\nheaderInfo_nameKR: 스시조\nheaderInfo_nameEN: Sushi Cho\nheaderInfo_nameCN: \nheaderInfo_bookYear: 2025\nheaderInfo_ribbonType: 3\ndefaultInfo_chefName: \ndefaultInfo_phone: 02-317-0373\ndefaultInfo_openHours: \ndefaultInfo_closeHours: \ndefaultInfo_openHoursWeekend: \ndefaultInfo_closeHoursWeekend: \ndefaultInfo_dayOff: 연중무휴\ndefaultInfo_app2Yn: False\nstatusInfo_parking: 가능\nstatusInfo_creditCard: y\nstatusInfo_visit: 웨스틴조선호텔 20층\nstatusInfo_menu: 런치(Hall)(1인 15만5천원~20만5천원), 디너(Hall)(1인 19만4천원~33만원), 스시조회덮밥(10만원, 프리미엄 13만원), 복가라아게돌솥밥(12만원), 굴돌솥밥(7만8천원), 활새우튀김(11만원), 조리장특선모둠스시(14만5천원)\nstatusInfo_priceRange: 25만원 이상\nstatusInfo_openDate: 2008년\nstatusInfo_businessHours: 12:00~15:00/17:30~22:00(마지막 주문 21:30)\njuso_detailAddress: 웨스틴조선호텔 20층\njuso_roadAddrPart1: 서울특별시 중구 소공로 106\njuso_engAddr: 106, Sogong-ro, Jung-gu, Seoul\njuso_bdNm: 서울 웨스틴조선호텔\njuso_siNm: 서울특별시\njuso_sggNm: 중구\njuso_emdNm: 소공동\njuso_liNm: \njuso_rn: 소공로\njuso_buldMnnm: 106\njuso_buldSln

In [None]:
# Load and preprocess the CSV data, then turn every row into a Document.
# These two names are used below but were not imported anywhere in the
# notebook, so the cell failed with NameError before reaching the data.
import pandas as pd
from langchain_core.documents import Document

# NOTE(review): `csv_path` is not defined in the visible part of this notebook;
# it must be set to the source CSV file before this cell is run.
data = pd.read_csv(csv_path)
data = data.fillna("")  # replace NaN values with empty strings

# Use all of the data: each row becomes one document.
#   page_content: every column rendered as "column: value", one per line
#   metadata:     the full row as a dict (narrow to key columns if needed)
# NOTE: this rebinds `documents`, which earlier held the raw strings read from
# the vector store.
documents = [
    Document(
        page_content="\n".join(f"{col}: {val}" for col, val in row.items()),
        metadata=row.to_dict(),
    )
    for _, row in data.iterrows()
]

print(f"총 {len(documents)}개의 문서가 생성되었습니다.")

In [23]:
# Pretty-print the whole `documents` list for inspection.
# (pprint is also imported in the first cell; this re-import is redundant but harmless.)
from pprint import pprint 
pprint(documents)

['foodDetailTypes: 스시\n'
 'headerInfo_nameKR: 스시조\n'
 'headerInfo_nameEN: Sushi Cho\n'
 'headerInfo_nameCN: \n'
 'headerInfo_bookYear: 2025\n'
 'headerInfo_ribbonType: 3\n'
 'defaultInfo_chefName: \n'
 'defaultInfo_phone: 02-317-0373\n'
 'defaultInfo_openHours: \n'
 'defaultInfo_closeHours: \n'
 'defaultInfo_openHoursWeekend: \n'
 'defaultInfo_closeHoursWeekend: \n'
 'defaultInfo_dayOff: 연중무휴\n'
 'defaultInfo_app2Yn: False\n'
 'statusInfo_parking: 가능\n'
 'statusInfo_creditCard: y\n'
 'statusInfo_visit: 웨스틴조선호텔 20층\n'
 'statusInfo_menu: 런치(Hall)(1인 15만5천원~20만5천원), 디너(Hall)(1인 19만4천원~33만원), '
 '스시조회덮밥(10만원, 프리미엄 13만원), 복가라아게돌솥밥(12만원), 굴돌솥밥(7만8천원), 활새우튀김(11만원), '
 '조리장특선모둠스시(14만5천원)\n'
 'statusInfo_priceRange: 25만원 이상\n'
 'statusInfo_openDate: 2008년\n'
 'statusInfo_businessHours: 12:00~15:00/17:30~22:00(마지막 주문 21:30)\n'
 'juso_detailAddress: 웨스틴조선호텔 20층\n'
 'juso_roadAddrPart1: 서울특별시 중구 소공로 106\n'
 'juso_engAddr: 106, Sogong-ro, Jung-gu, Seoul\n'
 'juso_bdNm: 서울 웨스틴조선호텔\n'
 'juso_siNm: 서울

In [None]:
# Chat model shared by the chains built in the rest of this notebook.
model = ChatOpenAI(model="gpt-4o-mini")


#########################################################
# Per-session chat history store (InMemoryChatMessageHistory)
#########################################################

store = {} 
# key: session_id, value: InMemoryChatMessageHistory
# (per the original note, the history class has no per-session storage of its own,
#  so sessions are tracked in this dict)

def get_session_history(session_id):
    """
    Return the chat message history object that belongs to `session_id`.

    The module-level `store` dict is consulted first; when the session has no
    history yet, a new InMemoryChatMessageHistory is created, saved in `store`
    under that id, and returned.
    """
    history = store.get(session_id)
    if history is None:
        # First request for this session: create and register its history.
        history = InMemoryChatMessageHistory()
        store[session_id] = history
    return history



# NOTE(review): `prompt_template` is not defined in the visible part of this notebook — confirm
# it is created elsewhere. To match the keys below it needs a "query" input variable and a
# "history" messages placeholder.
runnable = prompt_template | model

# Chain + chat message history => a conversation whose messages are stored and managed.
# NOTE: the name `chain` is rebound later in this file to the final answer chain.
chain = RunnableWithMessageHistory(
    runnable= runnable, # the chain object (RunnableSequence)
    get_session_history=get_session_history, # function returning the chat history object for a session_id
    input_messages_key= 'query',    # prompt_template variable that receives the user input
    history_messages_key='history'  # prompt_template variable that receives the conversation history
)

########################################
# Convert the question into an embedding vector
########################################

query = "How much the bus ticket price?"
# Use the EMBEDDING_MODEL instance created in the config cell. The previous
# name `embedding_model` is not defined anywhere in the visible notebook and
# raised NameError.
embedding_query = EMBEDDING_MODEL.embed_query(query)  # embeds a single sentence
print(type(embedding_query), len(embedding_query))


############################################################
# retriever
############################################################


# Build the retriever that feeds the map-reduce chain defined further down.
# It uses MMR search with k=5, fetch_k=10 and lambda_mult=0.5.
# NOTE(review): the metadata filter only keeps entries whose "source" equals
# "data/bluer.txt". Confirm the stored metadata really has a "source" field;
# if the collection was built from CSV rows (metadata = row.to_dict()), this
# filter may match nothing.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k":5, "fetch_k":10, "lambda_mult":0.5, "filter":{"source":"data/bluer.txt"} }
)


# Prompt for the "map" step: given one document chunk ({context}) and the user
# question ({question}), the model is asked to return the relevant text verbatim,
# or '' when nothing in the chunk is relevant.
map_doc_prompt = ChatPromptTemplate.from_messages([
    ("system",  """
Use the following portion of a long document to see if any of the text is relevant to answer the question. 
Return any relevant text verbatim. If there is no relevant text, return : ''
-------
{context}
"""),
    ("human", "{question}"),
])



# Chain that checks the relevance between a question and a single document.
map_doc_chain = map_doc_prompt | model
# Example calls (unrelated context vs. related context):
# map_doc_chain.invoke({"context":"Are apples delicious?", "question":"Tell me about the Olympic events."})
# map_doc_chain.invoke({"context":"The Olympics has 300 events.", "question":"Tell me about the Olympic events."})

## retriever로 문서 조회 -> map_doc_chain으로 관련문서를 찾기 
def map_doc(inputs):
    """
    Map step: collect the parts of the retrieved documents relevant to the question.

    Meant to be wrapped in a RunnableLambda. Every retrieved document is sent,
    together with the question, to `map_doc_chain`, which returns the relevant
    text or an empty marker when the document is unrelated. Only non-empty
    replies are kept.

    Parameters
        inputs: dict with two keys
            "documents": list of Document objects returned by the retriever
            "question":  the question string
            e.g. produced by {"documents": retriever, "question": RunnablePassthrough()}
    Returns
        str: the relevant excerpts in retrieval order, each followed by a blank
             line; an empty string when nothing relevant was found.
    """
    docs = inputs["documents"]
    question = inputs["question"]
    if not docs:
        return ""

    # One batched call instead of a sequential invoke per document; results
    # come back in the same order as the inputs.
    responses = map_doc_chain.batch(
        [{"context": doc.page_content, "question": question} for doc in docs]
    )

    # The map prompt tells the model to answer '' when a document is unrelated.
    # Drop those replies (and blank ones) so the context holds relevant text only.
    no_match_markers = ("''", '""')
    excerpts = []
    for response in responses:
        text = response.content  # AIMessage.content
        stripped = text.strip()
        if stripped and stripped not in no_match_markers:
            excerpts.append(text)

    # Same layout as before: every excerpt is followed by a blank line.
    return "".join(f"{text}\n\n" for text in excerpts)

# Map-reduce retrieval chain: the input question goes to the retriever (-> "documents") and is
# also passed through unchanged (-> "question"); map_doc then turns both into one context string.
map_reduce_chain = {"documents":retriever, "question":RunnablePassthrough()} | RunnableLambda(map_doc)

#### Map_reduce 확인

In [None]:
# Run the map step on its own and print the resulting context string.
# (The query asks for Korean restaurants in Seoul with two or more ribbons.)
r = map_reduce_chain.invoke("리본 두개 이상인 서울 한식집을 알려주세요.")
print(r)

#### 최종 답변 

In [None]:
# Prompt for the final ("reduce") step: the extracted excerpts are inserted as
# {context} in the system message and the user question as {question}. The model
# is told to say it does not know rather than make up an answer.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

# Final answer chain. The incoming question is fanned out to the map-reduce
# chain (-> "context") and passed through unchanged (-> "question"); the result
# fills final_prompt, is sent to the model, and is parsed into a plain string.
_final_inputs = {"context": map_reduce_chain, "question": RunnablePassthrough()}
chain = _final_inputs | final_prompt | model | StrOutputParser()