# RAG using Upstage Document Parse and Groundedness Check
使用 Upstage 文档解析和接地性检查的 RAG

Upstage 是一家领先的人工智能 (AI) 公司，专门提供性能超越人类水平的 LLM 组件。LangChain 通过 `langchain-upstage` 包集成了 Upstage 模块。

This example illustrates RAG using [Upstage](https://python.langchain.com/docs/integrations/providers/upstage/) Document Parse and Groundedness Check.

In [2]:
from typing import List

from dotenv import load_dotenv
load_dotenv()

from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.base import RunnableSerializable
from langchain_upstage import (
    ChatUpstage,
    UpstageDocumentParseLoader,
    UpstageEmbeddings,
    UpstageGroundednessCheck,
)

# Chat model used later to generate answers; constructed with default settings.
model = ChatUpstage()

# PDF files to parse.
# NOTE(review): these are absolute, machine-specific paths — adjust before
# running this notebook in another environment.
files = ["D:/Code/langchain_sample/langchain_cookbook/docs/15天AI Agent工程师面试学习路线-2f931c0bb2.pdf", 
         "D:/Code/langchain_sample/langchain_cookbook/docs/美团10.1红包活动分镜脚本及制作指导-83a64b0abb.pdf"]

# Hand both files to the Upstage Document Parse loader with split="element".
# Presumably this yields one Document per parsed layout element — confirm
# against the loader documentation.
loader = UpstageDocumentParseLoader(file_path=files, split="element")

# Run the parse and keep the resulting documents for the cells below.
docs = loader.load()



In [3]:
# Sanity check: show the container type returned by the loader.
print(f"docs type {type(docs)}")

docs type <class 'list'>


In [4]:
# Dump every parsed document (its string form, one per line) for inspection.
for doc in docs:
    print(doc)

page_content='<p id='0' data-category='list'></p>' metadata={'id': 0, 'page': 1, 'category': 'list', 'coordinates': [{'x': 0.1155, 'y': 0.4467}, {'x': 0.526, 'y': 0.4467}, {'x': 0.526, 'y': 0.515}, {'x': 0.1155, 'y': 0.515}]}
page_content='<p id='1' data-category='list'></p>' metadata={'id': 1, 'page': 1, 'category': 'list', 'coordinates': [{'x': 0.1158, 'y': 0.5557}, {'x': 0.4494, 'y': 0.5557}, {'x': 0.4494, 'y': 0.5985}, {'x': 0.1158, 'y': 0.5985}]}
page_content='<p id='2' data-category='list'></p>' metadata={'id': 2, 'page': 1, 'category': 'list', 'coordinates': [{'x': 0.1132, 'y': 0.7154}, {'x': 0.7287, 'y': 0.7154}, {'x': 0.7287, 'y': 0.784}, {'x': 0.1132, 'y': 0.784}]}
page_content='<p id='3' data-category='list'></p>' metadata={'id': 3, 'page': 1, 'category': 'list', 'coordinates': [{'x': 0.115, 'y': 0.8243}, {'x': 0.6305, 'y': 0.8243}, {'x': 0.6305, 'y': 0.8693}, {'x': 0.115, 'y': 0.8693}]}
page_content='<p id='4' data-category='paragraph' style='font-size:14px'>asyncio</p>' me

In [6]:

# Index the parsed documents in an in-memory vector store using Upstage
# embeddings.
vectorstore = DocArrayInMemorySearch.from_documents(
    docs, embedding=UpstageEmbeddings(model="solar-embedding-1-large")
)
# Similarity search returning the top 5 matches per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()

# Keep the question in one place so retrieval and generation cannot drift
# apart.
question = "一般几天后可以编写完整的agent？"
retrieved_docs = retriever.invoke(question)

groundedness_check = UpstageGroundednessCheck()

# The chain is identical for every attempt, so build it once outside the loop.
chain: RunnableSerializable = RunnablePassthrough() | prompt | model | output_parser

# Upper bound on regeneration attempts. Without it, a question the context
# cannot answer would loop forever and keep issuing paid API calls.
MAX_ATTEMPTS = 5

groundedness = ""
for _attempt in range(MAX_ATTEMPTS):
    # Generate an answer restricted to the retrieved context.
    result = chain.invoke(
        {
            "context": retrieved_docs,
            "question": question,
        }
    )

    # Ask Upstage whether the answer is supported by that same context.
    groundedness = groundedness_check.invoke(
        {
            "context": retrieved_docs,
            "answer": result,
        }
    )

    if groundedness == "grounded":
        break
else:
    # Best effort: keep the last answer and verdict available to later cells
    # instead of raising, but make the failure visible.
    print(
        f"No grounded answer after {MAX_ATTEMPTS} attempts; "
        f"last verdict: {groundedness}"
    )

In [7]:
# Show the answer produced by the generation loop.
print(result)

根据所提供的上下文，项目实战阶段持续3天，在这个阶段预计可以编写完整的Agent。


In [8]:
# Show the final verdict returned by the groundedness check.
print(groundedness)

grounded
