# RAG pipeline with Unstructured files

In [1]:
# Install Requirements
!apt-get -qq install poppler-utils tesseract-ocr
%pip install -q --user --upgrade pillow
%pip install -q unstructured["all-docs"]==0.12.5

Selecting previously unselected package poppler-utils.
(Reading database ... 123595 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Selecting previously unselected package tesseract-ocr-eng.
Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...
Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...
Selecting previously unselected package tesseract-ocr-osd.
Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...
Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...
Selecting previously unselected package tesseract-ocr.
Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...
Unpacking tesseract-ocr (4.1.1-2.1build1) ...
Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...
Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up tess

# Example: PDF file

In [2]:
from unstructured.partition.pdf import partition_pdf

In [9]:
# Partition the sample PDF into elements using the "fast" strategy.
filename = "RAG_best_practices1-2.pdf"
pdf_elements = partition_pdf(
    strategy="fast",
    filename=filename,
)

In [10]:
from collections import Counter

# Tally how many parsed elements there are of each element class.
element_type_counts = Counter(map(type, pdf_elements))
display(element_type_counts)

Counter({unstructured.documents.elements.Text: 5,
         unstructured.documents.elements.Title: 11,
         unstructured.documents.elements.NarrativeText: 8,
         unstructured.documents.elements.Footer: 2,
         unstructured.documents.elements.ListItem: 2})

In [44]:
# Print every parsed element as "CATEGORY: text", one element per line.
for pdf_element in pdf_elements:
    category_label = pdf_element.category.upper()
    print(f"{category_label}: {pdf_element.text}")

UNCATEGORIZEDTEXT: 4 2 0 2
TITLE: l u J
UNCATEGORIZEDTEXT: 1
TITLE: ] L C . s c [
UNCATEGORIZEDTEXT: 1 v 9 1 2 1 0 . 7 0 4 2 : v i X r a
TITLE: Searching for Best Practices in Retrieval-Augmented Generation
UNCATEGORIZEDTEXT: Xiaohua Wang, Zhenghua Wang, Xuan Gao, Feiran Zhang, Yixin Wu, Zhibo Xu, Tianyuan Shi, Zhengyuan Wang, Shizheng Li, Qi Qian, Ruicheng Yin, Changze Lv, Xiaoqing Zheng∗, Xuanjing Huang School of Computer Science, Fudan University, Shanghai, China Shanghai Key Laboratory of Intelligent Information Processing {xiaohuawang22,zhenghuawang23}@m.fudan.edu.cn {zhengxq,xjhuang}@fudan.edu.cn
TITLE: Abstract
NARRATIVETEXT: Retrieval-augmented generation (RAG) techniques have proven to be effective in integrating up-to-date information, mitigating hallucinations, and enhancing response quality, particularly in specialized domains. While many RAG approaches have been proposed to enhance large language models through query-dependent retrievals, these approaches still suffer from

### Document Layout Detection (DLD) Method

In [12]:
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from unstructured.staging.base import dict_to_elements

In [13]:
import os
from getpass import getpass

from unstructured_client import UnstructuredClient

# Never store a real key in the notebook: read it from the environment and
# fall back to an interactive (non-echoing) prompt when the variable is unset.
UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY") or getpass(
    "Unstructured API key: "
)

client = UnstructuredClient(
    api_key_auth=UNSTRUCTURED_API_KEY,
    server='free-api'
)


In [14]:
# Read the PDF into memory and wrap it in the SDK's file payload type.
with open(filename, "rb") as f:
    files = shared.Files(
        content=f.read(),
        file_name=filename,
    )

# Request partitioning with the "hi_res" strategy and the "yolox" model.
req = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    hi_res_model_name="yolox",
)

try:
    resp = client.general.partition(req)
except SDKError as e:
    # Show the API error, then re-raise: swallowing it would leave
    # `dld_elements` undefined (or stale from an earlier run) for later cells.
    print(e)
    raise

# Convert the element dicts from the response into element objects.
dld_elements = dict_to_elements(resp.elements)

In [19]:
# Preview the first 20 elements produced by the layout-detection request.
preview_elements = dld_elements[:20]
for dld_element in preview_elements:
    print(f"{dld_element.category.upper()}: {dld_element.text}")

TITLE: Searching for Best Practices in Retrieval-Augmented Generation
HEADER: 4 2 0 2 l u J 1 ] L C . s c [ 1 v 9 1 2 1 0 . 7 0 4 2 : v i X r a
UNCATEGORIZEDTEXT: Xiaohua Wang, Zhenghua Wang, Xuan Gao, Feiran Zhang, Yixin Wu, Zhibo Xu, Tianyuan Shi, Zhengyuan Wang, Shizheng Li, Qi Qian, Ruicheng Yin, Changze Lv, Xiaoqing Zheng∗, Xuanjing Huang School of Computer Science, Fudan University, Shanghai, China Shanghai Key Laboratory of Intelligent Information Processing {xiaohuawang22,zhenghuawang23}@m.fudan.edu.cn {zhengxq,xjhuang}@fudan.edu.cn
TITLE: Abstract
NARRATIVETEXT: Retrieval-augmented generation (RAG) techniques have proven to be effective in integrating up-to-date information, mitigating hallucinations, and enhancing response quality, particularly in specialized domains. While many RAG approaches have been proposed to enhance large language models through query-dependent retrievals, these approaches still suffer from their complex implementation and prolonged response times. Typ

In [16]:
import json

# Save the elements to a JSON file, keeping only each one's category and text.
data = [
    {"category": element.category, "text": element.text}
    for element in dld_elements
]
with open("elementos.json", "w") as outfile:
    outfile.write(json.dumps(data))

In [17]:
from IPython.display import display

# Read the saved JSON back and render each record as its own output.
with open("elementos.json", "r") as infile:
    data = json.loads(infile.read())

for record in data:
    display(record)


{'category': 'Title',
 'text': 'Searching for Best Practices in Retrieval-Augmented Generation'}

{'category': 'Header',
 'text': '4 2 0 2 l u J 1 ] L C . s c [ 1 v 9 1 2 1 0 . 7 0 4 2 : v i X r a'}

{'category': 'UncategorizedText',
 'text': 'Xiaohua Wang, Zhenghua Wang, Xuan Gao, Feiran Zhang, Yixin Wu, Zhibo Xu, Tianyuan Shi, Zhengyuan Wang, Shizheng Li, Qi Qian, Ruicheng Yin, Changze Lv, Xiaoqing Zheng∗, Xuanjing Huang School of Computer Science, Fudan University, Shanghai, China Shanghai Key Laboratory of Intelligent Information Processing {xiaohuawang22,zhenghuawang23}@m.fudan.edu.cn {zhengxq,xjhuang}@fudan.edu.cn'}

{'category': 'Title', 'text': 'Abstract'}

{'category': 'NarrativeText',
 'text': 'Retrieval-augmented generation (RAG) techniques have proven to be effective in integrating up-to-date information, mitigating hallucinations, and enhancing response quality, particularly in specialized domains. While many RAG approaches have been proposed to enhance large language models through query-dependent retrievals, these approaches still suffer from their complex implementation and prolonged response times. Typically, a RAG workflow involves multiple processing steps, each of which can be executed in various ways. Here, we investigate existing RAG approaches and their potential combinations to identify optimal RAG practices. Through extensive experiments, we suggest several strategies for deploying RAG that balance both performance and efficiency. Moreover, we demonstrate that multimodal retrieval techniques can significantly enhance question-answering capabilities about visual inputs and accelerate the generation of multimodal content us

{'category': 'Title', 'text': '1 Introduction'}

{'category': 'NarrativeText',
 'text': 'Generative large language models are prone to producing outdated information or fabricating facts, although they were aligned with human preferences by reinforcement learning [1] or lightweight alternatives [2–5]. Retrieval-augmented generation (RAG) techniques address these issues by com- bining the strengths of pretraining and retrieval-based models, thereby providing a robust framework for enhancing model performance [6]. Furthermore, RAG enables rapid deployment of applications for specific organizations and domains without necessitating updates to the model parameters, as long as query-related documents are provided.'}

{'category': 'NarrativeText',
 'text': 'Many RAG approaches have been proposed to enhance large language models (LLMs) through query-dependent retrievals [6–8]. A typical RAG workflow usually contains multiple intervening processing steps: query classification (determining whether retrieval is necessary for a given input query), retrieval (efficiently obtaining relevant documents for the query), reranking (refining the order of retrieved documents based on their relevance to the query), repacking (organizing the retrieved documents into a structured one for better generation), summarization (extracting key information for response generation from the repacked document and eliminating redundancies) modules. Implementing RAG also requires decisions on the ways to properly split documents into chunks, the types of embeddings to use for semantically representing these chunks, the choice of'}

{'category': 'Title', 'text': '∗Corresponding Author.'}

{'category': 'NarrativeText', 'text': 'Preprint. Under review.'}

{'category': 'Title', 'text': 'Evalu'}

{'category': 'Title',
 'text': 'General Performance Specific Domains Retrieval Capability'}

{'category': 'Title', 'text': 'Fine-tune'}

{'category': 'Title',
 'text': 'rb Random + Normal — = @@ (" Summarization ) S ummarization'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'ListItem', 'text': ''}

{'category': 'Title', 'text': 'Extractive'}

{'category': 'ListItem', 'text': ''}

{'category': 'ListItem', 'text': ''}

{'category': 'Title', 'text': 'Recomp BM25'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'Title', 'text': 'Contriever'}

{'category': 'Title',
 'text': 'Abstractive LongLLMlingua + SelectiveContext o Recomp P— — Repacking I - S'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'UncategorizedText', 'text': '«+'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'Title', 'text': 'Sides Forward'}

{'category': 'ListItem', 'text': 'Reverse ~'}

{'category': 'UncategorizedText', 'text': '@'}

{'category': 'Image', 'text': ''}

{'category': 'NarrativeText',
 'text': '—— - "7 7[ 7 Retrieval Source | Query Classification [ s M, Chunking « Chunking Size * Small2big + Sliding Windows ReHIeve ) é [ S5 ___ edder « intfloat/e5 « BAAI/bge Jina-embeddings-v2 Gte all-mpnet-base-v2 Database . __Vector o Weaviate Qdrant S ===t - - Chroma'}

{'category': 'Title', 'text': 'Retrieval'}

{'category': 'UncategorizedText', 'text': '['}

{'category': 'Title', 'text': 'Original Query BM25'}

{'category': 'ListItem', 'text': ''}

{'category': 'Title', 'text': 'Contriever'}

{'category': 'UncategorizedText', 'text': '«'}

{'category': 'UncategorizedText',
 'text': 'LLM-Embedder Query Rewriting Query Decomposition HyDE Hybrid Search ch \\ H DE+H‘ brid Sear %'}

{'category': 'ListItem', 'text': ''}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'UncategorizedText', 'text': '«'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'NarrativeText', 'text': 's - S| Reranking'}

{'category': 'Title', 'text': 'DLM-based'}

{'category': 'Title', 'text': 'us'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'Title', 'text': 'Faiss'}

{'category': 'Title', 'text': 'monoT5'}

{'category': 'ListItem', 'text': ''}

{'category': 'Title', 'text': 'monoBERT RankLLaMA'}

{'category': 'ListItem', 'text': ''}

{'category': 'ListItem', 'text': ''}

{'category': 'ListItem', 'text': ''}

{'category': 'UncategorizedText', 'text': '«'}

{'category': 'Title', 'text': 'TILDE'}

{'category': 'UncategorizedText', 'text': '+'}

{'category': 'UncategorizedText', 'text': '_——'}

{'category': 'FigureCaption',
 'text': 'Figure 1: Retrieval-augmented generation workflow. This study investigates the contribution of each component and provides insights into optimal RAG practices through extensive experimentation. The optional methods considered for each component are indicated in bold fonts, while the methods underlined indicate the default choice for individual modules. The methods indicated in blue font denote the best-performing selections identified empirically.'}

{'category': 'NarrativeText',
 'text': 'vector databases to efficiently store feature representations, and the methods for effectively fine-tuning LLMs (see Figure 1).'}

{'category': 'NarrativeText',
 'text': 'What adds complexity and challenge is the variability in implementing each processing step. For example, in retrieving relevant documents for an input query, various methods can be employed. One approach involves rewriting the query first and using the rewritten queries for retrieval [9]. Alternatively, pseudo-responses to the query can be generated first, and the similarity between these pseudo-responses and the backend documents can be compared for retrieval [10]. Another option is to directly employ embedding models, typically trained in a contrastive manner using positive and negative query-response pairs [11, 12]. The techniques chosen for each step and their combinations significantly impact both the effectiveness and efficiency of RAG systems. To the best of our knowledge, there has been no systematic effort to pursue the optimal implementation of RAG, particularly for the entire RAG workflow.'}

{'category': 'NarrativeText',
 'text': 'In this study, we aim to identify the best practices for RAG through extensive experimentation. Given the infeasibility of testing all possible combinations of these methods, we adopt a three-step approach to identify optimal RAG practices. First, we compare representative methods for each RAG step (or module) and select up to three of the best-performing methods. Next, we evaluate the impact of each method on the overall RAG performance by testing one method at a time for an individual step, while keeping the other RAG modules unchanged. This allows us to determine the most effective method for each step based on its contribution and interaction with other modules during response generation. Once the best method is chosen for a module, it is used in subsequent experiments. Finally, we empirically explore a few promising combinations suitable for different application scenarios where efficiency might be prioritized over performance, or vice versa

{'category': 'NarrativeText',
 'text': 'The contributions of this study are three-fold:'}

{'category': 'ListItem',
 'text': '• Through extensive experimentation, we thoroughly investigated existing RAG approaches and their combinations to identify and recommend optimal RAG practices.'}

{'category': 'Footer', 'text': '2'}

# RAG pipeline with LangChain

In [20]:
! pip install langchain langchain_community langchain_chroma

Collecting langchain
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.13-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.3-py3-none-any.whl.metadata (1.5 kB)
Collecting langchain-core<0.3.0,>=0.2.35 (from langchain)
  Downloading langchain_core-0.2.35-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.106-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting chromadb!=0.5.4,!=0.5.5,<0.6.0,>=0.4.0 (from langchain_chroma)
  Downloading chromadb-0.5.3-py3-none-any.whl.metadata (6.8 kB)
Collecting fastapi<1,>=0.95.2 (from langchain_chro

In [21]:
! pip install -qU langchain-openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/362.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.9/362.9 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/318.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [23]:
!pip install jq

Collecting jq
  Downloading jq-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Downloading jq-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (737 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/737.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m348.2/737.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m737.4/737.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jq
Successfully installed jq-1.8.0


In [24]:
from langchain_community.document_loaders import JSONLoader

In [30]:
# Load elementos.json: iterate the top-level array ('.[]') and take each
# item's "text" field as the document content.
loader = JSONLoader(
    content_key="text",
    jq_schema=".[]",
    file_path="elementos.json",
)

data = loader.load()

In [32]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [35]:
import os

# Do not hardcode credentials: the key is read from the environment, and a
# missing variable fails fast with KeyError. The endpoint keeps its previous
# value as the default but can be overridden through the environment.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://peplink.openai.azure.com/"
    ),
    openai_api_version="2024-02-01",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_deployment="chatExcel"
)

In [37]:
# Split the loaded documents into overlapping chunks, then embed the chunks
# and store them in a Chroma vector store.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(data)
vectorstore = Chroma.from_documents(
    embedding=embeddings,
    documents=splits,
)

In [41]:
import os

# Endpoint and key come from the environment instead of literals in the
# notebook; a missing variable fails fast with KeyError.
chat = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version="2024-02-01",
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    model="GPT4o",
    temperature=0
)

In [45]:
retriever = vectorstore.as_retriever()


def format_docs(docs):
    """Join the text of the retrieved documents into one plain-text context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Prompt (in Portuguese): act as an expert assistant, answer using the
# retrieved excerpts, and say so when the answer is not known.
template = """
    Você é um assistente especialista, use os seguintes trechos recuperados \
    para responder as perguntas.
    Se não souber a resposta, diga que não sabe.

    Pergunta: {pergunta}
    Contexto: {contexto}
    Resposta:
"""
prompt = ChatPromptTemplate.from_template(template)
rag_chain = (
        # Pipe the retriever through format_docs; otherwise the prompt receives
        # the repr of a list of Document objects instead of their text.
        {"contexto": retriever | format_docs, "pergunta": RunnablePassthrough()}
        | prompt
        | chat
        | StrOutputParser()
)

In [46]:
# Ask the chain to summarize the paper and print the generated answer.
query = "De um resumo do artigo Searching for Best Practices in Retrieval-Augmented Generation"
answer = rag_chain.invoke(query)
print(answer)

O artigo "Searching for Best Practices in Retrieval-Augmented Generation" investiga as melhores práticas para a geração aumentada por recuperação (RAG). As técnicas de RAG têm se mostrado eficazes na integração de informações atualizadas, mitigação de alucinações e melhoria da qualidade das respostas, especialmente em domínios especializados. No entanto, essas abordagens ainda enfrentam desafios devido à sua implementação complexa e tempos de resposta prolongados.

O estudo examina as contribuições de cada componente do fluxo de trabalho de RAG e fornece insights sobre práticas ideais por meio de experimentação extensiva. Métodos opcionais para cada componente são considerados, com as melhores seleções identificadas empiricamente. Além disso, o artigo sugere várias estratégias para implantar RAG que equilibram desempenho e eficiência. Também é demonstrado que técnicas de recuperação multimodal podem melhorar significativamente as capacidades de resposta a perguntas sobre entradas visua