# Simple RAG System Based on a PDF Knowledge Base

Uses LangChain and pypdf to read a PDF and build a simple RAG application.

## Before You Start

Install, Imports and Other Strange Animals

In [1]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import os
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

import sys
sys.path.append('../../')  # replace with the actual path
import lib.key_param as key_param
import lib.Utils as utils

In [2]:
# Configure LangSmith tracing and the OpenAI credentials through environment
# variables; both API keys are read from the project-local key_param module.
os.environ.update(
    {
        'LANGCHAIN_TRACING_V2': 'true',
        'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
        'LANGCHAIN_API_KEY': key_param.LANGSMITH_API_KEY,
        'OPENAI_API_KEY': key_param.OPENAI_API_KEY,
    }
)

## Pre-processing

### Load Documents (PDF)

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Point the loader at the sample PDF (the path is relative to this notebook).
loader = PyPDFLoader("../../sample_files/7cc24bccf34a87a9bc817fe89e320a112a9cec26b9c5db7250e1cef7dc06eff0.pdf")
# NOTE(review): load_and_split() presumably also runs a default text splitter
# over the loaded pages, so `pages` may not map 1:1 onto PDF pages; if only a
# per-page load is wanted, loader.load() may be the intended call - confirm.
pages = loader.load_and_split()

In [8]:
# Inspect the text of the third loaded document (index 2).
chunk1 = pages[2].page_content
print(chunk1)

3
AdoptedThe European Data Protection Board
Having regard to Article 70 (1 )(e) of the Regulation 2016/679/EU of the European Parliament and of
the Council of 27 April 2016 on the protection of natural persons with regard to the processing of
personal data and on the free movement of such data, and repealing Directiv e 95/46/EC .
HAS ADOPTED THE FOLL OWING GUIDELINES :
INTRODUCTION
The territorial scope of General Data Protection Regulation1(the GDPR) is determined by Article 3 of
the Regulation and represents a significant evolution of the EU data protection law compared to the
framework defined by Directive 95/46/EC2. In part, the GDPR confirms choices made by the EU
legislator and the Court of Justice of the European Union (CJEU) in the contex t of Directive 95/46/EC.
However, important new elements have been introduced. Most importantly, the main objective of
Article 4 of the Directive was to define which Member State’s national law is applicable ,whereas
Article 3 of the GDPR defi

In [13]:
# `docs` is just another name for the same list as `pages` (no copy is made);
# the bare expression displays it as the cell output.
docs = pages
docs

[Document(page_content='1\nAdopted\nGuidelines 3/2018onthe territorial scope of the GDPR\n(Article 3) -Version for public consultation\nAdopted on 16 November 2018', metadata={'source': '../../sample_files/7cc24bccf34a87a9bc817fe89e320a112a9cec26b9c5db7250e1cef7dc06eff0.pdf', 'page': 0}),
 Document(page_content='2\nAdoptedContents\nIntroduction ................................ ................................ ................................ ................................ .............3\n1Application of the establishment criterion -Art 3(1)................................ ................................ ......4\n2Application of the targeting criterion –Art 3(2)................................ ................................ ...........12\n3Processing in a place where Member State law applies by virtue of public international law ....19\n4Representative of controllers or processors not established in the Union ................................ ..19', metadata={'source': '../../sample

### Split

In [14]:
# Split the loaded documents into smaller chunks: chunk_size=1000 with
# chunk_overlap=200 (sizes presumably measured in characters - confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [15]:
# Character count of the third loaded document, before chunking.
len(pages[2].page_content)

3450

In [16]:
# Number of documents returned by the PDF loader.
len(pages)

27

In [17]:
# Character count of the third chunk produced by the splitter.
len(splits[2].page_content)

973

In [18]:
# Total number of chunks produced by the splitter.
len(splits)

102

## Embed

In [20]:
# Embed
# Build a Chroma vector store from the chunks, using OpenAI embeddings.
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

# Wrap the store as a retriever (no search options are set here, so the
# library defaults apply).
retriever = vectorstore.as_retriever()

## Retrieval and Generation

Load the prompt template

In [21]:
from langchain.prompts import ChatPromptTemplate

# Alternative kept for reference: pull a ready-made prompt from the LangChain hub.
# from langchain import hub
# prompt = hub.pull("usctrojan/in-house-legal")

# Prompt
# The template has two placeholders: {context} receives the retrieved text and
# {question} receives the user's question.
template_rag = """System Message:

You are a privacy consultant for a marketing company that operate internationally. Provide your legal advice based on the context.

Context: {context}

Question: {question}
"""

# Turn the raw template string into a chat prompt object usable in a chain.
prompt_rag = ChatPromptTemplate.from_template(template_rag)


In [65]:
# Show the RAG prompt with its placeholders highlighted.
prompt_rag.pretty_print()


System Message:

You are a privacy consultant for a marketing company that operate internationally. Provide your legal advice based on the context.

Context: [33;1m[1;3m{context}[0m

Question: [33;1m[1;3m{question}[0m


## Model selection

In [22]:
# Chat model used to generate answers; temperature=0 is chosen to keep the
# answers as repeatable as possible.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

## Post-processing

In [23]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` texts, in order, separated by one blank line.
    """
    separator = "\n\n"
    texts = [item.page_content for item in docs]
    return separator.join(texts)

## Chain

In [24]:
# RAG pipeline: the input question is sent to the retriever, whose documents
# are flattened into text by format_docs for "context", while
# RunnablePassthrough forwards the question unchanged as "question". The
# resulting dict fills the prompt, the prompt goes to the model, and the
# model's message is reduced to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_rag
    | llm
    | StrOutputParser()
)

## Question

In [25]:
# Candidate phrasings of the question that were tried:
# 1 (bad)    - What is the EDPB position in relation to the one-stop-shop mechanism?
# 1.2 (good) - What is the EDPB position in relation to the one-stop-shop mechanism provided by
#              Article 52 of the GDPR in the context of the geographical scope of application
#              of the GDPR (Art. 3(2)), the so-called targeting criterion?
#     -- (the "good" phrasing cites Article 52; Article 56 is the correct one)
# NOTE(review): the question below says "material scope", whereas the loaded
# guidelines are titled as covering the territorial scope of the GDPR -
# confirm the wording is intentional.

output_rag = rag_chain.invoke("In the context of the material scope of application of GDPR, what is the edpb position in relation to the one-stop shop mechanism?")

## Answer without RAG

In [26]:
from langchain.prompts import ChatPromptTemplate

# Prompt
# Baseline prompt: only the question is passed, with no retrieved context.
template_no_rag = """Question: {question}
"""

prompt_no_rag = ChatPromptTemplate.from_template(template_no_rag)

# Rebinds `llm` with the same model name and temperature as the model-selection cell.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Prompt -> model -> plain string, without any retrieval step.
chain_no_rag = prompt_no_rag | llm | StrOutputParser()

# Same question text as the RAG run, so the two answers can be compared.
output_no_rag = chain_no_rag.invoke({"question":"In the context of the material scope of application of GDPR, what is the edpb position in relation to the one-stop shop mechanism?"})

In [77]:
# Show the baseline (no-RAG) prompt with its placeholder highlighted.
prompt_no_rag.pretty_print()


Question: [33;1m[1;3m{question}[0m


## Compare the results

In [28]:
# Print the baseline answer; utils.format_print is a project helper
# (presumably wraps the text at the given width of 100 - confirm).
print("--- NO RAG ---\n")
print(utils.format_print(output_no_rag, 100))

--- NO RAG ---

The European Data Protection Board (EDPB) supports the one-stop shop mechanism as outlined in the
GDPR. This mechanism allows businesses operating in multiple EU countries to deal with just one lead
supervisory authority for data protection issues, rather than having to comply with multiple
authorities in each country where they operate. The EDPB believes that the one-stop shop mechanism
streamlines the regulatory process and ensures consistent enforcement of data protection laws across
the EU.


In [29]:
# Print the retrieval-augmented answer; utils.format_print is a project helper
# (presumably wraps the text at the given width of 100 - confirm).
print("--- RAG ---\n")
print(utils.format_print(output_rag, 100))

--- RAG ---

The EDPB confirms that in the absence of an establishment in the Union, a controller or processor
cannot benefit from the one-stop shop mechanism provided for in Article 56 of the GDPR. The GDPR’s
cooperation and consistency mechanism only applies to controllers and processors with an
establishment, or establishments, within the European Union. Controllers and processors will need to
take into account other applicable texts, such as EU or Member States’ sectorial legislation and
national laws, in addition to the GDPR.


# Citations

In [20]:
# Install/upgrade the packages used by the citation examples (-q quiet, -U upgrade).
%pip install -qU langchain langchain-openai langchain-anthropic langchain-community wikipedia

Note: you may need to restart the kernel to use updated packages.


Set up the model, retriever and prompt. In this example the source of text is Wikipedia, which we query with WikipediaRetriever.

In [25]:
from langchain_community.retrievers import WikipediaRetriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Chat model for the citation examples (rebinds `llm`).
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
# Wikipedia-backed retriever configured with top_k_results=6 and
# doc_content_chars_max=2000 (presumably: up to 6 articles, each truncated to
# 2000 characters - confirm against the retriever documentation).
wiki = WikipediaRetriever(top_k_results=6, doc_content_chars_max=2000)
# Two-message prompt: a system message carrying the article snippets in
# {context}, followed by the user's {question}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You're a helpful AI assistant. Given a user question and some Wikipedia article snippets, answer the user question. If none of the articles answer the question, just say you don't know.\n\nHere are the Wikipedia articles:{context}",
        ),
        ("human", "{question}"),
    ]
)
prompt.pretty_print()


You're a helpful AI assistant. Given a user question and some Wikipedia article snippets, answer the user question. If none of the articles answer the question, just say you don't know.

Here are the Wikipedia articles:[33;1m[1;3m{context}[0m


[33;1m[1;3m{question}[0m


Let's chain it all together. We need to add some logic for formatting the text from Wikipedia.

In [26]:
from operator import itemgetter
from typing import List

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)

def format_docs(docs: List[Document]) -> str:
    """Render documents as one string of titled article snippets.

    Each document contributes an "Article Title" line, read from
    ``metadata['title']``, followed by an "Article Snippet" line holding its
    ``page_content``. Entries are separated by a blank line, and the result
    also starts with a blank line.
    """
    entries = []
    for doc in docs:
        title = doc.metadata['title']
        entries.append(f"Article Title: {title}\nArticle Snippet: {doc.page_content}")
    gap = "\n\n"
    return gap + gap.join(entries)

# Runnable that takes the "docs" entry of the chain state and formats it to text.
# NOTE(review): this rebinds the name `format`, shadowing Python's builtin
# format() for the rest of the session - consider a different name.
format = itemgetter("docs") | RunnableLambda(format_docs)

# subchain for generating an answer once we've done retrieval
answer = prompt | llm | StrOutputParser()

# complete chain that calls wiki -> formats docs to string -> runs answer subchain -> returns just the answer and retrieved docs.
chain = (
    RunnableParallel(question=RunnablePassthrough(), docs=wiki)
    .assign(context=format)
    .assign(answer=answer)
    .pick(["answer", "docs"])
)

In [29]:
# Run the Wikipedia chain; the result holds the "answer" and the retrieved "docs".
chain.invoke("How fast are cheetahs?")

{'answer': 'Cheetahs are capable of running at speeds between 93 to 104 km/h (58 to 65 mph). They have evolved specialized adaptations for speed, including a light build, long thin legs, and a long tail, making them the fastest land animals.',
 'docs': [Document(page_content='The cheetah (Acinonyx jubatus) is a large cat and the fastest land animal. It has a tawny to creamy white or pale buff fur that is marked with evenly spaced, solid black spots. The head is small and rounded, with a short snout and black tear-like facial streaks. It reaches 67–94 cm (26–37 in) at the shoulder, and the head-and-body length is between 1.1 and 1.5 m (3 ft 7 in and 4 ft 11 in). Adults weigh between 21 and 72 kg (46 and 159 lb). The cheetah is capable of running at 93 to 104 km/h (58 to 65 mph); it has evolved specialized adaptations for speed, including a light build, long thin legs and a long tail.\nThe cheetah was first described in the late 18th century. Four subspecies are recognised today that are

Now that we can retrieve information from Wikipedia, let's try to cite which documents we're using.

OpenAI expects tools to be described in JSON Schema format, which LangChain can derive from a Pydantic (v1) object. So we create a Pydantic model as a template for the answer.

In [30]:
from langchain_core.pydantic_v1 import BaseModel, Field

# Schema for the structured answer the model is asked to produce via tool calling.
# NOTE(review): the lowercase class name is also referenced by string
# (tool_choice="cited_answer" and key_name="cited_answer" in later cells), so
# renaming it would require updating those strings. The docstring and the
# Field descriptions are presumably sent to the model as part of the tool
# schema, so treat them as prompt text rather than ordinary documentation.
class cited_answer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    # Free-text answer; `...` marks the field as required.
    answer: str = Field(
        ...,
        description="The answer to the user question, which is based only on the given sources.",
    )
    # Integer IDs of the sources backing the answer; also required.
    citations: List[int] = Field(
        ...,
        description="The integer IDs of the SPECIFIC sources which justify the answer.",
    )

Let's see the output of the model

In [31]:
# Bind the cited_answer schema to the model as a tool; tool_choice names that
# tool so the model is directed to call it.
llm_with_tool = llm.bind_tools(
    [cited_answer],
    tool_choice="cited_answer",
)
# Toy question with three numbered sources; only sources 1 and 3 mention the
# facts needed to work out Brian's height.
example_q = """What Brian's height?

Source: 1
Information: Suzy is 6'2"

Source: 2
Information: Jeremiah is blonde

Source: 3
Information: Brian is 3 inches shorted than Suzy"""
# Display the raw model message, including its tool call.
llm_with_tool.invoke(example_q)

AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_atgfrZZqUV4BIMGdLtLernXO', 'function': {'arguments': '{"answer":"Brian\'s height is 6\'2\\" - 3 inches.","citations":[1,3]}', 'name': 'cited_answer'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 150, 'total_tokens': 174}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs': None}, id='run-21b7e1f6-3435-411b-87cc-81bfc1f57b08-0', tool_calls=[{'name': 'cited_answer', 'args': {'answer': 'Brian\'s height is 6\'2" - 3 inches.', 'citations': [1, 3]}, 'id': 'call_atgfrZZqUV4BIMGdLtLernXO'}])

Let's add the output parser

In [32]:
from langchain.output_parsers.openai_tools import JsonOutputKeyToolsParser

# Parser configured for the "cited_answer" tool call; with first_tool_only=True
# it yields a single arguments dict (see the output below) rather than a list.
output_parser = JsonOutputKeyToolsParser(key_name="cited_answer", first_tool_only=True)
# Same example as before, now reduced to the parsed arguments.
(llm_with_tool | output_parser).invoke(example_q)

{'answer': 'Brian\'s height is 6\'2" - 3 inches.', 'citations': [1, 3]}

Now we can create the full chain

In [33]:
def format_docs_with_id(docs: List[Document]) -> str:
    """Render documents as titled snippets, each tagged with a source ID.

    The source ID is the document's zero-based position in ``docs``. Each
    entry lists the ID, the title from ``metadata['title']`` and the
    ``page_content``. Entries are separated by a blank line, and the result
    also starts with a blank line.
    """
    entries = []
    for source_id, doc in enumerate(docs):
        heading = f"Source ID: {source_id}\nArticle Title: {doc.metadata['title']}"
        entries.append(f"{heading}\nArticle Snippet: {doc.page_content}")
    gap = "\n\n"
    return gap + gap.join(entries)

# Same layout as the earlier Wikipedia chain, but the context now carries
# source IDs and the answer step returns the parsed cited_answer arguments.
format_1 = itemgetter("docs") | RunnableLambda(format_docs_with_id)
answer_1 = prompt | llm_with_tool | output_parser
chain_1 = (
    RunnableParallel(question=RunnablePassthrough(), docs=wiki)
    .assign(context=format_1)
    .assign(cited_answer=answer_1)
    .pick(["cited_answer", "docs"])
)

In [34]:
# Run the citing chain; the cited IDs index into the returned "docs" list (zero-based).
chain_1.invoke("How fast are cheetahs?")

{'cited_answer': {'answer': 'Cheetahs can run at speeds of 93 to 104 km/h (58 to 65 mph). They are the fastest land animals.',
  'citations': [0]},
 'docs': [Document(page_content='The cheetah (Acinonyx jubatus) is a large cat and the fastest land animal. It has a tawny to creamy white or pale buff fur that is marked with evenly spaced, solid black spots. The head is small and rounded, with a short snout and black tear-like facial streaks. It reaches 67–94 cm (26–37 in) at the shoulder, and the head-and-body length is between 1.1 and 1.5 m (3 ft 7 in and 4 ft 11 in). Adults weigh between 21 and 72 kg (46 and 159 lb). The cheetah is capable of running at 93 to 104 km/h (58 to 65 mph); it has evolved specialized adaptations for speed, including a light build, long thin legs and a long tail.\nThe cheetah was first described in the late 18th century. Four subspecies are recognised today that are native to Africa and central Iran. An African subspecies was introduced to India in 2022. It is