# RAG Application - POC

### Enter your query

In [29]:
query_text = input("Ask anything about our final year project")
query_text

'o que dia a GED 13?'

### Import LangChain libraries

!pip install langchain-community
!pip install nltk
!pip install scikit-learn
!pip install langchain chromadb pypdf openai sentence-transformers accelerate


In [30]:
from langchain_community.document_loaders import PyPDFLoader # type: ignore
from langchain_text_splitters import RecursiveCharacterTextSplitter # type: ignore
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama

### Import utility functions

In [31]:
from keyword_generator import extract_keywords
from db import get_db_collection, add_to_collection, query_collection

## Load pdf document and load it into Vector Database

In [32]:
file_path = (
    "docs/GED-13-3.pdf"
)
loader = PyPDFLoader(file_path)
document = loader.load()
print("No. of pages in the document:", len(document))

No. of pages in the document: 132


#### Split pages into chunks of texts

In [33]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunked_documents = text_splitter.split_documents(document)

#### Prepare data for indexing
- Generate Unique Id for individual chunks
- Generate keywords for metadata using NLP

In [34]:
contents = []
ids = []
keywords = []

page_no = 0
c_index = -1
for index, doc in enumerate(chunked_documents):
    metadata = doc.metadata
    source = metadata['source'].replace('/','-').replace('.','-')

    if metadata['page'] > page_no:
        c_index = 0
    else:
        c_index += 1

    page_no = metadata['page']
    
    chunk_id = f"{source}-p{page_no}-c{c_index}"

    contents.append(doc.page_content)
    ids.append(chunk_id)
    keywords.append(extract_keywords(doc.page_content))
    print("Processed chunk:", chunk_id)

Processed chunk: docs-GED-13-3-pdf-p0-c0
Processed chunk: docs-GED-13-3-pdf-p0-c1
Processed chunk: docs-GED-13-3-pdf-p0-c2
Processed chunk: docs-GED-13-3-pdf-p0-c3
Processed chunk: docs-GED-13-3-pdf-p0-c4
Processed chunk: docs-GED-13-3-pdf-p0-c5
Processed chunk: docs-GED-13-3-pdf-p0-c6
Processed chunk: docs-GED-13-3-pdf-p1-c0
Processed chunk: docs-GED-13-3-pdf-p1-c1
Processed chunk: docs-GED-13-3-pdf-p1-c2
Processed chunk: docs-GED-13-3-pdf-p1-c3
Processed chunk: docs-GED-13-3-pdf-p1-c4
Processed chunk: docs-GED-13-3-pdf-p2-c0
Processed chunk: docs-GED-13-3-pdf-p2-c1
Processed chunk: docs-GED-13-3-pdf-p2-c2
Processed chunk: docs-GED-13-3-pdf-p3-c0
Processed chunk: docs-GED-13-3-pdf-p3-c1
Processed chunk: docs-GED-13-3-pdf-p3-c2
Processed chunk: docs-GED-13-3-pdf-p4-c0
Processed chunk: docs-GED-13-3-pdf-p4-c1
Processed chunk: docs-GED-13-3-pdf-p5-c0
Processed chunk: docs-GED-13-3-pdf-p5-c1
Processed chunk: docs-GED-13-3-pdf-p5-c2
Processed chunk: docs-GED-13-3-pdf-p6-c0
Processed chunk:

### Create a collection in Chroma DB

In [35]:
COLLECTION_NAME = "my_project"
collection = get_db_collection(COLLECTION_NAME)

metadata = [{"tags": ", ".join(i) } for i in keywords]
add_to_collection(collection, contents, ids, metadata)

Insert of existing embedding ID: docs-GED-13-3-pdf-p0-c0
Insert of existing embedding ID: docs-GED-13-3-pdf-p0-c1
Insert of existing embedding ID: docs-GED-13-3-pdf-p0-c2
Insert of existing embedding ID: docs-GED-13-3-pdf-p0-c3
Insert of existing embedding ID: docs-GED-13-3-pdf-p0-c4
Insert of existing embedding ID: docs-GED-13-3-pdf-p0-c5
Insert of existing embedding ID: docs-GED-13-3-pdf-p0-c6
Insert of existing embedding ID: docs-GED-13-3-pdf-p1-c0
Insert of existing embedding ID: docs-GED-13-3-pdf-p1-c1
Insert of existing embedding ID: docs-GED-13-3-pdf-p1-c2
Insert of existing embedding ID: docs-GED-13-3-pdf-p1-c3
Insert of existing embedding ID: docs-GED-13-3-pdf-p1-c4
Insert of existing embedding ID: docs-GED-13-3-pdf-p2-c0
Insert of existing embedding ID: docs-GED-13-3-pdf-p2-c1
Insert of existing embedding ID: docs-GED-13-3-pdf-p2-c2
Insert of existing embedding ID: docs-GED-13-3-pdf-p3-c0
Insert of existing embedding ID: docs-GED-13-3-pdf-p3-c1
Insert of existing embedding ID

Documents loaded to DB


### Chunks retreived from the DB

In [36]:
query_result = query_collection(collection, query_text)
query_result

{'ids': [['docs-GED-13-3-pdf-p126-c2',
   'docs-GED-13-3-pdf-p111-c1',
   'docs-GED-13-3-pdf-p1-c0']],
 'distances': [[0.34087724049164814, 0.40453037184368423, 0.4205980896949768]],
 'metadatas': [[{'tags': 'ged, de, citações, item, para'},
   {'tags': '12, gaiola, indução, ou, 15'},
   {'tags': '37, 38, aterramento, carga, cálculo'}]],
 'embeddings': None,
 'documents': [['deverá ser evitado para evitar problemas de acesso.  \n2.21  04/05/18  - Revisão do GED 13 para adequação nos prazos de exigência do DPS nas \nDistribuidoras do Grupo CPFL Energia conforme item 8.3.    \n2.22  29/06/2018  - Retirada todas citações de RGE Sul.  \n- Retirada as citações das caixas descontinuadas GED – 4138, GED \n– 4139, GED – 4142, GED – 13768, GED – 4017, GED – 4018, GED – 4019, \nGED – 4020, GED – 4021, GED – 4022, GED – 4023, GED – 4024, GED – \n4025, GED – 4026, GED – 4027, GED – 5787, GED – 12903, GED – 12904, \nGED –  \n12905, GED – 12906, GED – 12907, GED – 12908, GED – 12909, GED –  \n12910,

### Prepare final prompt to give to LLM model

In [37]:
text = ""
for doc in query_result['documents']:
    for i in doc:
        text += i

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "answer in PT-BR"
    "\n\n"
    "{context}"
).format(context=text)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
final_prompt = prompt.format(input=query_text)
final_prompt

"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.answer in PT-BR\n\ndeverá ser evitado para evitar problemas de acesso.  \n2.21  04/05/18  - Revisão do GED 13 para adequação nos prazos de exigência do DPS nas \nDistribuidoras do Grupo CPFL Energia conforme item 8.3.    \n2.22  29/06/2018  - Retirada todas citações de RGE Sul.  \n- Retirada as citações das caixas descontinuadas GED – 4138, GED \n– 4139, GED – 4142, GED – 13768, GED – 4017, GED – 4018, GED – 4019, \nGED – 4020, GED – 4021, GED – 4022, GED – 4023, GED – 4024, GED – \n4025, GED – 4026, GED – 4027, GED – 5787, GED – 12903, GED – 12904, \nGED –  \n12905, GED – 12906, GED – 12907, GED – 12908, GED – 12909, GED –  \n12910, GED – 12911, GED – 12912, GED – 12913, GED – 12914, GED – \n12915  \n- Retirada a citação dos padrões de poste GED 42

### Connect to local LLM, I'm using phi-3 from Microsoft

In [38]:
llm = Ollama(
    model="phi3:mini",
    keep_alive=-1,
    format="json"
)

### Final output from the LLM using the context

In [39]:
llm.invoke(final_prompt)

'{}\n\n\n\n\n   \n  \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'