In [59]:
!pip install lark

Collecting lark
  Downloading lark-1.1.7-py3-none-any.whl (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m717.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.1.7


In [48]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

from langchain.memory import ConversationBufferMemory



In [2]:
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

# Loading of PDF assets

The load of PDF documents is used to insert the data in the langflow

In [3]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("../inputs/Dissertação_inglês_10_09_23_pt2.pdf")
doc = loader.load()

Each page is a `Document`.

A `Document` contains text (`page_content`) and `metadata`.

In [4]:
len(doc)

87

In [5]:
page = doc[22]
print(page.page_content[0:500])

Acronyms list
3GPP3rd Generation Partnership Project
ACFAutoCorrelation Function
AIArtificial Intelligence
ARIMA AutoRegressive Moving Average
BSBase Station
BSsBase Stations
CDRCall Detail Records
CNCore Network
DFTDiscrete Fourier Transform
ETSIEuropean Telecommunications Standards Institute
GDPR General Data Protection Regulation


In [6]:
print(page.metadata)

{'source': '../inputs/Dissertação_inglês_10_09_23_pt2.pdf', 'page': 22}


# Creation of a Chroma Vectorstore

A vectorstore is a object that stores text data in a format that allows easy retrieval of information. This data is the context (text that stores information about the problem) embedded using a embedding. In this case, we are going to use OpenAI embedding.

The first step is to **split** the document in batches that allows the insertion of the data in the context input of the model

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [8]:
splits = text_splitter.split_documents(doc)

In [9]:
len(splits)

122

Now it is time to make **embeddings** of the information using the embedding of the model

In [10]:
!rm -Rf /Users/patrick/Documents/Pessoal/paper-rag/chroma/

embedding = OpenAIEmbeddings()
persist_directory = '/Users/patrick/Documents/Pessoal/paper-rag/chroma/'

In [11]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [12]:
print(vectordb._collection.count())

122


# Testing some retrieval searches

In [13]:
question = "what means NFV?"
docs_ssearch = vectordb.similarity_search(question,k=3)  # Simple similarity search
print(docs_ssearch[0].page_content)

usagesofNFVintheindustryand, accordingtoEuropeanTelecommunicationsStandards
Institute (ETSI) (ETSI, 2013), some advantages of network virtualization that stand out
are:
1. NFV as a service: a NFV can be provided as a service by a network operator similar
to cloud computing services (RANKOTHGE et al., 2015);
2. Virtualization of Core Network (CN) and BSs (BASTA et al., 2014);
3. Virtualization of the home environment: installation of new equipment and on-site
technical support can be less frequent (BRONSTEIN; SHRAGA, 2014);


In [14]:
doc_mmr = vectordb.max_marginal_relevance_search(question,k=2, fetch_k=3)  # MMR is a technique that tries to get 
print(doc_mmr[0].page_content)

usagesofNFVintheindustryand, accordingtoEuropeanTelecommunicationsStandards
Institute (ETSI) (ETSI, 2013), some advantages of network virtualization that stand out
are:
1. NFV as a service: a NFV can be provided as a service by a network operator similar
to cloud computing services (RANKOTHGE et al., 2015);
2. Virtualization of Core Network (CN) and BSs (BASTA et al., 2014);
3. Virtualization of the home environment: installation of new equipment and on-site
technical support can be less frequent (BRONSTEIN; SHRAGA, 2014);


# Testing metadata self-query retriever

In [15]:
metadata_field_info = [
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

In [16]:
document_content_description = "Lecture notes"
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True
)

In [19]:
question = "what means NFV?"

In [21]:
docs = retriever.get_relevant_documents(question)

for d in docs:
    print(d.metadata)
    print(d.page_content)
    print('\n')



query='NFV' filter=None limit=None
{'source': '../inputs/Dissertação_inglês_10_09_23_pt2.pdf', 'page': 53}
usagesofNFVintheindustryand, accordingtoEuropeanTelecommunicationsStandards
Institute (ETSI) (ETSI, 2013), some advantages of network virtualization that stand out
are:
1. NFV as a service: a NFV can be provided as a service by a network operator similar
to cloud computing services (RANKOTHGE et al., 2015);
2. Virtualization of Core Network (CN) and BSs (BASTA et al., 2014);
3. Virtualization of the home environment: installation of new equipment and on-site
technical support can be less frequent (BRONSTEIN; SHRAGA, 2014);


{'source': '../inputs/Dissertação_inglês_10_09_23_pt2.pdf', 'page': 54}
4.2. The predictive model in the 5G infrastructure 41
4. Virtualization of CDNs (MANGILI; MARTIGNON; CAPONE, 2014; KIM; LEE,
2014).
The main barrier of this new approach is the overall performance, especially in middle-
boxchains. Sometrafficcouldflowthroughvariousmiddleboxesbasedontheirne

# Final solution: compression method

Ref: https://blog.langchain.dev/improving-document-retrieval-with-contextual-compression/

In [23]:
# Wrap our vectorstore
llm = OpenAI(temperature=0)  # Model used
compressor = LLMChainExtractor.from_llm(llm)  # uses an LLMChain to extract from each document only the statements that are relevant to the query.

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")  # Retriever method using MMR retriavel to ensure variability in the answer
)

In [24]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [29]:
question = "Is the model trained on Milan or Trento data?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

Milan, Trento, Telecom-munications dataset in Milan, Novem-ber 1st, 2013 and December 31st, 2013, 10,000zonal regions listed in the city
----------------------------------------------------------------------------------------------------
Document 2:

"The optimizer used was Adamax, which is based on Adam (KINGMA; BA, 2017). The loss function was the Mean Squared Error (MSE) and 80% of the entire dataset was used to train, with the other 20% used as test."


# Making a Q&A chatbot to talk with PDF data

In [31]:
# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you can't make a answer with context, just say that you don't know, don't try to make up an answer. 
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [84]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=compression_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [85]:
question = "which database is recommended to be used and why"
result = qa_chain({"query": question})
print(result["result"])



 It depends on the specific needs of the project. Each of the databases mentioned (Cassandra, HBase, and Redis) have different strengths and weaknesses, so it is important to consider the specific needs of the project before deciding which one to use.


# Creating memory

In [38]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [76]:
# Run chain
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=compression_retriever,
    # retriever=vectordb.as_retriever(),
    memory=memory,
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

ValidationError: 1 validation error for ConversationalRetrievalChain
chain_type_kwargs
  extra fields not permitted (type=value_error.extra)

In [72]:
question = "Which databases are recommended by the author?"
result = qa({"question": question})
print(result['answer'])

 The author does not recommend any specific databases, they just mention that the database used is publicly available.


In [75]:
question = "Which is the publicly available database mentioned by the author?"
result = qa({"question": question})
print(result['answer'])

 I don't know.
