In [47]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceEndpoint


In [4]:
!pip install pypdf
loader = PyPDFLoader("./census/acsbr-015.pdf")
doc = loader.load()



In [13]:
!pip install sentence-transformers

text_split = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
document = text_split.split_documents(doc)
embedding = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",      #sentence-transformers/all-MiniLM-l6-v2
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings':True}
)
vector = FAISS.from_documents(doc[:50], embedding)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [49]:
# Expose the FAISS index through a retriever object; this is what both QA
# chains further down receive as their `retriever` argument.
retriever = vector.as_retriever()

In [43]:

# Read the Hugging Face API token from the environment instead of writing it
# into the notebook, so the credential is never saved or shared with the file.
hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not hf_api_token:
    raise RuntimeError(
        "Set the HUGGINGFACEHUB_API_TOKEN environment variable before running this cell."
    )

# Hosted text-generation endpoint used by every chain in this notebook.
llm = HuggingFaceEndpoint(
    task="text-generation",
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    temperature=0.7,
    # NOTE(review): one recorded answer below is cut off mid-sentence; it looks
    # like `max_length` may not be the setting that controls output length for
    # this endpoint (possibly `max_new_tokens`) -- confirm against the API docs.
    model_kwargs={
        "max_length":500
    },
    huggingfacehub_api_token=hf_api_token,
)

In [44]:
# Smoke-test the endpoint on its own (no retrieval involved) before it is
# wired into a chain; the cell's last expression is displayed as the output.
sanity_question = "who is the president of india"
llm.invoke(sanity_question)



" in 2021-2022?\n\nAs of 2021, the President of India is Ram Nath Kovind. He was elected on July 19, 2017, and his term ends on July 24, 2022. However, the President of India for the year 2022-2023 will be elected in the summer of 2022. The President is the head of state in India, while the Prime Minister is the head of government. The President's role is largely ceremonial, but they do have some important powers, such as the power to veto certain bills passed by Parliament."

In [79]:
# Prompt text for the RetrievalQA chain. It has two placeholders:
# {context} and {question}.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

# Wrap the raw text in a PromptTemplate, declaring the two placeholders it uses.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [80]:
# Build the RetrievalQA chain from the LLM, the FAISS retriever and the custom
# prompt; source documents are requested alongside the generated answer.
qa_chain_options = {"prompt": prompt}
retrieval = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_options,
    return_source_documents=True,
)

In [81]:
# Query text (presumably a section heading taken from the PDF -- confirm);
# it is reused by the second chain further down.
query="""DIFFERENCES IN THE
UNINSURED RATE BY STATE
IN 2022"""

# Run the RetrievalQA chain and print only the generated answer text.
qa_request = {"query": query}
result = retrieval.invoke(qa_request)
generated_answer = result['result']
print(generated_answer)



1. Seven states had a decrease in uninsured rate in 2022 that was driven by increases in private coverage. These states were Florida, Kansas, Mississippi, North Carolina, Ohio, South Carolina, and Texas.

2. For seven states, the uninsured rate decrease was related to increases in public coverage with no corresponding change in the level of private coverage. These states were Alabama, California, Georgia, Illinois, Indiana, Michigan, and Oklahoma.

3. In three states (Missouri, New York, and Virginia), it was shifts in coverage from private to public that contributed to the decline in their uninsured rates.

4. Massachusetts had the lowest uninsured rate and Texas had the highest in 2022.

5. In 2022, Utah had the highest private coverage and lowest public coverage rate, while New Mexico had the highest public coverage and lowest private coverage rate.

6. Across the 25 most populous metropolitan areas, Boston-Cambridge-Newton, MA-NH had the lowest uninsured rate in 2022, while three m

In [82]:
# Prompt for the create_retrieval_chain pipeline. Its placeholders are
# {context} and {input}, unlike the {question} placeholder of the earlier prompt.
context_input_template = """
    Use the following context to answer the input question.
    Context: {context}
    Question: {input}
    Answer:"""
prmpt = PromptTemplate(
    template=context_input_template,
    input_variables=["context", "input"],
)

In [83]:
# First build the chain that combines documents with the prompt for the LLM,
# then put the retriever in front of it.
document_chain = create_stuff_documents_chain(llm=llm, prompt=prmpt)
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [84]:
# Ask the same query through the second chain; this chain takes its question
# under the "input" key and the printed text is read from the "answer" key.
chain_request = {"input": query}
result = retrieval_chain.invoke(chain_request)
print(result["answer"])



 In 2022, the uninsured rate in Texas was 16.6%.
