In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA

In [2]:
from typing import final


loader = PyPDFDirectoryLoader("./census")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000 , chunk_overlap = 200)
final_docs = text_splitter.split_documents(documents)
final_docs[0]

Document(metadata={'source': 'census/acsbr-017.pdf', 'page': 0}, page_content='KEY DEFINITIONS\nHousehold income: Includes income of the \nhouseholder and all other people 15 years and older in the household, whether or not they are related to the householder.\nMedian: The point that divides the household \nincome distribution into halves, one half with income above the median and the other with income below the median. The median is based on the income distribution of all households, including those with no income.\nGini index: A summary measure of income \ninequality. The Gini index varies from 0 to 1, with 0 indicating perfect equality, where there is a proportional distribution of income. A Gini index of 1 indicates perfect inequality, where one household has all the income.Household Income in States and \nMetropolitan Areas: 2022\nAmerican  Community S urvey  Briefs\nBy Kirby G. Posey\nACSBR-017\nDecember 2023\nINTRODUCTION\nThis brief presents statistics on median household')

In [3]:
len(final_docs)

316

In [5]:
hF_embeddings = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-small-en-v1.5", 
                                         model_kwargs={"device": "cpu"}, 
                                         encode_kwargs={'normalize_embeddings' : True})

In [6]:
import numpy as np
np.array(hF_embeddings.embed_query(final_docs[0].page_content)).shape

(384,)

In [7]:
vector_stores = FAISS.from_documents(final_docs[:120] , hF_embeddings)

In [8]:
query = "What is health insurance coverage?"
relevant_docs = vector_stores.similarity_search(query)
print(relevant_docs[0].page_content)

detailed estimates of income and 
to measure change in national-
level estimates. The CPS ASEC 
is the official source of national 
poverty estimates. For more infor -
mation from the CPS ASEC about 
national income estimates, refer to 
the report “ Income in the United 
States: 2022 .”
For information on income esti -
mates from the ACS and how they 
differ from those based on the 
CPS ASEC, refer to “ Fact Sheet: 
Differences Between the American 
Community Survey and the Annual 
Social and Economic Supplement 
to the Current Population Survey  
(CPS ASEC) .”
WHAT IS THE AMERICAN COMMUNITY SURVEY?
The American Community Survey (ACS) is a nationwide survey designed to provide reliable and timely 
demographic, social, economic, and housing data for the nation, states, congressional districts, counties, 
places, and other localities every year. It has an annual sample size of about 3.5 million addresses across


In [9]:
retriever = vector_stores.as_retriever(search_type = "similarity" , search_kwargs = {"k" : 3})
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x79a75c73cd70> search_kwargs={'k': 3}


In [11]:
import os
from dotenv import load_dotenv
load_dotenv()
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv("HUGGINGFACEHUB_API_KEY")

Python-dotenv could not parse statement starting at line 1
Python-dotenv could not parse statement starting at line 7


In [12]:
from langchain_community.llms import HuggingFaceHub

hf=HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature":0.1,"max_length":500}
)

hf.invoke(query)

'What is health insurance coverage?\n\nHealth insurance is a type of insurance coverage that covers the cost of an insured individual’s medical and surgical expenses. Depending on the type of health insurance coverage, either the insured pays costs out-of-pocket and is then reimbursed, or the insurer makes payments directly to the provider.\n\nHealth insurance is usually provided by a government-sponsored social insurance program, a private commercial insurer, or an employer.\n\nWhat is'

In [20]:
prompt_template = """Use the given context to answer the questions. Provide answers only based in the context
{context}
Questions : {question}
Answers:
"""

In [22]:
prompt = PromptTemplate(template = prompt_template , input_variables = ["context" , "question"])

In [23]:
retrievalQA = RetrievalQA.from_chain_type(
    llm = hf,
    chain_type = "stuff",
    retriever = retriever,
    return_source_documents = True,
    chain_type_kwargs = {"prompt" : prompt}
)

In [24]:
query = """ DIfferences in the uninsured rate by state in 2022"""

In [26]:
result = retrievalQA.invoke(query)
print(result['result'])

Use the given context to answer the questions. Provide answers only based in the context
as being “near poverty” in this 
brief.
Table 1 displays the percentage of 
people in near poverty in 2022 for 
the nation, states, the District of 
Columbia, and Puerto Rico (refer 
to Appendix Table 3 for 2021 esti -
mates). In 2022, the percentage of 
people in the United States in near 
poverty was 3.7 percent, not statis -
tically different from 2021.
Among the states, the percentage 
of individuals in near poverty in 
2022 ranged from 2.4 percent to 
17 In 2022, the percentages of people with 
an income-to-poverty ratio below 50 percent 
in the Minneapolis (4.1 percent), Washington,  
DC (4.2 percent), and Denver (4.3 percent) 
MSAs were not statistically different.
18 In 2022, the percentages of people 
with an income-to-poverty ratio below 50 
percent in the Houston (6.8 percent), Detroit 
(6.7 percent), New York (6.4 percent), Los 
Angeles (6.3 percent), and San Antonio 
(6.2 percent) MSAs