In [30]:
from langchain_community.document_loaders import PyPDFDirectoryLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [2]:
# Read every PDF under ./census into LangChain Document objects
# (each one carries its source path and page number in `metadata`).
loader = PyPDFDirectoryLoader("./census")
docs = loader.load()

# Cut the documents into chunks of at most 1000 characters, with a
# 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
final_doc = text_splitter.split_documents(docs)

# Display the first chunk as a quick sanity check.
final_doc[0]

Document(page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3', metadata={'source': 'census\\acsbr-015.pdf', 'page': 0})

In [3]:
# Number of chunks produced by the splitter.
len(final_doc)

296

In [11]:
# Embedding model: a small English BGE checkpoint, run on the CPU.
model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = dict(device="cpu")
# Ask the encoder to normalise every embedding vector it returns.
encode_kwargs = dict(normalize_embeddings=True)

hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
import numpy as np

# Embed a single chunk and inspect the dimensionality of the resulting vector.
sample_embedding = hf.embed_query(final_doc[0].page_content)
np.array(sample_embedding).shape

(384,)

In [15]:
# Build the FAISS index from the first 100 chunks only (not the whole of
# `final_doc`), embedding each chunk with `hf`.
indexed_chunks = final_doc[:100]
db = FAISS.from_documents(indexed_chunks, embedding=hf)

In [17]:
# Smoke-test the index: fetch the 3 chunks most similar to a sample question.
# (The question was previously misspelled as "insurannce", which changes the
# text that gets embedded.)
query = "what is health insurance coverage?"
rel_doc = db.similarity_search(query, k=3)
rel_doc

[Document(page_content='2 U.S. Census Bureau\nWHAT IS HEALTH INSURANCE COVERAGE?\nThis brief presents state-level estimates of health insurance coverage \nusing data from the American Community Survey (ACS). The  \nU.S. Census Bureau conducts the ACS throughout the year; the \nsurvey asks respondents to report their coverage at the time of \ninterview. The resulting measure of health insurance coverage, \ntherefore, reflects an annual average of current comprehensive \nhealth insurance coverage status.* This uninsured rate measures a \ndifferent concept than the measure based on the Current Population \nSurvey Annual Social and Economic Supplement (CPS ASEC). \nFor reporting purposes, the ACS broadly classifies health insurance \ncoverage as private insurance or public insurance. The ACS defines \nprivate health insurance as a plan provided through an employer \nor a union, coverage purchased directly by an individual from an \ninsurance company or through an exchange (such as healthca

In [19]:
# Expose the vector store as a retriever that returns the 3 most similar chunks.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001A63DDB1390>, search_kwargs={'k': 3})

In [21]:
import os
from dotenv import load_dotenv

# The Hugging Face API token is a secret: it must be supplied through the
# environment or a local, git-ignored .env file containing
# HUGGINGFACEHUB_API_TOKEN=<token> -- never as a literal in the notebook.
load_dotenv()

# Fail fast with a clear message instead of a confusing authentication error
# later, when the LLM is first called.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export it or add it to a .env file."
    )

In [26]:
from langchain_community.llms import HuggingFaceHub

# Hosted Mistral-7B served through the Hugging Face Hub.
# "temperature" was previously misspelled as "tmeperature", so the intended
# low-randomness setting was not passed under the right parameter name.
hf_hub = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature": 0.1, "max_length": 500},
)

# Baseline: query the bare LLM directly, without any retrieved context.
query = "what is health insurance coverage?"
hf_hub.invoke(query)

'what is health insurance coverage?\n\nHealth insurance is a type of insurance coverage that covers the cost of an individual’s medical and surgical expenses. It is a contract between an individual and an insurance company where the individual agrees to pay a premium and the insurance company agrees to pay for covered medical expenses.\n\nHealth insurance can be obtained through an employer, through a government program such as Medicare or Medicaid, or through a private insurance company. The type of health insurance coverage an individual has will depend on their'

In [27]:
# Raw prompt text for the QA chain. {context} and {question} are the two
# placeholders declared as input variables when this string is wrapped in a
# PromptTemplate.
prompt = """
Use the following piece of context to answer the question asked.
Provide output only on the basis of provided context
{context}
Question: {question}

Helpful Answers:
"""

In [28]:
# Wrap the raw prompt string in a PromptTemplate. Note this rebinds `prompt`
# from a str to a PromptTemplate, so the cell expects `prompt` to still be the
# raw string when it runs.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt,
)

In [32]:
# Assemble the retrieval-augmented QA chain: `retriever` supplies the context
# chunks, the custom `prompt` combines them with the question, and `hf_hub`
# generates the answer.
qa_chain_kwargs = {"prompt": prompt}
retriever_QA = RetrievalQA.from_chain_type(
    llm=hf_hub,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=True,  # keep the retrieved chunks in the result
)

In [33]:
# Question passed to the RAG chain (the literal begins with a newline).
query = """
Differences in the uninsured rate by state in 2022"""

In [35]:
# Run the chain; the returned dict holds the generated text under "result".
result = retriever_QA.invoke({"query": query})
generated_text = result["result"]
print(generated_text)


Use the following piece of context to answer the question asked.
Provide output only on the basis of provided context
coverage.• From 2021 to 2022, nine states 
reported increases in private 
coverage, while seven reported 
decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, uninsured rates at the 
time of interview ranged across 
states from a low of 2.4 percent 
in Massachusetts to a high of 16.6 
percent in Texas, compared to the 
national rate of 8.0 percent.10 Ten 
of the 15 states with uninsured 
10 The uninsured rates in the District 
of Columbia and Massachusetts were not 
statistically different.rates above the national aver -
age were states that have not 
expanded Medicaid eligibility, and 
two of those 15 states, Oklahoma 
(11.7 percent) and Missouri (8.6 
percent), had recently expanded 
Medicaid eligibility in 2022.11 
Twenty-nine states and the District 
of Columbia had an uninsured 
rate below the national average. 
11 Between