In [4]:
!pip install langchain_community
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.17-py3-none-any.whl (867 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m867.6/867.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)
Installing collected packages: langchain-text-splitters, langchain
Successfully installed langchain-0.1.17 langchain-text-splitters-0.0.1


In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA

## Read the PDFs

In [7]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m174.1/290.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


In [4]:
# Read every PDF found under ./us_census into LangChain Document objects.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Break the loaded documents into overlapping chunks: up to 1000 characters
# each, with 200 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
final_documents = text_splitter.split_documents(documents)

# Show the first chunk as a sanity check.
final_documents[0]

Document(page_content='Poverty in States and Metropolitan  \nAreas: 2022\nAmerican Community Survey Briefs\nDecember 2023ACSBR-016By Craig Benson\nINTRODUCTION\nPlanners, policymakers, and community stakeholders \nuse poverty estimates as key indicators to evaluate trends and current economic conditions within com-munities and to make comparisons across demo-graphic groups. Federal and state governments often \nuse these estimates to allocate funds to local com-\nmunities. Government agencies, researchers, and local organizations regularly use these estimates to identify the number of individuals and families eligible for vari-ous programs and to measure economic well-being.\nThis brief uses the 2021 and 2022 American \nCommunity Survey (ACS) 1-year estimates and the \n2021 and 2022 Puerto Rico Community Surveys', metadata={'source': 'us_census/acsbr-016.pdf', 'page': 0})

In [5]:
# Number of chunks the splitter produced from the loaded PDFs.
len(final_documents)

316

## Embedding Using HuggingFace

In [11]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [6]:
# Embedding model used for both indexing and querying.
# Alternative model id: sentence-transformers/all-MiniLM-L6-v2
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},  # run the encoder on CPU
    encode_kwargs={"normalize_embeddings": True},  # request normalized vectors
)

In [7]:
import numpy as np

# Embed the text of the first chunk and inspect the dimensionality of the vector.
sample_vector = huggingface_embeddings.embed_query(final_documents[0].page_content)
np.array(sample_vector).shape

(384,)

## Vector Store Creation

In [16]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 85.5/85.5 MB 8.0 MB/s eta 0:00:00
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [8]:
# Build the FAISS index. Note that only the first 120 chunks are embedded and
# indexed here, not the full list in `final_documents`.
chunks_to_index = final_documents[:120]
vector_db = FAISS.from_documents(chunks_to_index, huggingface_embeddings)

## Query using Similarity Search

In [9]:
# Query the vector store directly and print the text of the best-matching chunk.
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments = vector_db.similarity_search(query)

top_match = relevant_docments[0]
print(top_match.page_content)

2 U.S. Census Bureau
WHAT IS HEALTH INSURANCE COVERAGE?
This brief presents state-level estimates of health insurance coverage 
using data from the American Community Survey (ACS). The  
U.S. Census Bureau conducts the ACS throughout the year; the 
survey asks respondents to report their coverage at the time of 
interview. The resulting measure of health insurance coverage, 
therefore, reflects an annual average of current comprehensive 
health insurance coverage status.* This uninsured rate measures a 
different concept than the measure based on the Current Population 
Survey Annual Social and Economic Supplement (CPS ASEC). 
For reporting purposes, the ACS broadly classifies health insurance 
coverage as private insurance or public insurance. The ACS defines 
private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.


In [10]:
# Wrap the vector store as a retriever that uses similarity search with k=3.
retriever = vector_db.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x79b4996f9240> search_kwargs={'k': 3}


## Hugging Face hub

In [11]:
import os
from getpass import getpass

# Never hard-code the token in the notebook. Reuse a token that is already
# exported in the environment; otherwise prompt for it without echoing.
# (Unconditionally assigning "" would wipe a valid token and break Hub auth.)
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

In [12]:
from langchain_community.llms import HuggingFaceHub

# Falcon-7B accessed through the HuggingFaceHub wrapper.
hub_generation_settings = {"temperature": 0.1, "max_length": 500}
hf = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b",
    model_kwargs=hub_generation_settings,
)

# Ask the bare model (no retrieved context) as a baseline.
query = "What is the health insurance coverage?"
hf.invoke(query)

  warn_deprecated(


'What is the health insurance coverage?\nHealth insurance is a contract between the insured and the insurance company. The insured pays a premium to the insurance company, and the insurance company agrees to pay for the insured’s medical expenses.\nWhat is the difference between health insurance and health care?\nHealth insurance is a contract between the insured and the insurance company. The insured pays a premium to the insurance company, and the insurance company agrees to pay for the insured’s medical expenses.\nHealth care is the actual services provided to the'

#### Run it locally using Hugging Face pipeline

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Alternative to the hosted model: load Falcon-7B into a local text-generation
# pipeline. The recorded output shows the checkpoint shards (9.95G + 4.48G)
# being downloaded. NOTE: this rebinds `hf`, the name the QA chain uses as its LLM.
local_generation_settings = {"temperature": 0, "max_new_tokens": 300}
hf = HuggingFacePipeline.from_model_id(
    task="text-generation",
    model_id="tiiuae/falcon-7b",
    pipeline_kwargs=local_generation_settings,
)
llm = hf

query = "What is the health insurance coverage?"
llm.invoke(query)

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders filled in when the template is formatted.
prompt_template = (
    "\n"
    "Use the following piece of context to answer the question asked.\n"
    "Please try to provide the answer only based on the context\n"
    "\n"
    "{context}\n"
    "Question:{question}\n"
    "\n"
    "Helpful Answers:\n"
    " "
)

In [13]:
# Wrap the raw template string, declaring the two variables it expects.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [14]:
# Assemble the retrieval-augmented QA chain: the retriever supplies context,
# the "stuff" chain type is used with the custom prompt, and `hf` is the LLM.
# Source documents are returned alongside the answer.
qa_chain_options = {"prompt": prompt}
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_options,
    return_source_documents=True,
)

In [15]:
# Three-line query text, written with explicit newlines.
query = "DIFFERENCES IN THE\nUNINSURED RATE BY STATE\nIN 2022"

In [16]:
# Run the retrieval QA chain on the query and print the generated text.
# `result` is indexed by key; "result" holds the model's output.
chain_input = {"query": query}
result = retrievalQA.invoke(chain_input)
print(result["result"])


Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

comparison of ACS and CPS ASEC measures 
of health insurance coverage, refer to < www.
census.gov/topics/health/health-insurance/
guidance.html >.
9 Respondents may have more than one 
health insurance coverage type at the time 
of interview. As a result, adding the total 
number of people with private coverage and 
the total number with public coverage will 
sum to more than the total number with any 
coverage.• From 2021 to 2022, nine states 
reported increases in private 
coverage, while seven reported 
decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, uninsured rates at the 
time of interview ranged across 
states from a low of 2.4 percent 
in Massachusetts to a high of 16.6 
percent in Texas, compared to the 
national rate of 8.0 percent.10 Ten 
of the 15 states with uninsured 
10 The uninsured rates in the Distr