In [1]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [2]:
# Load the PDFs found under ./us_census (path is relative to the working directory).
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
final_documents = text_splitter.split_documents(documents)

# Display the first chunk as a quick sanity check.
final_documents[0]

Document(metadata={'source': 'us_census/acsbr-016.pdf', 'page': 0}, page_content='Poverty in States and Metropolitan  \nAreas: 2022\nAmerican Community Survey Briefs\nDecember 2023ACSBR-016By Craig Benson\nINTRODUCTION\nPlanners, policymakers, and community stakeholders \nuse poverty estimates as key indicators to evaluate trends and current economic conditions within com-munities and to make comparisons across demo-graphic groups. Federal and state governments often \nuse these estimates to allocate funds to local com-\nmunities. Government agencies, researchers, and local organizations regularly use these estimates to identify the number of individuals and families eligible for vari-ous programs and to measure economic well-being.\nThis brief uses the 2021 and 2022 American \nCommunity Survey (ACS) 1-year estimates and the \n2021 and 2022 Puerto Rico Community Surveys')

In [3]:
# Number of chunks in final_documents (the output of text_splitter.split_documents).
len(final_documents)

316

In [7]:
## Embedding using Huggingface
# Embedding model: BAAI/bge-small-en-v1.5, placed on the CPU, with
# normalize_embeddings enabled for encoding.
# Alternative model noted by the author: sentence-transformers/all-MiniLM-l6-v2
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

In [8]:
import numpy as np

# Embed the first chunk a single time and reuse the vector for both printouts,
# so the embedding model is not run twice on identical text.
first_chunk_embedding = np.array(
    huggingface_embeddings.embed_query(final_documents[0].page_content)
)
print(first_chunk_embedding)        # the embedding values
print(first_chunk_embedding.shape)  # the embedding dimensionality

[-1.00630485e-02 -3.98545228e-02 -2.14798469e-02  5.86761236e-02
  8.02909210e-02  5.45422100e-02 -1.89205129e-02 -3.84313846e-03
 -5.06586283e-02 -9.22432356e-03  5.20436242e-02  2.59992722e-02
 -1.59694124e-02 -1.32009629e-02 -3.79713103e-02 -2.48674881e-02
 -4.57807034e-02  5.27276881e-02  2.56091040e-02  1.23642124e-02
  5.47024012e-02 -9.55433398e-03 -7.26953149e-02 -1.91431139e-02
  5.93196191e-02 -3.60356569e-02 -1.51837972e-04 -5.13967909e-02
 -4.05045077e-02 -1.59561858e-01  6.18918333e-03  2.72261701e-03
  4.80102040e-02 -6.29635667e-03 -3.37925646e-03 -4.26316224e-02
  3.80310751e-02  5.58205806e-02  8.53929203e-03  5.33878878e-02
 -4.03700920e-04 -2.85844561e-02 -1.75250061e-02 -3.33270766e-02
 -6.52636960e-02 -6.25039861e-02 -1.69884041e-02 -3.36702913e-03
 -5.80814518e-02 -1.87588278e-02  3.09326667e-02 -1.79333501e-02
  1.11762872e-02  4.28264700e-02  4.17709276e-02  2.02438403e-02
  9.16246418e-03 -3.49306874e-02 -2.00626347e-02  4.42196541e-02
  3.16111892e-02  9.09481

In [9]:
## VectorStore Creation
# NOTE: only the first 120 chunks are indexed, not all of final_documents,
# so searches can only return text from that subset.
indexed_chunks = final_documents[:120]
vectorstore = FAISS.from_documents(indexed_chunks, huggingface_embeddings)

In [10]:
## Query using Similarity Search
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments = vectorstore.similarity_search(query)

# Print the text of the first chunk returned by the search.
first_match = relevant_docments[0]
print(first_match.page_content)

The American Community Survey (ACS) is a nationwide survey designed to provide communities with reliable and timely demographic, social, economic, and housing data for the nation, states, congressional districts, counties, places, and other localities every year. It has an annual sample size of about 3.5 million 
addresses across the United States and Puerto Rico and includes 
both housing units and group quarters (e.g., nursing facilities and prisons)
.1 The ACS is conducted in every county throughout the 
nation and every municipio in Puerto Rico, where it is called the Puerto Rico Community Survey. Beginning in 2006, ACS 1-year data have been released annually for geographic areas with populations of 65,000 and greater. For information on the ACS sample design and 
other topics, visit <www.census.gov/acs>.
1 While people living in group quarters are sampled in the ACS, those living in 
institutional group quarters (e.g., nursing homes or correctional facilities) are not


In [11]:
# Expose the vector store as a retriever using similarity search with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f03643fc9b0> search_kwargs={'k': 3}


In [25]:
import os
from dotenv import load_dotenv

# Load variables from a .env file, then read the Hugging Face token from the
# HUGGINGFACE_API_KEY environment variable (None when it is not set).
load_dotenv()
huggingfacehub_api_token = os.environ.get('HUGGINGFACE_API_KEY')

In [26]:
from langchain_community.llms import HuggingFaceHub

# LLM client for the mistralai/Mistral-7B-v0.1 repo, authenticated with the
# token read from the environment.
hf = HuggingFaceHub(
    huggingfacehub_api_token=huggingfacehub_api_token,
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs=dict(temperature=0.1, max_length=500),
)

# Ask the model directly, without any retrieved context.
query = "What is the health insurance coverage?"
hf.invoke(query)

'What is the health insurance coverage?\n\nThe health insurance coverage is a contract between the insurer and the insured. The insurer agrees to pay the insured’s medical expenses up to a certain amount, and the insured agrees to pay the premiums.\n\nWhat are the benefits of health insurance?\n\nThe benefits of health insurance are many. It can help you pay for medical expenses, protect you from financial ruin, and give you peace of mind.\n\nWhat are the different types of health insurance'

In [28]:
#Hugging Face models can be run locally through the HuggingFacePipeline class.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# NOTE: this rebinds `hf`; from here on it refers to the local pipeline.
hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    # Request deterministic (greedy) decoding explicitly. A temperature of 0 is
    # not a valid sampling temperature and is ignored when sampling is off,
    # so `do_sample=False` states the intent without triggering a warning.
    pipeline_kwargs={"do_sample": False, "max_new_tokens": 300},
)

llm = hf
llm.invoke(query)

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

In [None]:
# Prompt text with {context} and {question} placeholders. The leading newline
# and the trailing "\n " are part of the template.
prompt_template = (
    "\n"
    "Use the following piece of context to answer the question asked.\n"
    "Please try to provide the answer only based on the context\n"
    "\n"
    "{context}\n"
    "Question:{question}\n"
    "\n"
    "Helpful Answers:\n"
    " "
)

In [None]:
# Wrap the template string in a PromptTemplate with its two input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [None]:
# Build a "stuff"-type RetrievalQA chain from the LLM, the retriever and the
# custom prompt; source documents are returned alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=prompt),
    return_source_documents=True,
)

In [None]:
# Three-line query text; the line breaks are part of the string.
query = "\n".join(
    [
        "DIFFERENCES IN THE",
        "UNINSURED RATE BY STATE",
        "IN 2022",
    ]
)

In [None]:

# Run the QA chain on the query and print the 'result' entry of its output.
result = retrievalQA.invoke({"query": query})
answer_text = result['result']
print(answer_text)