In [None]:
pip install -U pypdf langchain-community

In [None]:
pip install -U langchain-text-splitters

In [None]:
pip install -U langchain-classic

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings 
from langchain_core.prompts import PromptTemplate

from langchain_classic.chains import RetrievalQA



In [None]:
## Read the PDFs from the folder
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Fail early with an actionable message: an empty load would otherwise only
# surface later as an opaque IndexError when the first chunk is displayed.
if not documents:
    raise ValueError(
        "No PDF pages were loaded from './us_census'; "
        "check that the folder exists and contains PDF files."
    )

# Split the loaded pages into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
final_documents = text_splitter.split_documents(documents)

# Pages without extractable text produce no chunks, so guard the split result too.
if not final_documents:
    raise ValueError(
        "The PDFs in './us_census' produced no text chunks; "
        "check that they contain extractable text."
    )

# Display the first chunk as a quick sanity check.
final_documents[0]

In [None]:
# Number of chunks produced by the text splitter.
len(final_documents)

In [None]:
pip install sentence-transformers

In [None]:
## Embedding using Hugging Face
# Loads the BAAI/bge-small-en-v1.5 model on the CPU and asks the encoder to
# normalize the embeddings it returns.
# An alternative model is sentence-transformers/all-MiniLM-L6-v2.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(device="cpu"),
    model_name="BAAI/bge-small-en-v1.5",
)



In [None]:
import numpy as np

# Embed the text of the first chunk and display the vector as a NumPy array.
first_chunk_text = final_documents[0].page_content
np.array(huggingface_embeddings.embed_query(first_chunk_text))

In [None]:
## Vector store creation
# Only the first 120 chunks are embedded and indexed in FAISS.
vectorstore = FAISS.from_documents(
    documents=final_documents[:120],
    embedding=huggingface_embeddings,
)

In [None]:
## Query using similarity search
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments = vectorstore.similarity_search(query)

# Print the text of the first document returned by the search.
first_hit = relevant_docments[0]
print(first_hit.page_content)

In [None]:
# Expose the vector store as a retriever that uses similarity search with k=3.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)
print(retriever)

In [None]:
import os
from dotenv import load_dotenv

# Load the variables defined in the local .env file into the process environment.
load_dotenv()

# Fail fast with an actionable message when the token is missing or empty.
# Copying a missing value (None) into os.environ would instead raise an
# opaque TypeError, and re-assigning a value that is already set is a no-op.
if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your .env file "
        "or export it before running this notebook."
    )

In [None]:
pip install -U langchain-huggingface

In [None]:
pip install -U httpx huggingface_hub langchain-huggingface

In [None]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEndpoint

# 1. Load the .env file and read the Hugging Face API token from the environment
load_dotenv()
sec_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# 2. Model repository to query
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"

# 3. Set up the LLM endpoint with the task and provider pinned explicitly.
# NOTE(review): the author reports that this avoids a "conversational" task
# mismatch error — confirm against the provider's current behavior.
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=sec_token,
    task="text-generation",      # plain text generation is requested for this instruct model
    provider="hf-inference",     # requests are routed to the "hf-inference" provider
    max_new_tokens=500,
    temperature=0.1,
    repetition_penalty=1.1
)

# 4. Wrap the question in [INST] ... [/INST] tags by hand.
# NOTE(review): presumably the endpoint does not apply the model's chat
# template for this task — confirm.
# `query` is rebound here and is reused by the local-pipeline cell below.
query = "What is the health insurance coverage?"
formatted_prompt = f"<s>[INST] {query} [/INST]"

# Smoke-test the endpoint. Any exception is caught and printed rather than
# re-raised, so a failing call does not stop the cells that follow.
try:
    response = llm.invoke(formatted_prompt)
    print(f"Success! Response: \n{response}")
except Exception as e:
    print(f"Still an error? Let's try one last backup model: {e}")

In [None]:
# Hugging Face models can also be run locally through the HuggingFacePipeline class.
# Import it from langchain_huggingface (already used above for HuggingFaceEndpoint);
# the copy in langchain_community.llms.huggingface_pipeline is deprecated.
from langchain_huggingface import HuggingFacePipeline

# Build a local text-generation pipeline for the Mistral-7B-v0.1 model.
# NOTE(review): temperature=0 presumably only matters when sampling is
# enabled in the underlying pipeline — confirm.
hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300}
)

# Rebind `llm` to the local pipeline and run the current `query` through it.
llm = hf
llm.invoke(query)

In [None]:
# Prompt text for the question-answering chain. {context} and {question} are
# the two placeholders declared as input variables when the PromptTemplate
# is built from this string.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [None]:
# Build the prompt object from the template text and its two placeholders.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [None]:
# Assemble the retrieval QA chain: the local pipeline LLM, the vector-store
# retriever, and the custom prompt, using the "stuff" chain type. Source
# documents are returned alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [None]:
# Question for the QA chain; the text spans three lines joined by newlines.
query = (
    "DIFFERENCES IN THE\n"
    "UNINSURED RATE BY STATE\n"
    "IN 2022"
)

In [None]:
# Run the QA chain on the current query and print the generated answer text.
qa_inputs = {"query": query}
result = retrievalQA.invoke(qa_inputs)
print(result["result"])