In [1]:
# !pip uninstall tensorflow tensorflow-gpu keras tf-keras -y
# !pip install torch --upgrade
# !pip uninstall torch torchvision torchaudio -y
# !pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu


In [1]:
import os
os.environ["USE_TF"] = "0"

In [None]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [4]:


os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    urlretrieve(url, file_path)

In [3]:

# Load pdf files in the local directory
loader = PyPDFDirectoryLoader("./us_census/")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

In [8]:

docs_after_split[0]

Document(metadata={'source': 'us_census\\acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American')

In [6]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 63 documents loaded, with average characters equal to 3830.
After split, there were 400 documents (chunks), with average characters equal to 618 (average chunk length).


In [7]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

  from tqdm.autonotebook import tqdm, trange


In [9]:

sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 3.62204537e-02  2.08621677e-02  6.65481165e-02  6.49365457e-03
  6.40591085e-02  1.03430480e-01 -2.49341540e-02 -1.88159272e-02
 -8.36599097e-02  4.00242731e-02 -2.08016913e-02  1.11279204e-01
 -6.59805676e-03 -7.69476071e-02  2.79085655e-02 -5.11180609e-03
 -1.75802298e-02 -2.69326493e-02 -2.69206204e-02  6.55195266e-02
  1.30409105e-02  4.62137796e-02 -1.29380506e-02  4.29106876e-02
 -8.32871068e-03  8.16423632e-03  6.18163608e-02 -2.42102332e-02
  5.30794170e-03  5.58978058e-02  4.98897173e-02  3.47684845e-02
 -7.17514846e-03  1.98417250e-02  2.15069186e-02 -8.55922177e-02
 -4.41659279e-02  2.56719831e-02 -5.36422618e-02  2.86921728e-02
 -3.75566781e-02 -8.61203521e-02 -7.37674832e-02  9.83738378e-02
  2.79736686e-02  1.51086235e-02 -6.12781942e-02  7.05660805e-02
 -5.04604913e-03  5.60576953e-02  2.26505883e-02 -2.16067303e-02
  3.39269862e-02 -3.91952991e-02  4.61026803e-02 -3.05536259e-02
 -8.71737674e-03 -1.78834759e-02 -1.64443906e-02  4

In [10]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [11]:

query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""  # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).


In [12]:

# Use similarity searching algorithm and return 3 most relevant documents.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

#### load full percision Model from HF using Lang chain

In [None]:
# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# from huggingface_hub import login

# login(token="")

# hf = HuggingFacePipeline.from_model_id(
#     model_id="mistralai/Mistral-7B-v0.1",
#     task="text-generation",
#     pipeline_kwargs={"temperature": 0, "max_new_tokens": 300}
# )

# llm = hf

#### Quantization Applied - Need GPU

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

# Login to Hugging Face
login(token="")

model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # Use instruct version to avoid gating issues

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",   # Normalized float4
    bnb_4bit_compute_dtype="float16"  # Compute in float16 for performance
)

# Load model & tokenizer in 4-bit
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create HuggingFace pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=0,
    max_new_tokens=300
)

# Wrap in LangChain LLM
hf = HuggingFacePipeline(pipeline=pipe)
llm = hf

#### Loading Quantized Model directly 

In [1]:
# from langchain_community.llms import CTransformers

# # Load 4-bit quantized Mistral (GGUF format) from HuggingFace
# llm = CTransformers(
#     model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",  # 4-bit GGUF model repo
#     model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf", # 4-bit file
#     config={
#         "max_new_tokens": 300,
#         "temperature": 0.0,
#         "context_length": 2048,
#         "threads": 8  # Adjust based on your CPU cores
#     }
# )

# # Test
# print(llm("What is the capital of France?"))

In [14]:
# from langchain_community.llms import CTransformers

# # Load directly from local path
# llm = CTransformers(
#     model="./models/mistral-7b-instruct-v0.2.Q4_K_M.gguf",  # Path to local file
#     config={
#         "max_new_tokens": 300,
#         "temperature": 0.0,
#         "context_length": 2048,
#         "threads": 8
#     }
# )

# # Test inference
# print(llm("Explain quantum computing in simple terms."))

#### Load Quantized Model


In [None]:
from huggingface_hub import snapshot_download
from langchain_community.llms import CTransformers
import os

# Local models folder
local_dir = "./models/mistral-7b-instruct-v0.2"

# Download the model into local_dir (only if not already downloaded)
if not os.path.exists(local_dir):
    snapshot_download(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        local_dir=local_dir,
        allow_patterns=["*.gguf"],  # Only download GGUF files
        resume_download=True
    )

# Path to the 4-bit GGUF model file
model_path = os.path.join(local_dir, "mistral-7b-instruct-v0.2.Q4_K_M.gguf")

# Load with CTransformers from local path
llm = CTransformers(
    model=model_path,
    config={
        "max_new_tokens": 300,
        "temperature": 0.0,
        "context_length": 2048,
        "threads": 8
    }
)

# Test
print(llm("What is the capital of France?"))


  print(llm("What is the capital of France?"))




The capital city of France is Paris. It is one of the most famous cities in the world and is known for its art, fashion, gastronomy, culture, and landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and Montmartre. Paris has been a major political and cultural center of Europe since the 13th century and continues to be an important city today. It is also home to many international organizations including UNESCO and the OECD.


In [16]:
llm.invoke(query)

" I couldn't find a definitive answer to this question with a quick search, but I did find some data from the U.S. Census Bureau that may be helpful.\n\nAccording to the Census Bureau's annual estimate of median household income for states and metropolitan areas, median household income increased in 36 states between 2021 and 2022. The largest percentage increases were in Hawaii (5.4%), New Mexico (4.9%), and Utah (4.8%).\n\nOn the other hand, median household income decreased in 14 states: Connecticut (-3.6%), Delaware (-2.7%), Illinois (-1.8%), Maryland (-1.5%), Massachusetts (-1.3%), Michigan (-0.9%), New Jersey (-0.8%), New York (-0.7%), Pennsylvania (-0.4%), Rhode Island (-0.4%), Vermont (-0.2%), Virginia (-0.1%), West Virginia (-0.1%), and Wyoming (-0.1%).\n\nIt's important to note that these estimates are subject to revision as more data becomes available, so they should be considered preliminary. Additionally, the Census Bureau does not provide a comprehensive list of all state

In [16]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [17]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
# Call the QA chain with our query.
result = retrievalQA.invoke({"query": query})
print(result['result'])

Five states - Alabama, Alaska, Delaware, Florida, and Utah - experienced a statistically significant increase in real median household income from 2021 to 2022. Conversely, 17 states showed a decrease. The median household income in 28 states, the District of Columbia, and Puerto Rico remained unchanged from the previous year. These findings are based on data from the U.S. Census Bureau's "Current Population Survey," available at www.census.gov/programs-surveys/acs. The overall U.S. median household income declined 0.8 percent to $74,755 in 2022 after adjusting for inflation. Additionally, the Gini Index, a measure of income inequality, increased by 4.7 percent from 0.464 to 0.486 during this period.


In [18]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: us_census\acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
----------------------------------------------------------------------------------