In [None]:
!pip install transformers sentence-transformers langchain==0.3.25 torch faiss-cpu numpy langchain_community pypdf sentence_transformers langchain_huggingface



In [25]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate

# Data loading

In [26]:
# Download documents from U.S. Census Bureau to local directory.
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    urlretrieve(url, file_path)

In [31]:
# Load pdf files in the local directory
loader = PyPDFDirectoryLoader("./us_census/")

docs_before_split = loader.load()
print(len(docs_before_split))

63


# Chunking

In [36]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]

Document(metadata={'producer': 'Adobe PDF Library 16.0.5', 'creator': 'Adobe InDesign 17.1 (Windows)', 'creationdate': '2022-07-21T14:09:01-04:00', 'author': 'U.S. Census Bureau', 'moddate': '2022-07-21T14:55:54-04:00', 'subject': 'Household Economic Studies', 'title': 'Occupation, Earnings, and Job Characeristics', 'trapped': '/False', 'source': 'us_census/p70-178.pdf', 'total_pages': 21, 'page': 0, 'page_label': '1'}, page_content='Occupation, Earnings, and Job \nCharacteristics\nJuly 2022\nP70-178\nClayton Gumber and Briana Sullivan\nCurrent Population Reports\nINTRODUCTION\nWork is a critical component of our lives and provides \na way to obtain material and nonmonetary benefits \nlike employer-provided health insurance. Scholars \nsuggest that our identities are also tied to the notion \nof “what we do” (Christiansen, 1999), and that who \nwe are is determined partly by our occupational iden -\ntity (Skorikov and Vondracek, 2011). However, work \nis time consuming—the American Tim

In [41]:
docs_after_split = docs_after_split[:50]

# Embedding Model Initialization

In [50]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Embedding Model Test

In [43]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-1.86189227e-02  2.46237703e-02 -1.32489745e-02  5.79542480e-02
  2.89789084e-02  1.27435802e-02  8.26062560e-02 -6.73673078e-02
 -2.07576044e-02 -3.87655273e-02 -4.64275852e-02  2.84431707e-02
 -6.16011247e-02  6.79046847e-03 -2.96421666e-02 -2.40446199e-02
  5.89308981e-03 -2.99502574e-02  4.01671603e-02  3.04162363e-03
 -3.02758664e-02  1.95388738e-02  3.94134782e-02  1.10068610e-02
  2.65370253e-02  2.45576035e-02  2.79611629e-02  8.82640574e-03
  2.18598973e-02  6.10523634e-02  8.10193550e-03  3.47977914e-02
  8.98535028e-02  9.96246655e-03 -3.32964286e-02  2.99190897e-02
  8.64695609e-02  4.18684147e-02  7.01388484e-03  5.04911179e-03
 -3.21369171e-02 -6.62606731e-02  1.59508770e-03  2.17842441e-02
 -7.93712735e-02 -1.95213705e-02  3.44601274e-02 -2.71609966e-02
 -5.60447872e-02  5.23194447e-02  1.85130760e-02  6.01829328e-02
  7.26569593e-02 -3.72973792e-02  7.50669092e-02  1.59940422e-02
 -9.33719054e-03 -3.23310681e-03  5.84940845e-03  4

# Embeddings creation & Vector db data injection

In [44]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

# Retriving Chunks

In [45]:
query = """Details of Clayton Gumber and Briana Sullivan"""
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query,k=4)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

Occupation, Earnings, and Job 
Characteristics
July 2022
P70-178
Clayton Gumber and Briana Sullivan
Current Population Reports
INTRODUCTION
Work is a critical component of our lives and provides 
a way to obtain material and nonmonetary benefits 
like employer-provided health insurance. Scholars 
suggest that our identities are also tied to the notion 
of “what we do” (Christiansen, 1999), and that who 
we are is determined partly by our occupational iden -
tity (Skorikov and Vondracek, 2011). However, work 
is time consuming—the American Time Use Survey 
shows that in 2017 workers spent an average 8.21 
hours each day engaged in work and work-related


In [46]:
final_content=""
for i in range(len(relevant_documents)):
  final_content+=relevant_documents[i].page_content
final_content

'Occupation, Earnings, and Job \nCharacteristics\nJuly 2022\nP70-178\nClayton Gumber and Briana Sullivan\nCurrent Population Reports\nINTRODUCTION\nWork is a critical component of our lives and provides \na way to obtain material and nonmonetary benefits \nlike employer-provided health insurance. Scholars \nsuggest that our identities are also tied to the notion \nof “what we do” (Christiansen, 1999), and that who \nwe are is determined partly by our occupational iden -\ntity (Skorikov and Vondracek, 2011). However, work \nis time consuming—the American Time Use Survey \nshows that in 2017 workers spent an average 8.21 \nhours each day engaged in work and work-relatedUnincorporated partnership³  .................  1,603,000  162,900 9.3 0.9\n¹ A margin of error (MOE) is a measure of an estimate’s variability. The larger the MOE in relation to the size of the estimate, the less reli -\nable the estimate. This number, when added to and subtracted from the estimate, forms the 90 percent c

In [47]:
final_prompt = f"""Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{final_content}

Question: {query}

Helpful Answer:
"""
final_prompt

'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nOccupation, Earnings, and Job \nCharacteristics\nJuly 2022\nP70-178\nClayton Gumber and Briana Sullivan\nCurrent Population Reports\nINTRODUCTION\nWork is a critical component of our lives and provides \na way to obtain material and nonmonetary benefits \nlike employer-provided health insurance. Scholars \nsuggest that our identities are also tied to the notion \nof “what we do” (Christiansen, 1999), and that who \nwe are is determined partly by our occupational iden -\ntity (Skorikov and Vondracek, 2011). However, work \nis time consuming—the American Time Use Survey \nshows that in 2017 workers spent an average 8.21 \nhours each day

In [48]:
!pip install groq



# Answer Generation

In [49]:
import os
from groq import Groq
client = Groq(
    # This is the default and can be omitted
    api_key='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'
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": final_prompt,
        }
    ],
    model="llama-3.3-70b-versatile",
)

print(chat_completion.choices[0].message.content)

Clayton Gumber and Briana Sullivan are the authors of the report "Occupation, Earnings, and Job Characteristics" published in July 2022. They are associated with the Current Population Reports. The report provides information about work and its importance in our lives. It covers topics such as the time spent on work and work-related activities, as well as occupational identity. The report is based on data from the U.S. Census Bureau, specifically the 2018 Survey of Income and Program Participation.


In [51]:
!pip freeze

absl-py==1.4.0
accelerate==1.7.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.15
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.1
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.9.0
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
array_record==0.7.2
arviz==0.21.0
astropy==7.1.0
astropy-iers-data==0.2025.6.2.0.38.23
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
backports.tarfile==1.2.0
beautifulsoup4==4.13.4
betterproto==2.0.0b6
bigframes==2.5.0
bigquery-magics==0.9.0
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.0.0
blosc2==3.3.4
bokeh==3.7.3
Bottleneck==1.4.2
bqplot==0.12.45
branca==0.8.1
build==1.2.2.post1
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.4.26
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.2
chex==0.1.89
clarabel==0.11.0
click==8.2.1
cloudpathlib==0.21.1
cloudpickle==3.1.1
cmake==3.31.6
cmdstanpy