In [4]:
import os
from urllib.request import urlretrieve
import numpy as np

In [1]:
import sentence_transformers

  from tqdm.autonotebook import tqdm, trange


In [21]:
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    urlretrieve(url, file_path)

In [10]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

ImportError: cannot import name 'BaseModel' from 'langchain_core.pydantic_v1' (/Users/rahulpandey/miniconda3/envs/rag_app/lib/python3.12/site-packages/langchain_core/pydantic_v1/__init__.py)

In [11]:
# Load pdf files in the local directory
loader = PyPDFDirectoryLoader("./us_census/")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]

Document(metadata={'source': 'us_census/acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American')

In [12]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 63 documents loaded, with average characters equal to 3830.
After split, there were 400 documents (chunks), with average characters equal to 618 (average chunk length).


In [15]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

AttributeError: module 'pyarrow.lib' has no attribute 'ListViewType'

In [None]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-5.05340844e-02 -1.07632130e-02 -3.22085731e-02  4.64025103e-02
  5.17264977e-02  7.26015121e-02 -1.94752458e-02  3.01965866e-02
 -1.02138430e-01 -1.42446375e-02  6.02175705e-02  6.26361147e-02
 -2.24275328e-02 -3.16375382e-02 -1.75427068e-02  2.35421062e-02
 -1.67436078e-02 -2.35973727e-02 -4.86445986e-02  3.57785933e-02
 -3.60899046e-02  4.26089019e-02 -3.19934711e-02 -5.33744358e-02
  2.33906023e-02 -6.23915438e-03 -2.61042211e-02  2.67631840e-02
 -5.50710373e-02 -1.58236027e-01  1.29768243e-02  2.72521414e-02
 -5.12634777e-02 -1.78340636e-02  1.00316564e-02 -2.32964708e-03
 -1.63029530e-03  5.71662374e-02  5.18324114e-02  4.13078368e-02
 -1.27497045e-02  2.22270768e-02 -2.50609545e-03 -1.75849367e-02
 -3.96118164e-02  5.91322547e-03 -3.97015363e-02  3.12928762e-03
  6.95993239e-03 -4.72988933e-02  4.16573696e-02 -3.69540006e-02
  5.37371971e-02  7.53688887e-02  5.36738820e-02 -2.20242292e-02
  1.54242162e-02 -1.95293054e-02 -2.51309127e-02  1

In [None]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)


In [None]:
query = """What were the trends in median household income across
           different states in the United States between 2021 and 2022."""  
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

in 2022 was $74,755, according 
Figure 1.
Median Household Income in the Past 12 Months in the United States: 2005–2022
 
Note: Estimates for 2020 experimental data not shown. For more information on the 2020 experimental data products, 
refer to <www.census.gov/programs-surveys/acs/technical-documentation/user-notes/2021-02.html>. Information on conﬁdentiality protection, sampling error, nonsampling error, and deﬁnitions is available at <www.census.gov/acs>.
Source: U.S. Census Bureau, 2005–2022 American Community Survey, 1-year estimates.Recession
/zero.tab/five.tab/five.tab/six.tab/zero.tab/six.tab/five.tab/seven.tab/zero.tab/seven.tab/five.tab/eight.tab/zero.tab


In [8]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})


In [14]:
from langchain_huggingface import HuggingFaceEndpoint

hf = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"max_length":2000},
    huggingfacehub_api_token='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')

query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""  # Sample question, change to other questions you are interested in.
hf.invoke(query)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/rahulpandey/.cache/huggingface/token
Login successful


In [13]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300},
    use_auth_token='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'
)

llm = hf 
llm.invoke(query)

KeyboardInterrupt: 