This notebook is meant to:

1. Test the input of single PDF file from Databricks DBFS.


2. Use Chroma DB to create a local vector database from the document chunks.


3. Use Hugging Face (all-MiniLM-L6-v2) to embed the text and LangChain to pass queries to LLaMA (Llama-2-7b-chat-hf) for response generation.


4. Return a grounded answer by combining the retrieved document context with LLaMA's language capabilities.   

In [0]:
%pip install \
transformers \
accelerate \
bitsandbytes \
torch \
sentence-transformers \
langchain \
langchain-community \
langchain-huggingface \
chromadb \
unstructured[local-inference] \
pdfminer.six \
tiktoken

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/a9/b6/5257d04ae327b44db31f15cce39e6020cc986333c715660b1315a9724d82/transformers-4.51.3-py3-none-any.whl.metadata
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/63/b1/8198e3cdd11a426b1df2912e3381018c4a4a55368f6d0857ba3ca418ef93/accelerate-1.6.0-py3-none-any.whl.metadata
  Using cached accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Obtaining dependency information for bitsandbytes from https://files.pythonhosted.org/packages/07/b7/cb5ce4d1a382cf53c19ef06c5fc29e85f5e129b4da6527dd207d90a5b8ad/bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata
  Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting torch
  Obtaining dependency information for torch fr

In [0]:
%pip install hf_xet

Collecting hf_xet
  Obtaining dependency information for hf_xet from https://files.pythonhosted.org/packages/a1/de/00b2e2568a39c01b0e013db3300f4d5841f2e597d7b0518923c7881bd166/hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.8 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/53.8 MB[0m [31m48.7 MB/s[0m eta [36m0:00:02[0m
[2K   [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/53.8 MB[0m [31m107.3 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/53.8 MB[0m [31m168.3 MB/s[0m eta [36m0:00:01[0m
[2K   [91m━━━━━━━━━━━━━

In [0]:
%restart_python

In [0]:
# Imports & Config
import os
import shutil
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

config = {
    "DOCS_DIR": "/dbfs/FileStore/uploads/2024lfpp.pdf",
    "CHROMA_PATH": "/tmp/LLM/chroma_db",
    "EMBEDDER_PATH": "sentence-transformers/all-MiniLM-L6-v2",
    "MODEL_NAME": "meta-llama/Llama-2-7b-chat-hf",
    "HF_TOKEN": "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", # Replace with your token
}


In [0]:
# Load and Chunk Documents
def load_docs(path):
    loader = UnstructuredFileLoader(path)
    return loader.load()

def split_docs(documents, chunk_size=300, chunk_overlap=200):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)

documents = load_docs(config["DOCS_DIR"])
docs_chunks = split_docs(documents)
print(f"Loaded {len(documents)} docs, split into {len(docs_chunks)} chunks")

  loader = UnstructuredFileLoader(path)


Loaded 1 docs, split into 514 chunks


In [0]:
# Init Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name=config["EMBEDDER_PATH"])

In [0]:
# Create Chroma Vector Store (Local)
if os.path.exists(config["CHROMA_PATH"]):
    shutil.rmtree(config["CHROMA_PATH"])

vector_db = Chroma.from_documents(
    documents=docs_chunks,
    embedding=embedding_model,
    persist_directory=config["CHROMA_PATH"]
)

retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [0]:
# STEP 6: Load LLaMA 2 Chat Model
from transformers import AutoTokenizer, AutoModelForCausalLM

token = "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"  # Replace with your actual Hugging Face token
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=token
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    token=token
)



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

[W417 14:26:23.380771666 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W417 14:26:23.382287474 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [0]:
# Prompt Builder
def build_prompt(question, docs, max_chars=4000):
    context = "\n\n".join([doc.page_content for doc in docs])[:max_chars]
    return f"""<s>[INST] <<SYS>>
You are an assistant for grants from the usda. You are answering question about usda grants and awards you have data on. If the question is not related to one of these topics, kindly decline to answer. If you don't know the answer, just say that you don't know, don't try to make up an answer. If the question appears to be for a grant you don't have data on, say so.  Keep the answer as concise as possible.  Provide all answers only in English.
Use the following pieces of context to answer the question at the end:
<</SYS>>

Context:
{context}

Question:
{question}
[/INST]"""

In [0]:
# Run a Query
query = "Summarize 5 grant project for lfpp using 3 sentences each."
relevant_docs = retriever.invoke(query)
prompt = build_prompt(query, relevant_docs)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=500
    )
response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# Output
print("=== Prompt ===")
print(prompt)
print("\n=== Response ===")
print(response)


In [0]:
#Checking for GPU compute
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/bin/bash: line 1: nvidia-smi: command not found
