This notebook is meant to:

1. Test loading document text from a Delta table whose contents were scraped from a volume of PDFs.


2. Use Chroma DB to create a local vector database from the document chunks.


3. Use Hugging Face (all-MiniLM-L6-v2) to embed the text and LangChain to pass queries to LLaMA (Llama-2-7b-chat-hf) for response generation.


4. Return a grounded answer by combining the retrieved document context with LLaMA's language capabilities.


In [0]:
%pip install -U transformers accelerate bitsandbytes langchain chromadb sentence-transformers langchain-huggingface 

In [0]:
%pip install --upgrade accelerate
%restart_python

In [0]:
import hashlib
import os

import pandas as pd
import torch
from langchain.llms.base import LLM
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM


In [0]:
# Load Document Chunks from Delta Table
# `df` keeps the raw query result; filtering happens on a separate frame so
# re-running this cell always starts from the same input.
df = spark.sql("SELECT id, text FROM hive_metastore.sr_test.docs_text").toPandas()

# Rows whose text is null or whitespace-only cannot be turned into a useful
# Document (a null page_content is rejected outright), so skip them up front.
has_text = df["text"].notna() & df["text"].astype(str).str.strip().ne("")
docs_df = df.loc[has_text, ["id", "text"]]

# Iterate column-wise instead of with iterrows(): no per-row Series is built,
# and each value is handed over as a plain Python scalar.
documents = [
    Document(page_content=text, metadata={"id": doc_id})
    for doc_id, text in zip(docs_df["id"], docs_df["text"])
]

In [0]:
# Setup Embeddings + Chroma Vector Store
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
persist_dir = "/tmp/chroma_llama_rag"

# The store is persisted on disk, so without explicit ids every re-run of this
# cell would append the whole corpus again under fresh random ids and the
# retriever would start returning duplicates. Deriving each id from the
# document's source id and text makes the insert an upsert instead.
# Identical (id, text) rows collapse to one entry because ids must be unique
# within a single insert.
# NOTE(review): entries written by earlier runs (random ids) or rows since
# removed from the table are not purged; delete `persist_dir` once if needed.
docs_by_hash = {}
for doc in documents:
    fingerprint = f"{doc.metadata.get('id')}\x1f{doc.page_content}"
    doc_hash = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    docs_by_hash.setdefault(doc_hash, doc)

vector_db = Chroma.from_documents(
    documents=list(docs_by_hash.values()),
    embedding=embedding_model,
    ids=list(docs_by_hash.keys()),
    persist_directory=persist_dir,
)
# Plain similarity search returning the 5 closest chunks per query.
retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [0]:
# Load LLaMA 2 from Hugging Face
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Never paste an access token into the notebook. Read it from the HF_TOKEN
# environment variable (e.g. populated from a cluster secret). When the
# variable is unset or empty, `None` is passed so the Hugging Face libraries
# can fall back to locally cached login credentials instead of sending an
# empty token.
token = os.environ.get("HF_TOKEN") or None

tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
# float16 weights with device_map="auto" so placement across the available
# devices is decided at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    token=token,
)

In [0]:
# LangChain LLM Wrapper
class Llama2LLM(LLM):
    """LangChain ``LLM`` adapter around the notebook-level ``model`` and ``tokenizer``.

    Generation is sampled (temperature 0.7, top-p 0.9) and capped at 500 new
    tokens. Only the newly generated text is returned, never the prompt.
    """

    def _call(self, prompt: str, stop=None, run_manager=None, **kwargs) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Fully formatted prompt text.
            stop: Optional list of strings; the completion is cut at the
                earliest occurrence of any of them.
            run_manager: Accepted for compatibility with LangChain's calling
                convention; not used.
            **kwargs: Accepted for compatibility; ignored.

        Returns:
            The generated continuation with special tokens removed and
            surrounding whitespace stripped.
        """
        # If the prompt already begins with the BOS marker (e.g. "<s>"),
        # letting the tokenizer add its own special tokens would yield two.
        bos = tokenizer.bos_token
        add_special_tokens = not (bos and prompt.startswith(bos))

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=add_special_tokens,
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                max_new_tokens=500,
            )

        # For a causal LM the returned sequence is prompt + continuation;
        # decode only the tokens that come after the prompt.
        new_tokens = outputs[0][prompt_length:]
        completion = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        if stop:
            cut_points = [
                position
                for position in (completion.find(marker) for marker in stop if marker)
                if position != -1
            ]
            if cut_points:
                completion = completion[: min(cut_points)].strip()

        return completion

    def invoke(self, input: str, config=None, *, stop=None, **kwargs) -> str:
        """Run the model on a plain string prompt, bypassing LangChain's wrapper pipeline.

        ``config`` and extra keyword arguments are accepted but ignored.
        """
        return self._call(input, stop=stop)

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "llama-2"

llm = Llama2LLM()



In [0]:
# Prompt Template
def build_llama_prompt(question, docs, max_chars=4000):
    """Assemble a LLaMA 2 chat prompt from a question and retrieved documents.

    The ``page_content`` of every item in ``docs`` is joined with blank lines,
    the joined text is cut to at most ``max_chars`` characters, and the result
    is placed together with the question inside the ``[INST]`` / ``<<SYS>>``
    wrapper.

    Args:
        question: Value inserted under the ``Question:`` heading.
        docs: Iterable of objects exposing a ``page_content`` string.
        max_chars: Slice bound applied to the joined context text.

    Returns:
        The formatted prompt string.
    """
    passages = []
    for doc in docs:
        passages.append(doc.page_content)
    context = "\n\n".join(passages)
    context = context[:max_chars]

    system_message = (
        "You are a helpful grants AI assistant. "
        "Use the context below to answer the question clearly and concisely. "
        "Only answer questions that pertain to grants within the document reviewed."
    )

    # One entry per output line; empty strings produce the blank separator lines.
    prompt_lines = [
        "<s>[INST] <<SYS>>",
        system_message,
        "<</SYS>>",
        "",
        "Context:",
        f"{context}",
        "",
        "Question:",
        f"{question}",
        "[/INST]",
    ]
    return "\n".join(prompt_lines)

In [0]:
# Ask a Question
query = "Give me 5 different grant projects with a 3 sentence summary of each."

# Retrieve the supporting chunks, wrap them into a chat prompt, then generate.
relevant_docs = retriever.invoke(query)
prompt = build_llama_prompt(query, relevant_docs)
response = llm.invoke(prompt)

# Show exactly what was sent to the model followed by what came back.
print(
    "=== Prompt Sent ===",
    prompt,
    "\n=== Response ===",
    response,
    sep="\n",
)