In [1]:
import time
import json
import numpy as np
import chromadb
import ollama
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import tiktoken

In [2]:

PDF_PATH = "/Users/avneetsoni/Desktop/ds4300/DS4300--Practical-2/module3-6.pdf"
CHUNK_SIZES = [500]
CHUNK_OVERLAPS = [50]
EMBEDDING_MODEL = "hkunlp/instructor-xl" 
TOP_K = 3



In [3]:
chroma_client = chromadb.HttpClient(host="localhost", port=8000)  
collection = chroma_client.get_or_create_collection(name="notes")
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

In [4]:

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    reader = PdfReader(pdf_path)
    text = "\n".join([page.extract_text() or "" for page in reader.pages])
    return text


def chunk_text(text, chunk_size, overlap):
    """Chunk text into smaller pieces with overlapping."""
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)

    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        chunks.append(encoding.decode(chunk))

    return chunks


def embed_and_store_chunks(chunks, chunk_size, overlap):
    """Generate embeddings and store them in ChromaDB."""
    embeddings = embedding_model.encode(chunks, normalize_embeddings=True)

    # Store in ChromaDB
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        collection.add(
            ids=[f"chunk_{chunk_size}_{overlap}_{i}"],
            documents=[chunk],
            embeddings=[embedding.tolist()],
            metadatas=[{"chunk_size": chunk_size, "overlap": overlap}],
        )


def retrieve_relevant_chunks(query, top_k=TOP_K):
    """Retrieve the top K most relevant chunks from ChromaDB."""
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)[0]

    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k,
    )

    return results["documents"][0] if results["documents"] else []

def generate_response(model,context, query):
    """Generate a response using Llama 2 (local Ollama)."""
    prompt = f"Given the context:\n{context}\n\nAnswer the following question:\n{query}"
    
    response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
    
    return response["message"]["content"]

In [5]:

if __name__ == "__main__":
    
    text = extract_text_from_pdf(PDF_PATH)
    print(f"Extracted {len(text)} characters from PDF\n")

    for chunk_size in CHUNK_SIZES:
        for overlap in CHUNK_OVERLAPS:
            print(f"Processing chunk_size={chunk_size}, overlap={overlap}")
            chunks = chunk_text(text, chunk_size, overlap)

            start_time = time.time()
            embed_and_store_chunks(chunks, chunk_size, overlap)
            print(f" - Stored {len(chunks)} chunks in ChromaDB in {time.time() - start_time:.2f} sec\n")


Extracted 31374 characters from PDF

Processing chunk_size=500, overlap=50


RuntimeError: MPS backend out of memory (MPS allocated: 8.72 GB, other allocations: 656.00 KB, max allowed: 9.07 GB). Tried to allocate 669.39 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Query Testing
query = "In the context of a relational database system, what is a transaction?"
retrieved_chunks = retrieve_relevant_chunks(query)

print("\nRetrieved Context Chunks:")
for chunk in retrieved_chunks:
    print(f" - {chunk[:200]}...\n")

# Generate response using Llama 2
if retrieved_chunks:
    response = generate_response("llama2","\n".join(retrieved_chunks), query)
    print("\nLlama 2 Response:\n", response)


In [None]:
# Generate response using Mistral
if retrieved_chunks:
    response = generate_response("mistral","\n".join(retrieved_chunks), query)
    print("\nMistral Response:\n", response)