In [9]:
# ✅ Final Working Example: Prevent Duplicate Processing Using File Hash

import os
import warnings
import hashlib
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA

# 🔇 Suppress warnings
warnings.filterwarnings("ignore")

# 🔐 Set your Hugging Face API token
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "<<TOKEN>>"

# 📂 Folder where PDFs are stored
folder_path = "source/"

# 📁 Where to persist vectorstore
persist_directory = "./chroma_docling_db"

# 🧠 Initialize embedding and LLM models
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", task="text-generation", model_kwargs={"temperature": 0.3, "max_new_tokens": 512})

# 🔧 Initialize Docling document converter
converter = DocumentConverter()

# 🔧 File hash function
def get_file_hash(filepath):
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# 🧩 Chunking logic
def get_chunk_list(document):
    chunker = HybridChunker(max_tokens=200, overlap_tokens=50)
    return list(chunker.chunk(dl_doc=document))

# 🧱 Convert Docling chunks to LangChain documents with file hash
def safe_metadata(idx, chunk, file_hash):
    heading = chunk.meta.headings[0] if chunk.meta.headings else ""
    filename = getattr(chunk.meta.origin, "filename", "unknown.pdf")
    page_numbers = str(
        list({prov.page_no for item in chunk.meta.doc_items for prov in item.prov})
    )
    return {
        "chunk_id": idx,
        "heading": heading or "",
        "filename": filename or "unknown",
        "file_hash": file_hash,
        "page_numbers": page_numbers
    }

# 💾 Store LangChain documents into Chroma vector DB
def store_vector_db(documents):
    vectorstore = Chroma.from_documents(documents, embedding_model, persist_directory=persist_directory)
    vectorstore.persist()
    print("✅ Data stored successfully!")

# 🔎 Check if file is already processed by file_hash
def is_file_already_stored(vectorstore, file_hash):
    result = vectorstore._collection.get(where={"file_hash": file_hash})
    return len(result["ids"]) > 0

# 🔁 Loop over PDFs and process if new
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        source = os.path.join(folder_path, filename)
        file_hash = get_file_hash(source)

        if is_file_already_stored(vectorstore, file_hash):
            print(f"⚠️ Skipping already-processed file (hash matched): {filename}")
            continue

        try:
            print(f"📄 Processing: {filename}")
            result = converter.convert(source)
            document = result.document
            chunks = get_chunk_list(document)

            long_chain_docs = [
                Document(
                    page_content=chunk.text,
                    metadata=safe_metadata(idx, chunk, file_hash)
                )
                for idx, chunk in enumerate(chunks)
            ]

            store_vector_db(long_chain_docs)

        except Exception as e:
            print(f"❌ Failed to process {filename}: {e}")

# 🧠 Load vectorstore for retrieval
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# 🔍 Ask question interactively
qa_chain = RetrievalQA.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

query = input("💬 Enter your question: ")
result = qa_chain(query)

print("\n🧠 Answer:", result['result'])
print("\n📄 Source Documents:")
for doc in result['source_documents']:
    print("-", doc.page_content.strip())


⚠️ Skipping already-processed file (hash matched): 1728565027059.pdf
⚠️ Skipping already-processed file (hash matched): Application_Agreement_.pdf
⚠️ Skipping already-processed file (hash matched): aws-data-engineer-resume-example (1).pdf
⚠️ Skipping already-processed file (hash matched): Bank Declaration Page 1.pdf
⚠️ Skipping already-processed file (hash matched): Document.pdf
⚠️ Skipping already-processed file (hash matched): K-5th - 2025 - GT Program Referral Flyer.pdf
⚠️ Skipping already-processed file (hash matched): Leasing_Policy_.pdf
⚠️ Skipping already-processed file (hash matched): Lesson Overview_LLM_GEN_AI.pdf
⚠️ Skipping already-processed file (hash matched): LLM_2.pdf
⚠️ Skipping already-processed file (hash matched): marriage_certificate.pdf
⚠️ Skipping already-processed file (hash matched): USA1749351208.pdf


💬 Enter your question:  what is data?



🧠 Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
Data Collection: Use large datasets from various sources, including internet scrapes and specialized corpora. Self-Supervised Learning: Model learns language patterns and structures. Compute Resources: Requires significant compute power, often GPUs. Data Quality: Only a small percentage (1-3%) of collected tokens are used after quality processing.

Context:
Data Collection: Use large datasets from various sources, including internet scrapes and specialized corpora. Self-Supervised Learning: Model learns language patterns and structures. Compute Resources: Requires significant compute power, often GPUs. Data Quality: Only a small percentage (1-3%) of collected tokens are used after quality processing.

Context:
Data Collection: Use large datasets from various sources, including internet scrapes and special