Environment Setup:

cell 1:

In [None]:
# %%capture

%pip install -q langchain langchain-core langchain-nvidia-ai-endpoints gradio rich
%pip install -q arxiv pymupdf faiss-cpu langchain-community
%pip install -U langchain-text-splitters

!pip install -qU pennylane pennylane-lightning scikit-learn


from google.colab import userdata

from langchain_nvidia_ai_endpoints import ChatNVIDIA
ChatNVIDIA.get_available_models(api_key=userdata.get("NVIDIA_API_KEY"))

from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

# Shared Rich console for the notebook. `pprint` is `console.print` with a
# bold green (#76B900) base style pre-applied to every call.
console = Console()
base_style = Style(color="#76B900", bold=True)
pprint = partial(console.print, style=base_style)



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.8/49.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2

In [None]:
import sys
import pennylane as qml
import sklearn
import torch

def verify_quantum_env():
    """Print a short health report for the quantum/classical software stack.

    Reports the Python, PennyLane and scikit-learn versions, tries to build a
    4-wire ``default.qubit`` device, and reports the PyTorch version.
    Everything goes to stdout; nothing is returned.
    """
    print("---Quantum Foundation Check ---")

    # Check 1: library versions
    python_version = sys.version.split()[0]
    print(f"Python version: {python_version}")
    print(f"PennyLane (Quantum Lib): {qml.__version__}")
    print(f"Scikit-learn (for PCA): {sklearn.__version__}")

    # Check 2: can we instantiate a small simulator device at all?
    try:
        qml.device("default.qubit", wires=4)
    except Exception as err:
        print(f"❌ Quantum Simulator Error: {err}")
    else:
        print("✅ Quantum Simulator: Operational")

    # Check 3: classical backend
    print(f"✅ PyTorch (Classical backend): {torch.__version__}")
    print("----------------------------------")

verify_quantum_env()

---Quantum Foundation Check ---
Python version: 3.12.12
PennyLane (Quantum Lib): 0.44.0
Scikit-learn (for PCA): 1.8.0
✅ Quantum Simulator: Operational
✅ PyTorch (Classical backend): 2.9.0+cpu
----------------------------------


cell 2:

In [None]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from google.colab import userdata

#NVIDIAEmbeddings.get_available_models()
embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END",api_key=userdata.get("NVIDIA_API_KEY"))

#ChatNVIDIA.get_available_models()
instruct_llm = ChatNVIDIA(model="mistralai/mixtral-8x22b-instruct-v0.1",api_key=userdata.get("NVIDIA_API_KEY"))

In [None]:
from google.colab import userdata
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

def verify_classical_rag():
    """Smoke-test the NVIDIA embedding model and chat model.

    Builds an ``NVIDIAEmbeddings`` client and a ``ChatNVIDIA`` client with the
    ``NVIDIA_API_KEY`` Colab secret, embeds one test sentence and sends one
    test prompt, printing a status line after each step.

    Returns:
        ``(embedder, llm)`` when both calls succeed; ``(None, None)`` after
        printing the error when anything in the sequence raises.
    """
    print("---NVIDIA AI Stack Check ---")

    try:
        # 1. Embedding model + one probe embedding
        embedding_model = NVIDIAEmbeddings(
            model="nvidia/nv-embed-v1",
            truncate="END",
            api_key=userdata.get("NVIDIA_API_KEY"),
        )
        probe_text = "What is a hybrid quantum neural network?"
        probe_vector = embedding_model.embed_query(probe_text)
        print(f"✅ Embedder: Ready (Vector dimension: {len(probe_vector)})")

        # 2. Chat model + one probe completion
        chat_model = ChatNVIDIA(
            model="mistralai/mixtral-8x22b-instruct-v0.1",
            api_key=userdata.get("NVIDIA_API_KEY"),
        )
        reply = chat_model.invoke("Say 'Classical RAG is active'")
        print(f"✅ LLM: Ready (Response: '{reply.content.strip()}')")

    except Exception as err:
        print(f"❌Stack Error: {err}")
        return None, None

    return embedding_model, chat_model

# Run verification and keep models for next steps
embedder, instruct_llm = verify_classical_rag()

---NVIDIA AI Stack Check ---
✅ Embedder: Ready (Vector dimension: 4096)
✅ LLM: Ready (Response: '"Classical RAG is active" - It seems like there's a specific context missing here, but I can assure you that I'm actively ready to assist you with any questions or information you need regarding classical music or any other topic.')


# We will compress the 4096-dimensional NVIDIA embeddings into 16 dimensions with PCA, one dimension per qubit. This allows us to use 16 qubits on the IBM hardware later.

In [None]:
from sklearn.decomposition import PCA
import numpy as np

class QuantumDataBridge:
    """Compress classical embeddings into per-qubit rotation angles.

    PCA reduces every embedding to ``n_qubits`` components, and each component
    is min-max scaled into ``[0, pi]`` so it can be used as a rotation angle.

    The scaling range is learned once, in :meth:`fit`, from the PCA projection
    of the training corpus and is reused by every :meth:`transform` call.
    Scaling against the batch being transformed instead would map any
    single-row batch to all zeros (its min equals its max) and would put
    vectors transformed in separate calls on different scales.
    """

    def __init__(self, n_qubits=16):
        self.n_qubits = n_qubits
        self.pca = PCA(n_components=n_qubits)
        self.is_fitted = False
        # Per-component min / max of the projected training data; set by fit().
        self._min_vals = None
        self._max_vals = None

    def fit(self, embeddings):
        """Fit the PCA and the angle-scaling range on corpus embeddings.

        Args:
            embeddings: 2-D array-like with one embedding per row.
        """
        # Ensure input is a numpy array
        embeddings_array = np.asarray(embeddings)
        print(f"Fitting Quantum Bridge: {embeddings_array.shape[-1]} dims -> {self.n_qubits} qubits")
        self.pca.fit(embeddings_array)

        # Remember the range each principal component spans on the corpus so
        # that transform() scales every later input the same way.
        projected = np.asarray(self.pca.transform(embeddings_array))
        self._min_vals = projected.min(axis=0)
        self._max_vals = projected.max(axis=0)

        self.is_fitted = True
        print("✅ Quantum Bridge: Successfully fitted to data.")

    def transform(self, embeddings):
        """Compress embeddings and map them to rotation angles in ``[0, pi]``.

        Args:
            embeddings: 2-D array-like with one embedding per row.

        Returns:
            Array with one row per input and ``n_qubits`` angle columns.

        Raises:
            ValueError: if :meth:`fit` has not been called yet.
        """
        if not self.is_fitted:
            raise ValueError("Bridge must be fitted before transform! Run .fit() first.")

        reduced_data = np.asarray(self.pca.transform(np.asarray(embeddings)))

        # Min-max scale with the range learned in fit(); the epsilon keeps a
        # constant component from dividing by zero.
        denom = (self._max_vals - self._min_vals) + 1e-9
        norm_data = np.pi * (reduced_data - self._min_vals) / denom

        # Inputs outside the training range would otherwise leave [0, pi].
        return np.clip(norm_data, 0.0, np.pi)

# --- Verification of Step 3 ---
def verify_step_3():
    """Exercise ``QuantumDataBridge`` on random data and report the outcome.

    Fits a 16-qubit bridge on 50 random 4096-dimensional vectors, transforms
    the first vector on its own, prints the shapes and the first five angles,
    and checks that the result has shape ``(1, 16)`` with no value above pi.

    Returns:
        The fitted bridge when the check passes, otherwise ``None``.
    """
    # Stand-in for 50 document embeddings from the NVIDIA model (4096 dims)
    fake_embeddings = np.random.rand(50, 4096)
    single_query = fake_embeddings[0:1]

    candidate = QuantumDataBridge(n_qubits=16)
    candidate.fit(fake_embeddings)

    # Transform a single 'query' vector
    angles = candidate.transform(single_query)

    print(f"Input Shape: {single_query.shape}")
    print(f"Quantum-Ready Shape: {angles.shape}")
    print(f"First 5 Qubit Angles: {angles[0][:5]}")

    # Guard clause: bail out on a wrong shape or an angle beyond pi.
    if not (angles.shape == (1, 16) and np.max(angles) <= np.pi):
        print("❌ Step 3 Error: Dimensions or scaling incorrect.")
        return None

    print("✅ Step 3 Clear: Data is ready for the 16-qubit circuit.")
    return candidate

bridge = verify_step_3()

Fitting Quantum Bridge: 4096 dims -> 16 qubits
✅ Quantum Bridge: Successfully fitted to data.
Input Shape: (1, 4096)
Quantum-Ready Shape: (1, 16)
First 5 Qubit Angles: [0. 0. 0. 0. 0.]
✅ Step 3 Clear: Data is ready for the 16-qubit circuit.


Step 4: The Quantum Kernel Implementation
This is the "heart" of your project. We are going to build a Quantum Kernel Estimator. Instead of just measuring the angle between two vectors (classical cosine similarity), this circuit embeds your query and a document into a complex quantum state and measures their Overlap (Fidelity).

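Because we are using Angle Embedding, the features are mapped to rotations on the Bloch sphere. The kernel measures how closely these rotations align in a high-dimensional Hilbert space. Concretely, with one $R_X$ rotation per qubit and no entangling gates, the measured overlap is $k(x, y) = \prod_{i=1}^{16} \cos^2\big((x_i - y_i)/2\big)$: it equals 1 only when every angle matches and decays quickly as the angles diverge.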

In [None]:
import pennylane as qml
from pennylane import numpy as pnp

# We're setting up for our 16-qubit IBM hardware, but let's stick
# with the simulator for now while we're testing the logic.
n_qubits = 16
dev = qml.device("default.qubit", wires=n_qubits)

@qml.qnode(dev)
def quantum_kernel_circuit(x1, x2):
    """Overlap circuit for two angle-encoded vectors.

    Applies ``AngleEmbedding(x1)`` and then the adjoint of
    ``AngleEmbedding(x2)`` (both as X rotations) on ``n_qubits`` wires, and
    returns the probabilities over all of those wires. When ``x1`` and ``x2``
    are equal the two embeddings cancel and the register returns to |0...0>.
    """
    register = range(n_qubits)

    # Encode the query vector ...
    qml.AngleEmbedding(x1, wires=register, rotation='X')

    # ... then undo the document encoding on the same wires.
    qml.adjoint(qml.AngleEmbedding)(x2, wires=register, rotation='X')

    # The caller reads the similarity from the probability of the zero state.
    return qml.probs(wires=register)

def get_quantum_similarity(query_angles, doc_angles):
    """Return the kernel value for one query/document pair as a plain float.

    Runs ``quantum_kernel_circuit`` and takes entry 0 of its probability
    vector, i.e. the probability of measuring |00...0>.
    """
    zero_state_probability = quantum_kernel_circuit(query_angles, doc_angles)[0]
    return float(zero_state_probability)

# --- Quick check to make sure our kernel logic holds up ---
def verify_step_4():
    """Sanity-check the quantum kernel on two hand-picked angle vectors.

    Prints the overlap of a vector with itself and with a clearly different
    vector. The check passes only if the self-overlap is (almost) 1 AND the
    overlap with the different vector is lower; checking the self-overlap
    alone would also accept a kernel that returns 1.0 for every pair.
    """
    print("---Checking Our Quantum Kernel ---")

    # If we pass in the same vector twice, our score should be 1.0
    vec_a = pnp.array([np.pi/2] * n_qubits)
    sim_identical = get_quantum_similarity(vec_a, vec_a)
    print(f"Overlap (Same): {sim_identical:.4f}")

    # Different vectors should give us a much lower score
    vec_b = pnp.array([0.0] * n_qubits)
    sim_different = get_quantum_similarity(vec_a, vec_b)
    print(f"Overlap (Different): {sim_different:.4f}")

    if sim_identical > 0.99 and sim_different < sim_identical:
        print("✅ Step 4 Clear: Our kernel is calculating state overlap correctly.")
    else:
        print("❌ Step 4 Error: Something is wrong with our similarity calculation.")

verify_step_4()

---Checking Our Quantum Kernel ---
Overlap (Same): 1.0000
Overlap (Different): 0.0000
✅ Step 4 Clear: Our kernel is calculating state overlap correctly.


RAG For Document Chunk Retrieval:


Task 1: Loading And Chunking Your Documents

In [None]:
import json
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter


from langchain_community.document_loaders import ArxivLoader


# Splitter configuration: 1000-character chunks with 100 characters of
# overlap, trying the separators in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100,
    separators=["\n\n", "\n", ".", ";", ",", " "],
)

## Pick some papers and add them to the list.
## Each entry is the result of one ArxivLoader(...).load() call, so `docs` is a
## list of lists; the code below reads element [0] of each inner list.

print("Loading Documents")
docs = [
    ArxivLoader(query="1706.03762").load(),  ## Attention Is All You Need Paper
    ArxivLoader(query="1810.04805").load(),  ## BERT Paper
    ArxivLoader(query="2005.11401").load(),  ## RAG Paper
    ArxivLoader(query="2205.00445").load(),  ## MRKL Paper
    ArxivLoader(query="2310.06825").load(),  ## Mistral Paper
    ArxivLoader(query="2306.05685").load(), ## LLM-as-a-Judge
    ArxivLoader(query="2312.10997").load(), ## Newly added: "Retrieval-Augmented Generation for LLMs: A Survey"
    #ArxivLoader(query="2509.14277").load(),

    ## Some longer papers (disabled)
    # ArxivLoader(query="2210.03629").load(),  ## ReAct Paper
    # ArxivLoader(query="2112.10752").load(),  ## Latent Stable Diffusion Paper
    # ArxivLoader(query="2103.00020").load(),  ## CLIP Paper

]


## "References" is a standard heading in papers: cut each paper there so the
## bibliography is not chunked and indexed.
## We work on the raw text. Passing it through json.dumps() first would wrap it
## in quotes and turn real newlines into literal "\n" escapes, and that escaped
## string would then be stored back, so the newline separators used by the
## splitter would no longer match anything in the truncated papers.
for doc in docs:
    if not doc:
        # The loader returned nothing for this query; nothing to truncate.
        continue
    content = doc[0].page_content
    references_start = content.find("References")
    if references_start != -1:
        doc[0].page_content = content[:references_start]

## Split every paper into chunks, then drop stubs: only chunks whose text is
## longer than 200 characters are kept. `docs_chunks` stays a list of lists,
## one inner list per paper.
print("Chunking Documents")
docs_chunks = [text_splitter.split_documents(doc) for doc in docs]
docs_chunks = [[c for c in dchunks if len(c.page_content) > 200] for dchunks in docs_chunks]

## Build some custom chunks: one listing every available paper title, plus
## one per paper holding its stringified metadata.
doc_string = "Available Documents:"
doc_metadata = []
for chunks in docs_chunks:
    if not chunks:
        # Every chunk of this paper was filtered out; there is no metadata to read.
        continue
    metadata = getattr(chunks[0], 'metadata', None) or {}
    # A missing title must not break the string concatenation.
    doc_string += "\n - " + str(metadata.get('Title') or "Untitled")
    doc_metadata += [str(metadata)]

extra_chunks = [doc_string] + doc_metadata

## Print summary information for reference: the title list, then for each
## paper its chunk count and the metadata of its first chunk.
## NOTE(review): `chunks[0]` assumes every paper kept at least one chunk.
pprint(doc_string, '\n')
for i, chunks in enumerate(docs_chunks):
    print(f"Document {i}")
    print(f" - # Chunks: {len(chunks)}")
    print(f" - Metadata: ")
    pprint(chunks[0].metadata)
    print()



Loading Documents
Chunking Documents


Document 0
 - # Chunks: 35
 - Metadata: 



Document 1
 - # Chunks: 45
 - Metadata: 



Document 2
 - # Chunks: 46
 - Metadata: 



Document 3
 - # Chunks: 40
 - Metadata: 



Document 4
 - # Chunks: 21
 - Metadata: 



Document 5
 - # Chunks: 44
 - Metadata: 



Document 6
 - # Chunks: 122
 - Metadata: 





Task 2: Construct Your Document Vector Stores
Now that we have all of the components, we can go ahead and create indices surrounding them:

In [None]:
%%time
# One FAISS store for the custom text chunks (title list + metadata strings),
# followed by one FAISS store per paper built from that paper's chunks.
print("Constructing Vector Stores")
vecstores = [FAISS.from_texts(extra_chunks, embedder)]
vecstores += [FAISS.from_documents(doc_chunks, embedder) for doc_chunks in docs_chunks]

Constructing Vector Stores
CPU times: user 1.38 s, sys: 71.2 ms, total: 1.45 s
Wall time: 29.9 s


From there, we can combine our indices into a single one using the following utility:

In [None]:
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore

embed_dims = len(embedder.embed_query("test"))

def default_FAISS():
    """Return a new, empty FAISS vector store.

    The store uses the notebook's ``embedder``, a flat L2 index sized to
    ``embed_dims``, an in-memory docstore and no L2 normalisation.
    """
    empty_index = IndexFlatL2(embed_dims)
    return FAISS(
        embedding_function=embedder,
        index=empty_index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

def aggregate_vstores(vectorstores):
    """Merge the given FAISS stores into one freshly created store.

    Starts from an empty store built by ``default_FAISS()`` and calls
    ``merge_from`` once per input store, in order.

    Returns:
        The merged store.
    """
    merged = default_FAISS()
    for store in vectorstores:
        merged.merge_from(store)
    return merged

## Unintuitive optimization; merge_from seems to optimize constituent vector stores away
## `docstore` is the single aggregated FAISS store used for retrieval below.
docstore = aggregate_vstores(vecstores)

# NOTE(review): `_dict` is a private attribute of the in-memory docstore.
print(f"Constructed aggregate docstore with {len(docstore.docstore._dict)} chunks")

Constructed aggregate docstore with 361 chunks


This is where we finally train our PCA on our real data.

In [None]:
# Now that we've merged everything into our 'docstore', we need to
# pull out all those 4096-dim NVIDIA vectors. This lets us train
# our bridge to compress our specific research papers into 16 qubits.

print("---Priming Our Quantum Bridge ---")

# Grab every stored vector (positions 0 .. ntotal) from our unified FAISS index.
all_vectors = docstore.index.reconstruct_n(0, docstore.index.ntotal)

# Re-fit the bridge on the real corpus vectors. `bridge` is the object returned
# by verify_step_3(), which fitted it on random data and returns None when its
# check fails.
# NOTE(review): if that check failed, this call raises AttributeError.
bridge.fit(all_vectors)

print(f"✅ Our bridge is now tuned to all {len(all_vectors)} document chunks.")

---Priming Our Quantum Bridge ---
Fitting Quantum Bridge: 4096 dims -> 16 qubits
✅ Quantum Bridge: Successfully fitted to data.
✅ Our bridge is now tuned to all 361 document chunks.


Now we move to Step 5.2: The Quantum Re-ranker Function. This is where we combine everything: we'll take the "candidates" that FAISS finds and use our 16-qubit circuit to determine which ones actually have the best quantum overlap with our query.

Step 5.2: The Quantum Re-ranker Function

In [None]:
# This version is also functional, but it takes too long to respond to queries.
'''def quantum_rerank(query, documents, top_k=5):
    """
    Our hybrid re-ranking logic. We take the 'rough' results from FAISS,
    run them through our 16-qubit kernel, and pick the true winners.
    """
    if not documents:
        return []

    # 1. First, we get our query's 4096-dim embedding
    query_vec = embedder.embed_query(query)

    # 2. We grab the embeddings for our candidate chunks
    doc_texts = [d.page_content for d in documents]
    doc_vecs = embedder.embed_documents(doc_texts)

    # 3. We use our primed bridge to compress them into 16-qubit rotation angles
    q_angles = bridge.transform([query_vec])[0]
    d_angles_list = bridge.transform(doc_vecs)

    # 4. Now we loop through and calculate the quantum similarity for each
    scored_docs = []
    for i, doc in enumerate(documents):
        # We're calling our circuit here to measure the state overlap
        score = get_quantum_similarity(q_angles, d_angles_list[i])
        scored_docs.append((score, doc))

    # 5. We sort them so the highest quantum overlap comes first
    scored_docs.sort(key=lambda x: x[0], reverse=True)

    # We return only the top_k chunks for our LLM to read
    return [doc for score, doc in scored_docs[:top_k]]

print("✅ Our Quantum Re-ranker function is live.")'''

'def quantum_rerank(query, documents, top_k=5):\n    """\n    Our hybrid re-ranking logic. We take the \'rough\' results from FAISS,\n    run them through our 16-qubit kernel, and pick the true winners.\n    """\n    if not documents:\n        return []\n\n    # 1. First, we get our query\'s 4096-dim embedding\n    query_vec = embedder.embed_query(query)\n\n    # 2. We grab the embeddings for our candidate chunks\n    doc_texts = [d.page_content for d in documents]\n    doc_vecs = embedder.embed_documents(doc_texts)\n\n    # 3. We use our primed bridge to compress them into 16-qubit rotation angles\n    q_angles = bridge.transform([query_vec])[0]\n    d_angles_list = bridge.transform(doc_vecs)\n\n    # 4. Now we loop through and calculate the quantum similarity for each\n    scored_docs = []\n    for i, doc in enumerate(documents):\n        # We\'re calling our circuit here to measure the state overlap\n        score = get_quantum_similarity(q_angles, d_angles_list[i])\n        scored_

In [None]:
# --- Our Optimized Quantum Re-ranker code ---

@qml.qnode(dev)
def batched_quantum_kernel(query_angle, doc_angles):
    """Overlap circuit evaluated for one query against a batch of documents.

    ``query_angle`` holds one angle per qubit and ``doc_angles`` one row of
    angles per document. Passing the whole document matrix lets PennyLane
    evaluate the batch in a single QNode call instead of one call per
    document. The QNode runs on whatever device ``dev`` refers to when this
    cell is executed; rebinding ``dev`` later does not move this QNode.

    Returns:
        The probabilities over all ``n_qubits`` wires for every document in
        the batch; entry 0 of each row is the |0...0> overlap score.
    """
    # Use the shared qubit count rather than a second hard-coded 16.
    register = range(n_qubits)

    # 1. Encode our query (shared by every document in the batch)
    qml.AngleEmbedding(query_angle, wires=register, rotation='X')

    # 2. Encode our batch of documents using the adjoint (inverse)
    qml.adjoint(qml.AngleEmbedding)(doc_angles, wires=register, rotation='X')

    # 3. We get a probability vector for every document in the batch
    return qml.probs(wires=register)

def quantum_rerank(query, documents, top_k=5):
    """Re-rank candidate documents by quantum-kernel overlap with the query.

    Args:
        query: The user's question as text.
        documents: Candidate documents; each must expose ``page_content``.
        top_k: Number of best-scoring documents to return.

    Returns:
        Up to ``top_k`` documents ordered from highest to lowest overlap
        score, or ``[]`` when ``documents`` is empty.
    """
    if not documents:
        return []

    # Embed the query and the candidate texts
    query_vec = embedder.embed_query(query)
    doc_vecs = embedder.embed_documents([d.page_content for d in documents])

    # Compress query and candidates in ONE transform call so both are
    # guaranteed to go through the same angle scaling.
    angles = np.asarray(bridge.transform([query_vec] + list(doc_vecs)))
    q_angles, d_angles_batch = angles[0], angles[1:]

    # Report the device the QNode is actually bound to rather than a fixed label.
    backend_name = getattr(getattr(batched_quantum_kernel, "device", None), "name", "unknown device")
    print(f"---Broadcasting Batch Job to {backend_name} (Batch Size: {len(documents)}) ---")
    all_probs = batched_quantum_kernel(q_angles, d_angles_batch)

    # One row of probabilities per document (also for a batch of one);
    # column 0 is the probability of the |0...0> state.
    scores = np.asarray(all_probs).reshape(len(documents), -1)[:, 0]

    # Highest overlap first; sorted() is stable, so ties keep retrieval order.
    ranked = sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_k]]

print("Speed Patch Applied: Sequential loops replaced with Batch Processing.")

Speed Patch Applied: Sequential loops replaced with Batch Processing.


Task 3: Implement Your RAG Chain

In [None]:
from langchain_community.document_transformers import LongContextReorder
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.passthrough import RunnableAssign
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

import gradio as gr
from functools import partial
from operator import itemgetter

#classical reranking Rag chain code
'''# NVIDIAEmbeddings.get_available_models()
embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END",api_key=userdata.get("NVIDIA_API_KEY"))
# ChatNVIDIA.get_available_models()
instruct_llm = ChatNVIDIA(model="mistralai/mixtral-8x7b-instruct-v0.1",api_key=userdata.get("NVIDIA_API_KEY"))
# instruct_llm = ChatNVIDIA(model="meta/llama-3.1-8b-instruct")

convstore = default_FAISS()

def save_memory_and_get_output(d, vstore):
    """Accepts 'input'/'output' dictionary and saves to convstore"""
    vstore.add_texts([
        f"User previously responded with {d.get('input')}",
        f"Agent previously responded with {d.get('output')}"
    ])
    return d.get('output')

initial_msg = (
    "Hello! I am a document chat agent here to help the user!"
    f" I have access to the following documents: {doc_string}\n\nHow can I help you?"
)

chat_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a document-based question answering assistant.\n\n"

     "You have access to retrieved content from user-uploaded documents.\n"
     "You must follow ALL rules below strictly:\n\n"

     "RULES FOR ANSWERING:\n"
     "1. Use the retrieved document context as the primary source of truth.\n"
     "2. If a statement is supported by the retrieved documents, include it in the answer.\n"
     "3. If a statement is NOT supported by the retrieved documents but is common knowledge, "
     "you MAY include it, but it MUST be explicitly labeled as general knowledge.\n"
     "4. NEVER attribute general knowledge to a document.\n"
     "5. NEVER fabricate citations.\n\n"

     "SOURCE ATTRIBUTION RULES:\n"
     "- At the end of EVERY response, include a section titled exactly:\n"
     "  \"Sources\"\n"
     "- Under \"Sources\", list ONLY the exact titles of documents that directly support the answer.\n"
     "- If part of the answer comes from general knowledge, include a bullet:\n"
     "  \"General knowledge of the language model (not found in uploaded documents)\"\n"
     "- If NO documents support the answer, list ONLY the general knowledge bullet.\n\n"

     "FORMAT REQUIREMENTS:\n"
     "- Write the main answer first.\n"
     "- Then write a blank line.\n"
     "- Then write \"Sources:\" on its own line.\n"
     "- Then list sources as bullet points.\n\n"

     "User question:\n{input}\n\n"

     "Conversation history (may be empty):\n{history}\n\n"

     "Retrieved document context:\n{context}\n\n"

     "Answer in a clear, precise, and honest tone."
    ),
    ("user", "{input}")
])



stream_chain = chat_prompt| instruct_llm | StrOutputParser()

################################################################################################
##Implement the retrieval chain

long_reorder = LongContextReorder()

def docs2str(docs, max_chars=4000):
    """Join retrieved docs into a single text blob including simple source titles."""
    parts = []
    for d in docs:
        md = getattr(d, "metadata", {}) or {}
        title = md.get("Title") or md.get("title") or md.get("source") or "unknown"
        parts.append(f"[{title}]\n{d.page_content}")
    text = "\n\n---\n\n".join(parts)
    return text[:max_chars]

retrieval_chain = (
    {'input' : (lambda x: x)}  # input is a raw string
    | RunnableAssign({
        # Retrieve recent conversational memory from convstore
        "history": lambda d: docs2str(convstore.similarity_search(d["input"], k=4), max_chars=2000),
        # Retrieve relevant document chunks from docstore, reorder long context, and stringify
        "context": lambda d: docs2str(
            long_reorder.transform_documents(docstore.similarity_search(d["input"], k=8)),
            max_chars=4000
        ),
    })
)
#################################################################################

def chat_gen(message, history=[], return_buffer=True):
    buffer = ""
    ## First performing retrieval based on the input message
    retrieval = retrieval_chain.invoke(message)
    line_buffer = ""

    ## Then, stream the results of the stream_chain
    for token in stream_chain.stream(retrieval):
        buffer += token

        yield buffer if return_buffer else token

    ## Lastly, saving the chat exchange to the conversation memory buffer
    save_memory_and_get_output({'input':  message, 'output': buffer}, convstore)


## Start of Agent Event Loop
test_question = "Tell me about RAG!"

## Before launching gradio interface,test the working
for response in chat_gen(test_question, return_buffer=False):
    print(response, end='') '''

# ---Our Quantum-Enhanced RAG Chain ---


# Re-create the embedding and chat clients for the chain, plus an empty FAISS
# store that serves as conversation memory.
# NOTE(review): the chat model here is mixtral-8x7b, while the earlier setup
# and verification cells use mixtral-8x22b; confirm which one is intended.
embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END", api_key=userdata.get("NVIDIA_API_KEY"))
instruct_llm = ChatNVIDIA(model="mistralai/mixtral-8x7b-instruct-v0.1", api_key=userdata.get("NVIDIA_API_KEY"))
convstore = default_FAISS()

# save our conversation to memory
def save_memory_and_get_output(d, vstore):
    """Store one chat exchange in ``vstore`` and return the agent's output.

    Args:
        d: Mapping with the keys ``'input'`` and ``'output'``.
        vstore: Object with an ``add_texts(list_of_str)`` method.

    Returns:
        ``d.get('output')``.
    """
    user_turn = d.get('input')
    agent_turn = d.get('output')
    memory_lines = [
        f"User previously responded with {user_turn}",
        f"Agent previously responded with {agent_turn}",
    ]
    vstore.add_texts(memory_lines)
    return agent_turn

# Helper to turn our retrieved chunks into a single string for the LLM
def docs2str(docs, max_chars=4000):
    """Render retrieved documents as one text blob for the prompt.

    Each document becomes ``[<title>]`` on one line followed by its
    ``page_content``; documents are separated by a ``---`` rule. The title is
    the first non-empty value among the metadata keys ``Title``, ``title`` and
    ``source``, falling back to ``unknown``.

    Args:
        docs: Iterable of documents exposing ``page_content`` and, optionally,
            ``metadata``.
        max_chars: The joined text is cut to this many characters.

    Returns:
        The joined (and possibly truncated) string.
    """
    def _label(doc):
        meta = getattr(doc, "metadata", {}) or {}
        for key in ("Title", "title", "source"):
            value = meta.get(key)
            if value:
                return value
        return "unknown"

    joined = "\n\n---\n\n".join(
        f"[{_label(doc)}]\n{doc.page_content}" for doc in docs
    )
    return joined[:max_chars]

# Greeting shown as the chatbot's first message; it embeds `doc_string`, the
# list of available paper titles built during chunking.
initial_msg = (
    "Hello! I am a document chat agent here to help our user!"
    f" I have access to the following documents: {doc_string}\n\nHow can I help us today?"
)

# Our instruction set for the assistant.
# The system message carries the answering rules and the three template
# variables {input}, {history} and {context}; the user message repeats {input}.
chat_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a document-based question answering assistant.\n\n"
     "You have access to retrieved content from user-uploaded documents.\n"
     "You must follow ALL rules below strictly:\n\n"
     "RULES FOR ANSWERING:\n"
     "1. Use the retrieved document context as the primary source of truth.\n"
     "2. If a statement is supported by the retrieved documents, include it in the answer.\n"
     "3. If a statement is NOT supported by the retrieved documents but is common knowledge, "
     "you MAY include it, but it MUST be explicitly labeled as general knowledge.\n"
     "4. NEVER attribute general knowledge to a document.\n"
     "5. NEVER fabricate citations.\n\n"
     "SOURCE ATTRIBUTION RULES:\n"
     "- At the end of EVERY response, include a section titled exactly:\n"
     "  \"Sources\"\n"
     "- Under \"Sources\", list ONLY the exact titles of documents that directly support the answer.\n"
     "- If part of the answer comes from general knowledge, include a bullet:\n"
     "  \"General knowledge of the language model (not found in uploaded documents)\"\n"
     "- If NO documents support the answer, list ONLY the general knowledge bullet.\n\n"
     "FORMAT REQUIREMENTS:\n"
     "- Write the main answer first.\n"
     "- Then write a blank line.\n"
     "- Then write \"Sources:\" on its own line.\n"
     "- Then list sources as bullet points.\n\n"
     "User question:\n{input}\n\n"
     "Conversation history (may be empty):\n{history}\n\n"
     "Retrieved document context:\n{context}\n\n"
     "Answer in a clear, precise, and honest tone."
    ),
    ("user", "{input}")
])

# Generation chain: fill the prompt, call the LLM, parse the reply to a string.

stream_chain = chat_prompt | instruct_llm | StrOutputParser()

# --- Our Hybrid Retrieval Logic ---
# We grab 20 candidates and let our 16-qubit re-ranker find the best 5
# (5 is the default `top_k` of quantum_rerank).
# The chain takes the raw question string and produces a dict with the keys
# 'input', 'history' and 'context' expected by chat_prompt.
retrieval_chain = (
    {'input' : (lambda x: x)}
    | RunnableAssign({
        # Up to 4 remembered conversation snippets, capped at 2000 characters
        "history": lambda d: docs2str(convstore.similarity_search(d["input"], k=4), max_chars=2000),
        # FAISS candidates re-ranked by the quantum kernel, capped at 4000 characters
        "context": lambda d: docs2str(
            quantum_rerank(
                d["input"],
                docstore.similarity_search(d["input"], k=20)
            ),
            max_chars=4000
        ),
    })
)

# Our final function that Gradio will call
def chat_gen(message, history=None, return_buffer=True):
    """Answer ``message`` as a stream; this is the callback handed to Gradio.

    Args:
        message: The user's question.
        history: Accepted so the function matches the chat-callback signature;
            it is not read here (conversation memory lives in ``convstore``).
            Defaults to ``None`` instead of a shared mutable list.
        return_buffer: When true, yield the text accumulated so far on every
            step; when false, yield only the newest token.

    Yields:
        The accumulated response or the latest token, per ``return_buffer``.
    """
    buffer = ""
    # We run our quantum-enhanced retrieval first
    retrieval = retrieval_chain.invoke(message)

    # Then we stream the LLM's response based on those results
    for token in stream_chain.stream(retrieval):
        buffer += token
        yield buffer if return_buffer else token

    # Save the exchange to our memory once the stream has finished
    save_memory_and_get_output({'input':  message, 'output': buffer}, convstore)

print("✅ Our full Hybrid Chain is now active and defined.")

# --- Quick Test Run ---
# One end-to-end question through retrieval, re-ranking and generation before
# the Gradio interface is launched; tokens are printed as they arrive.
test_question = "Explain the two main components of the RAG framework and how they interact according to the papers."
print(f"Testing our brain with: {test_question}\n")

for response in chat_gen(test_question, return_buffer=False):
    print(response, end='')


✅ Our full Hybrid Chain is now active and defined.
Testing our brain with: Explain the two main components of the RAG framework and how they interact according to the papers.

---Broadcasting Batch Job to ibm_qpu (Batch Size: 20) ---
 The two main components of the RAG (Retrieval-Augmented Generation) framework are "Retrieval" and "Generation."

- Retrieval: This component is responsible for searching relevant documents in external databases based on the input. It uses various optimization methods like indexing, query, and embedding optimization to enhance the retrieval process.

- Generation: After the retrieval phase, the model uses the fetched documents to generate appropriate responses. This process involves post-retrieval processing and fine-tuning of the large language model (LLM).

The "Retrieval" and "Generation" components collaborate in a way that the Retrieval component provides context-specific documents, and the Generation component utilizes this information to generate pr

In [None]:
# Force-installing setuptools (needed for Python 3.12) and the IBM runtime
%pip install -qU setuptools qiskit-ibm-runtime pennylane-qiskit

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.6/378.6 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m121.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1

In [None]:
import pennylane as qml
from qiskit_ibm_runtime import QiskitRuntimeService

# --- IBM CLOUD PRODUCTION CONFIGURATION ---

# This is our unique Cloud Resource Name (CRN) for our IBM Cloud instance,
# read from the Colab secrets.
MY_CRN = userdata.get('IBM_CRN')

try:

    # 1. Authenticate. We pass our CRN directly into the 'instance' parameter.
    service = QiskitRuntimeService(
        channel="ibm_cloud",
        token=userdata.get('IBM_TOKEN'),
        instance=MY_CRN
    )

    # 2. Locking onto the 133-qubit (in this case) hardware
    qpu_backend = service.backend("ibm_torino")

    # 3. THE HARDWARE SWITCH:
    # Swapping our local 'default.qubit' for a 16-wire remote device.
    dev = qml.device('qiskit.remote', wires=16, backend=qpu_backend)
    ## To use a simulator instead, comment out the line above and uncomment the line below.
    #dev = qml.device("lightning.qubit", wires=16)

    # 4. Redefining our circuits to use the new device.
    # A QNode stays bound to the device it was created with, so reassigning
    # `dev` alone changes nothing: every circuit the pipeline calls has to be
    # rebuilt here. That includes `batched_quantum_kernel`, which is the
    # circuit `quantum_rerank` actually calls.
    # NOTE(review): this sends the re-ranker's batch of 20 circuits to the
    # remote backend on every chat query; expect queueing latency.
    @qml.qnode(dev)
    def quantum_kernel_circuit(x1, x2):
        # Angle Embedding for our 16 compressed dimensions
        qml.AngleEmbedding(x1, wires=range(16), rotation='X')
        # Adjoint (inverse) for the document chunk to measure overlap
        qml.adjoint(qml.AngleEmbedding)(x2, wires=range(16), rotation='X')
        return qml.probs(wires=range(16))

    @qml.qnode(dev)
    def batched_quantum_kernel(query_angle, doc_angles):
        # Same overlap circuit, with a matrix of document angles (one row per document)
        qml.AngleEmbedding(query_angle, wires=range(16), rotation='X')
        qml.adjoint(qml.AngleEmbedding)(doc_angles, wires=range(16), rotation='X')
        return qml.probs(wires=range(16))

    print(f"✅ SYSTEM LIVE ON IBM CLOUD: {qpu_backend.name}")
    print(f"Qubits: {qpu_backend.num_qubits}")
    print(f"Status: {qpu_backend.status().status_msg}")

except Exception as e:
    # Best effort: on any failure the previously defined circuits stay in place.
    print(f"❌ Connection Error: {e}")



✅ SYSTEM LIVE ON IBM CLOUD: ibm_torino
Qubits: 133
Status: active


Task 4: Interact With Your Gradio Chatbot

In [None]:
import gradio as gr

# 1. We update our chatbot component to explicitly use 'messages' type.
# The conversation starts with `initial_msg` as the assistant's first turn.

chatbot = gr.Chatbot(
    value=[{"role": "assistant", "content": initial_msg}],
    height=600,
    type="messages"
)



# 2. Chat UI wired to `chat_gen`.
# NOTE(review): the description hard-codes the backend name "ibm_torino"; it is
# not derived from the device the circuits run on. The f-prefix has no
# placeholders.
demo = gr.ChatInterface(
    chat_gen,
    chatbot=chatbot,
    type="messages",
    title="Hybrid Quantum-Classical RAG",
    description=f"Active Backend: **ibm_torino** | NVIDIA Embeddings: **nv-embed-v1**",
    theme="soft"
).queue()

# 3. Launch with a public share link; close the demo if launch raises.
print("Launching our Quantum-Enhanced Interface...")
try:
    demo.launch(debug=True, share=True)
except Exception as e:
    demo.close()
    print(f"Interface Error: {e}")

  chatbot = gr.Chatbot(


Launching our Quantum-Enhanced Interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://12242341e10c4c913e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://12242341e10c4c913e.gradio.live
