In [1]:
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Import required modules
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    HumanMessagePromptTemplate,
)
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain_core.runnables import RunnableLambda, RunnableMap
from langchain_core.output_parsers import StrOutputParser
from langchain.load import dumps, loads
from langchain_community.chat_models import ChatOllama
from fastapi.responses import StreamingResponse


from tqdm import tqdm
import torch
import re
import os

import gradio as gr

Embedding Config

In [2]:
# Embed the text content in split_docs
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base-v2",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  embedding_model = HuggingFaceEmbeddings(


Load Existing Vector DB

In [3]:
# Load the vector store
vectorstore = Chroma(
    persist_directory="../data/doc_vectordb",
    embedding_function=embedding_model,
)

  vectorstore = Chroma(


Retriever Config

In [4]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

Setting up Local LLM with Ollama. 

In [5]:
llm = ChatOllama(model="qwen3:8b", base_url="http://localhost:11434", streaming=True)

  llm = ChatOllama(model="qwen3:8b", base_url="http://localhost:11434", streaming=True)


MultiQuery Generation Chain.

In [6]:
# Prompt
template = """You are an AI scientific research assistant. Your task is to generate two 
different versions of the given user question and the conversation history so far to retrieve relevant documents from a vector 
database. The vector database consists of scientific papers, articles, books and other academic 
resources related to the field of Statistical Physics, Computational Physics, Statistics, Soft-Matter Physics
and other related fields. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. If the below question is unrelated or does not require
additional context, you can respond with "No relevant questions found."
Original question: {question}
Conversation history: {history}"""
multi_prompt = ChatPromptTemplate.from_template(template)


def remove_think_blocks(text):
    # Remove all <think>...</think> blocks from the text
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()


# Query generation cHAIN
generate_query_chain = (
    multi_prompt
    | llm
    | StrOutputParser()
    | remove_think_blocks
    | (lambda x: x.split("\n"))
)

Retrieval Chain for multiple queries.

In [7]:
# Function to make a single list of the retrieved documents
def get_unique_union(documents: list[list]):
    """Unique union of retrieved docs"""
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]


# Retrieval Chain
retrieval_chain = generate_query_chain | retriever.map() | get_unique_union

# Summary Chain 

In [8]:
# Prompt
template = """
You are an AI scientific research assistant. Summarize the following conversation between a user and an assistant.
- Focus on the key points, main ideas, and any important questions and answers.
- Exclude irrelevant, repetitive, or off-topic content.
- Use your own words; do not copy the conversation verbatim.
- Keep the summary concise and within 512-1024 tokens.
- Format the summary as a narration of how the conversation between the user and the assistant unfolded.

Original conversation:
{history}
"""
summarization_prompt = ChatPromptTemplate.from_template(template)

# Summarization chain
summarization_chain = (
    summarization_prompt | llm | StrOutputParser() | remove_think_blocks
)

Testing the working of the summarization chain. 

In [9]:
# Example conversation history
history = [
    HumanMessage(content="What is self-aggregation?"),
    AIMessage(
        content="Self-aggregation is the process by which molecules or particles spontaneously organize into ordered structures."
    ),
    HumanMessage(content="Why is it important in materials science?"),
    AIMessage(
        content="It's important because it can lead to new material properties and functionalities."
    ),
]

In [23]:
# See what summary is produced
summary = summarization_chain.invoke({"history": history})
print("Summary:", summary)

Summary: The conversation begins with the user asking for a definition of "self-aggregation," to which the assistant responds by explaining it as a process where molecules or particles spontaneously form ordered structures. The user then inquires about its importance in materials science, and the assistant highlights its significance in enabling novel material properties and functionalities. The exchange focuses on clarifying the concept and its relevance, with the assistant providing concise, direct answers to the user’s questions. No extraneous details are included, and the summary captures the core ideas and progression of the dialogue.


In [10]:
# Can be used to print the prompt  within the RAG chain
def debug_prompt(messages):
    print("Prompt to LLM:")
    for m in messages:
        print(f"{type(m).__name__}: {getattr(m, 'content', m)}")
    return messages


# | RunnableLambda(debug_prompt) Insert this right after the prompt prep and right before the llm call

# RAG Chain 

Prompt 

In [9]:
messages = [
    SystemMessage(
        content="""
        You are a helpful scientific assistant.  
        You need to answer the question in a scientifically sound manner, combining your own knowledge and the 
        provided context and the conversation history. If you use information from the provided context, cite it. 
        If the answer is based on your own knowledge, state so.
        The conversation history is provided above. You also have further context regarding the current question 
        below from snippets from various scientific papers. 
        """
    ),
    HumanMessagePromptTemplate.from_template(
        "Context:\n{context}\n\nQuestion: {question}\n\n History: {history}"
    ),
]

prompt = ChatPromptTemplate.from_messages(messages)


rag_chain_multi_query = (
    {
        "history": RunnableLambda(
            lambda x: summarization_chain.invoke({"history": x["history"]})
        ),
        "context": RunnableLambda(
            lambda x: retrieval_chain.invoke(
                {
                    "question": x["question"],
                    "history": summarization_chain.invoke({"history": x["history"]}),
                }
            )
        ),
        "question": RunnableLambda(lambda x: (x["question"])),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [10]:
def split_think_and_answer(response):
    """Extracts text after </think>."""
    match = re.search(r"</think>\s*(.*)", response, re.DOTALL)
    if match:
        return match.group(1).strip()
    return response.strip()


def get_thought(response):
    """Extracts text inside <think>...</think>."""
    match = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None


def remove_details_blocks(text):
    # Remove all <details>...</details> blocks, including nested tags and multiline content
    return re.sub(r"<details[\s\S]*?</details>", "", text, flags=re.DOTALL).strip()


def rag_qa_multi_query(message, history):
    # Convert Gradio history format to LangChain message format
    history_langchain_format = []
    print("Gradio history:", history)
    # Convert Gradio history (list of {"role": ..., "content": ...}) to LangChain format
    for turn in history:
        if turn["role"] == "user":
            history_langchain_format.append(HumanMessage(content=turn["content"]))
        elif turn["role"] == "assistant":
            cleaned_content = remove_details_blocks(turn["content"])
            history_langchain_format.append(AIMessage(content=cleaned_content))

    try:
        # Pass both current message and history to the chain
        response = rag_chain_multi_query.invoke(
            {"history": history_langchain_format, "question": message}
        )
        thought = get_thought(response)
        final_answer = split_think_and_answer(response)

        if thought:
            # Add collapsible section with the <think> content
            final_answer = f"<details><summary><b>🤔 Thinking</b></summary><pre>{thought}</pre></details>\n\n{final_answer}"
        return {"role": "assistant", "content": final_answer}
    except Exception as e:
        return {"role": "assistant", "content": f"❌ Error: {str(e)}"}


# Create and launch the chat interface with memory
demo = gr.ChatInterface(
    fn=rag_qa_multi_query,
    type="messages",
    title="📄 Scientific PDF Chatbot",
    description="Ask questions about your scientific PDFs. Powered by RAG + Qwen3:8B",
    examples=["What are colloidal particles?", "Tell me more about that"],
)

demo.launch(share=False, inline=False, inbrowser=False)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Gradio history: []


  return forward_call(*args, **kwargs)
  return [loads(doc) for doc in unique_docs]


Gradio history: [{'role': 'user', 'metadata': None, 'content': 'What are colloidal particles?', 'options': None}, {'role': 'assistant', 'metadata': None, 'content': "<details><summary><b>🤔 Thinking</b></summary><pre>Okay, the user is asking about colloidal particles. Let me start by recalling what I know. Colloidal particles are tiny particles suspended in a fluid, right? They're bigger than molecules but smaller than what's visible to the naked eye. Wait, the user provided some documents, so I should check those for specific info.\n\nLooking at the documents, there's a mention of colloidal particles in different contexts. For example, one document talks about colloidal systems and hydrodynamic interactions. Another mentions colloidal self-assembly, both passive and active systems. There's also a reference to interactions like phoretic, osmotic, and hydrodynamic. \n\nSo, colloidal particles are particles dispersed in a medium, usually liquid, with sizes between 1 nm to 1 micrometer. Th

  return forward_call(*args, **kwargs)


Gradio history: [{'role': 'user', 'metadata': None, 'content': 'What are colloidal particles?', 'options': None}, {'role': 'assistant', 'metadata': None, 'content': "<details><summary><b>🤔 Thinking</b></summary><pre>Okay, the user is asking about colloidal particles. Let me start by recalling what I know. Colloidal particles are tiny particles suspended in a fluid, right? They're bigger than molecules but smaller than what's visible to the naked eye. Wait, the user provided some documents, so I should check those for specific info.\n\nLooking at the documents, there's a mention of colloidal particles in different contexts. For example, one document talks about colloidal systems and hydrodynamic interactions. Another mentions colloidal self-assembly, both passive and active systems. There's also a reference to interactions like phoretic, osmotic, and hydrodynamic. \n\nSo, colloidal particles are particles dispersed in a medium, usually liquid, with sizes between 1 nm to 1 micrometer. Th

  return forward_call(*args, **kwargs)
