In [2]:
# from dotenv import load_dotenv

# Load environment variables from .env file
# load_dotenv()

# Import required modules
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm
import torch
from langchain_community.vectorstores import Chroma

# Embedding the PDF files

Testing the output after loading of a single PDF file using PyMuPDFLoader().

In [2]:
loader = PyMuPDFLoader(
    "../data/raw/files/417/Gangwal et al. - 2008 - Induced-Charge Electrophoresis of Metallodielectri.pdf"
)
doc = loader.load()
# Display the entire PDF
for page in doc:
    print(page)

page_content='Induced-Charge Electrophoresis of Metallodielectric Particles
Sumit Gangwal,1 Olivier J. Cayre,1 Martin Z. Bazant,2 and Orlin D. Velev1,*
1Department of Chemical and Biomolecular Engineering, North Carolina State University, Raleigh, North Carolina, 27695, USA
2Department of Mathematics and Institute for Soldier Nanotechnologies, Massachusetts Institute of Technology,
Cambridge, Massachusetts, 02139, USA
(Received 27 April 2007; published 4 February 2008)
The application of ac electric ﬁelds in aqueous suspensions of anisotropic particles leads to unbalanced
liquid ﬂows and nonlinear, induced-charge electrophoretic motion. We report experimental observations
of the motion of Janus microparticles with one dielectric and one metal-coated hemisphere induced by
uniform ﬁelds of frequency 100 Hz–10 kHz in NaCl solutions. The motion is perpendicular to the ﬁeld
axis and persists after particles are attracted to a glass wall. This phenomenon may ﬁnd applications in
microactuator

In [12]:
doc[0].metadata

{'producer': 'Acrobat Distiller 6.0.1 (Windows)',
 'creator': '3B2 Total Publishing System 8.07g/W',
 'creationdate': '2008-01-31T15:53:54-05:00',
 'source': '../data/raw/files/417/Gangwal et al. - 2008 - Induced-Charge Electrophoresis of Metallodielectri.pdf',
 'file_path': '../data/raw/files/417/Gangwal et al. - 2008 - Induced-Charge Electrophoresis of Metallodielectri.pdf',
 'total_pages': 4,
 'format': 'PDF 1.4',
 'title': 'untitled',
 'author': '',
 'subject': '',
 'keywords': '',
 'moddate': '2008-01-31T15:53:54-05:00',
 'trapped': '',
 'modDate': "D:20080131155354-05'00'",
 'creationDate': "D:20080131155354-05'00'",
 'page': 0}

Loading all the PDF files in a directory and splitting each of them into chunks. 

In [2]:
# Loop through all folders in the specified directory
root_dir = "../data/raw/files"  # Change to your target directory
file_count = 0

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500, chunk_overlap=50  # or 1000 depending on your LLM
)

split_docs = []

for dirpath, dirnames, filenames in os.walk(root_dir):
    for filename in filenames:
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(dirpath, filename)
            loader = PyMuPDFLoader(pdf_path)
            doc = loader.load()
            chunks = text_splitter.split_documents(doc)
            split_docs.extend(chunks)
            file_count += 1

print(f"Total PDF files found: {file_count}")

Total PDF files found: 286


Configuring the Embedding model e5-base-v2.  

In [2]:
# Embed the text content in split_docs
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base-v2",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  embedding_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Compute the embeddings in memory. 

In [None]:
embeddings = []
for doc in tqdm(split_docs, desc="Embedding documents"):
    emb = embedding_model.embed_documents([doc.page_content])
    embeddings.append(emb)

Compute the embeddings as a ChromaDB and store them as a Chroma VectorDB locally. 

In [None]:
# Compute the embeddings and store it into a FAISS vector store
vectorstore = Chroma.from_documents(
    split_docs,
    embedding_model,
    persist_directory="../data/doc_vectordb",
)

Load an existing, local Chroma Vector DB.

In [3]:
# Load the vector store
vectorstore = Chroma(
    persist_directory="../data/doc_vectordb",
    embedding_function=embedding_model,
)

  vectorstore = Chroma(


## Retrieval 

Configuring the retriever on the vectorstore object. 

In [4]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

Checking the retriver. 

In [21]:
docs = retriever.get_relevant_documents(
    "What are some explaes of anisotropic self-assembly?"
)
docs[0].page_content

  return forward_call(*args, **kwargs)


'self-assembly are one-dimensional (1D) chains,9–13 2D- and 3D-\ncrystals9,12–15 as well as sheet- and membrane-like structures16,17\nformed by polarizable (superparamagnetic) particles in electric\n(magnetic) \ue103elds. Microscopically, all of these structures origi-\nnate from the \ue103eld-induced, anisotropic dipole–dipole interac-\ntions and their interplay with the external \ue103eld. The resulting\nstructural eﬀects have been extensively studied by theory (see\ne.g., ref. 18–21) and simulations (see e.g., ref. 10, 22 and 23).\nCompared to the structures formed by “simple” particles\nwith permanent or induced dipole moments, our theoretical\nunderstanding of (\ue103eld-driven) assembly of particles with\nmultipolar interactions is in its infancy.24 This is in contrast to\nthe signi\ue103cant experimental progress that has been made in\nsynthesizing novel colloidal particles with tunable directional\ninteractions. If such systems are, in addition, susceptible to an\nexternal \ue1

# LangChain

Configuring a prompt template using LangChain

In [5]:
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

Setting up a locally running LLM using Ollama.

In [7]:
# LLM
from langchain_community.chat_models import ChatOllama
from fastapi.responses import StreamingResponse


llm = ChatOllama(model="qwen3:8b", base_url="http://localhost:11434", streaming=True)

  llm = ChatOllama(model="qwen3:8b", base_url="http://localhost:11434", streaming=True)


A basic LangChain combining a prompt to an LLM. Retrieval is performed above (separately). 

In [None]:
# Chain
chain = prompt | llm

# Run
chain.invoke(
    {
        "context": docs,
        "question": "What are some examples of anisotropic self-assembly?",
    }
)

Sample RAG prompts can also be imported from LangChain hub. 

In [None]:
# Sample RAG prompt from LangChain Hub
from langchain import hub

prompt_hub_rag = hub.pull("rlm/rag-prompt")
prompt_hub_rag

Chain combining Retrieval (happening internally now) and the generation. 

In [8]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()  # Parse the LLM output into a string
)

rag_chain.invoke("What are colloidal particles?")

  return forward_call(*args, **kwargs)


'<think>\nOkay, let\'s try to figure out what the answer is. The user is asking, "What are colloidal particles?" and they want me to answer based only on the provided context.\n\nFirst, I need to look through the given documents to find relevant information. Let me start by scanning each document\'s page content.\n\nThe first document mentions colloidal particles in the context of their interactions. It talks about particles coated with polymers, like spherical silica particles. It also discusses hydrodynamic interactions and direct interactions between colloidal particles. However, it doesn\'t give a clear definition of what colloidal particles are, just their properties and interactions.\n\nThe second document is about colloidal self-assembly and mentions colloidal particles in various contexts, like Janus particles and their behavior under light. Again, it\'s more about their interactions and applications rather than defining them.\n\nThe third document refers to colloidal particles

Text output streaming. 

In [None]:
question = ""

response = ""
for chunk in rag_chain.stream(question):
    text = chunk.content if hasattr(chunk, "content") else str(chunk)
    print(text, end="", flush=True)  # ✅ Print as it streams
    response += text

Basic gradio interface implementation.

In [None]:
import gradio as gr


def rag_qa(user_question):
    try:
        answer = rag_chain.invoke(user_question)
        return answer
    except Exception as e:
        return f"❌ Error: {str(e)}"


demo = gr.Interface(
    fn=rag_qa,
    inputs=gr.Textbox(label="Ask a question about your PDFs"),
    outputs=gr.Textbox(label="LLM Answer"),
    title="📄 Scientific PDF Chatbot",
    description="Ask any question based on your local documents. Powered by RAG + Qwen3:8B",
)

demo.launch(share=True, inline=False, inbrowser=False)

Complicated prompt for continued chats. 

In [None]:
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.prompts.chat import HumanMessagePromptTemplate

messages = [
    MessagesPlaceholder(variable_name="history"),
    SystemMessage(
        content="You are a helpful scientific assistant. Answer using only the context provided."
    ),
    HumanMessagePromptTemplate.from_template(
        "Context:\n{context}\n\nQuestion: {question}"
    ),
]

prompt = ChatPromptTemplate.from_messages(messages)

In [47]:
from langchain_core.runnables import RunnableLambda, RunnableMap


# Update the RAG chain
rag_chain = (
    {
        "history": RunnableLambda(lambda x: (x["history"])),
        "context": RunnableLambda(
            lambda x: retriever.get_relevant_documents(x["question"])
        ),
        "question": RunnableLambda(lambda x: (x["question"])),
    }
    | prompt
    | llm
    | StrOutputParser()
)


response = rag_chain.invoke(
    {
        "question": "What are colloidal particles?",
        "history": [],  # or previous conversation as LangChain messages
    }
)

  return forward_call(*args, **kwargs)


In [48]:
response

'<think>\nOkay, the user is asking, "What are colloidal particles?" Let me check the provided context to find the answer.\n\nLooking at the first document, there\'s a mention of colloidal particles in the context of dynamics. It says, "colloidal particles are large in comparison to the size of the solvent molecules," which implies they\'re larger than solvent molecules but still small enough to be suspended. The text also talks about hydrodynamic interactions, which are interactions mediated through the solvent. \n\nIn the second document, there\'s a reference to colloidal particles in the context of self-assembly, mentioning Janus particles and their behavior. The third document discusses colloidosomes and structures formed with colloidal particles, like micelles and emulsion droplets. \n\nPutting this together, colloidal particles are small particles suspended in a fluid, larger than molecules but small enough to remain dispersed. They interact via direct potential interactions and h

In [64]:
import gradio as gr

import re


def split_think_and_answer(response):
    """Extracts text after </think>."""
    match = re.search(r"</think>\s*(.*)", response, re.DOTALL)
    if match:
        return match.group(1).strip()
    return response.strip()


def get_thought(response):
    """Extracts text inside <think>...</think>."""
    match = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None


def rag_qa(message, history):
    # Convert Gradio history format to LangChain message format
    history_langchain_format = []
    print("Gradio history:", history)
    # Convert Gradio history (list of {"role": ..., "content": ...}) to LangChain format
    for turn in history:
        if turn["role"] == "user":
            history_langchain_format.append(HumanMessage(content=turn["content"]))
        elif turn["role"] == "assistant":
            history_langchain_format.append(AIMessage(content=turn["content"]))

    history_langchain_format.append(HumanMessage(content=message))

    try:
        # Pass both current message and history to the chain
        response = rag_chain.invoke(
            {"history": history_langchain_format, "question": message}
        )
        thought = get_thought(response)
        final_answer = split_think_and_answer(response)

        if thought:
            # Add collapsible section with the <think> content
            final_answer = f"<details><summary><b>🤔 Thinking</b></summary><pre>{thought}</pre></details>\n\n{final_answer}"
        return {"role": "assistant", "content": final_answer}
    except Exception as e:
        return {"role": "assistant", "content": f"❌ Error: {str(e)}"}


# Create and launch the chat interface with memory
demo = gr.ChatInterface(
    fn=rag_qa,
    type="messages",
    title="📄 Scientific PDF Chatbot",
    description="Ask questions about your scientific PDFs. Powered by RAG + Qwen3:8B",
    examples=["What are colloidal particles?", "Tell me more about that"],
)

demo.launch(share=True, inline=False, inbrowser=False)

* Running on local URL:  http://127.0.0.1:7877
* Running on public URL: https://0cd97ed4c2e84c7b11.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Gradio history: []


  return forward_call(*args, **kwargs)


Gradio history: [{'role': 'user', 'metadata': None, 'content': 'List some works by Sabine Klapp', 'options': None}, {'role': 'assistant', 'metadata': None, 'content': "<details><summary><b>🤔 Thinking</b></summary><pre>Okay, let's see. The user is asking for works by Sabine Klapp. I need to check the provided context to find any mentions of her. The context includes several documents. Let me go through each one.\n\nFirst document: It's about the Fokker-Planck equation, authored by someone else, no mention of Sabine Klapp. Second document is from Bungartz, discussing molecular dynamics simulation. The third document is another entry from Bungartz, same as the second. None of these documents mention Sabine Klapp. \n\nWait, maybe I missed something. Let me check again. The first document's page content lists the book title and publishers, but no author. The other two are about algorithms and molecular dynamics, authored by Bungartz. There's no mention of Sabine Klapp in any of the metadata

  return forward_call(*args, **kwargs)


Gradio history: [{'role': 'user', 'metadata': None, 'content': 'List some works by Sabine Klapp', 'options': None}, {'role': 'assistant', 'metadata': None, 'content': "<details><summary><b>🤔 Thinking</b></summary><pre>Okay, let's see. The user is asking for works by Sabine Klapp. I need to check the provided context to find any mentions of her. The context includes several documents. Let me go through each one.\n\nFirst document: It's about the Fokker-Planck equation, authored by someone else, no mention of Sabine Klapp. Second document is from Bungartz, discussing molecular dynamics simulation. The third document is another entry from Bungartz, same as the second. None of these documents mention Sabine Klapp. \n\nWait, maybe I missed something. Let me check again. The first document's page content lists the book title and publishers, but no author. The other two are about algorithms and molecular dynamics, authored by Bungartz. There's no mention of Sabine Klapp in any of the metadata

  return forward_call(*args, **kwargs)


## MultiQuery Generating prompts 

Prompt template for retrieval multi-query generation from the users question. 

In [None]:
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""