# File QA RAG Chatbot App with ChatGPT, LangChain and Streamlit

Here we will implement an advanced RAG System with ChatGPT, LangChain and Streamlit to build a File QA UI-based chatbot with the following features:

PDF Document Upload and Indexing

RAG System for query analysis and response

Result streaming capabilities (Real-time output)

Show document sources of the answer from RAG system

Install Dependencies

In [None]:
!pip install langchain==0.1.12
!pip install langchain-openai==0.0.8
!pip install langchain-community==0.0.29
!pip install streamlit==1.32.2
!pip install PyMuPDF==1.24.0
!pip install chromadb==0.4.24
!pip install pyngrok==7.1.5

Collecting langchain==0.1.12
  Downloading langchain-0.1.12-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.1.12)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-community<0.1,>=0.0.28 (from langchain==0.1.12)
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2.0,>=0.1.31 (from langchain==0.1.12)
  Downloading langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain==0.1.12)
  Downloading langchain_text_splitters-0.0.2-py3-none-any.whl.metadata (2.2 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.1.12)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting tenacity<9.0.0,>=8.1.0 (from langchain==0.1.12)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain==

Collecting PyMuPDF==1.24.0
  Downloading PyMuPDF-1.24.0-cp311-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.0 (from PyMuPDF==1.24.0)
  Downloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.0-cp311-none-manylinux2014_x86_64.whl (3.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.0 PyMuPDFb-1.24.0
Collecting chromadb==0.4.24
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb==0.4.24)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting

Collecting pyngrok==7.1.5
  Downloading pyngrok-7.1.5-py3-none-any.whl.metadata (7.6 kB)
Downloading pyngrok-7.1.5-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.5


# Load OpenAI API Credentials

Here we read the key from a hidden interactive prompt (instead of hard-coding it in the notebook) so we don't expose the credentials on the internet by mistake

In [None]:
from getpass import getpass

# Prompt for the key without echoing it, so it is never shown in the cell output
OPENAI_KEY = getpass('Enter Open AI API Key: ')

Enter Open AI API Key: ··········


# Set Environment Variable

In [None]:
import os

# Publish the key through the process environment; the app below builds
# OpenAIEmbeddings() and ChatOpenAI(...) without passing a key explicitly.
# NOTE(review): presumably those clients read OPENAI_API_KEY - confirm.
os.environ['OPENAI_API_KEY'] = OPENAI_KEY

# Write App Code Header

In [None]:
%%writefile app.py
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain.callbacks.base import BaseCallbackHandler
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter # Changed to CharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
from operator import itemgetter


import streamlit as st
import tempfile
import os
import pandas as pd

# Customize initial app landing page:
# page title/icon configuration, the main heading, and the sidebar heading
st.set_page_config(page_title="Dream Journal QA Chatbot", page_icon="🌙")
st.title("Dream Journal QA Chatbot 🌙")
st.sidebar.header("Document Upload and Dream Analysis")

@st.cache_resource(ttl="1h")  # keep the built retriever/chunks cached for 1 hour
def configure_retriever(uploaded_files):
    """Index the uploaded PDFs and return a retriever plus the raw chunks.

    Each uploaded file is written to a temporary directory, parsed with
    PyMuPDFLoader, split into chunks on the "/end" marker, embedded with
    OpenAIEmbeddings and stored in a Chroma vector store.

    To index files that already exist on disk instead of uploads, replace the
    upload loop below with a loop over the paths in that directory.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing ``.name``
            and ``.getvalue()`` (as used below).

    Returns:
        A ``(retriever, doc_chunks)`` tuple: the vector-store retriever and
        the list of document chunks that were indexed.
    """
    # Read documents
    docs = []
    # The context manager removes the temporary copies as soon as every PDF
    # has been parsed, instead of leaving cleanup to garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files:
            # basename() drops any directory components from the
            # client-supplied name so the copy cannot land outside temp_dir.
            temp_filepath = os.path.join(temp_dir, os.path.basename(file.name))
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            loader = PyMuPDFLoader(temp_filepath)
            docs.extend(loader.load())

    # Split into document chunks
    text_splitter = CharacterTextSplitter(
        separator="/end",  # Split specifically by /end marker
        chunk_overlap=200,
        keep_separator=True,  # Keep the separator in the chunks
        chunk_size=2000  # Chunk size limit, but /end is primary separator
    )
    doc_chunks = text_splitter.split_documents(docs)

    # Create document embeddings and store in Vector DB
    embeddings_model = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(doc_chunks, embeddings_model)

    # Define retriever object
    retriever = vectordb.as_retriever()
    return retriever, doc_chunks  # Return both retriever and doc_chunks


# Streams LLM output into the UI: every new token is appended to the text
# accumulated so far and the container is re-rendered as Markdown
class StreamHandler(BaseCallbackHandler):
    """Callback handler that renders tokens into a container as they arrive."""

    def __init__(self, container, initial_text=""):
        # Text accumulated so far, starting from the optional initial text
        self.text = initial_text
        # Object whose .markdown() method is called with the running text
        self.container = container

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append *token* to the running text and redraw the container."""
        self.text = self.text + token
        self.container.markdown(self.text)

# Creates UI element to accept PDF uploads (sidebar, multiple files allowed)
uploaded_files = st.sidebar.file_uploader(
    label="Upload Dream Journal PDF files", type=["pdf"],
    accept_multiple_files=True
)

# Nothing can be indexed without at least one upload: show a hint and stop here
if not uploaded_files:
    st.info("Please upload your Dream Journal PDF documents to continue.")
    st.stop()


# Create retriever object based on uploaded PDFs
# doc_chunks is kept as well because the count and full-dream handlers receive it
retriever, doc_chunks = configure_retriever(uploaded_files) # Get both retriever and doc_chunks

# Load a connection to ChatGPT LLM
# This single model instance is shared by every chain and helper below
chatgpt = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.1,
                    streaming=True)

# --- Question Type Classifier Chain ---
# Asks the model to label a question with one of three routing labels:
# 'count_question', 'full_dream_question' or 'qa_question'
question_type_prompt_template = """
Determine if the following question is asking for a count of something, asking for the full text of a dream, or a general question that requires a detailed answer from a document.

Question: {question}

Respond with 'count_question' if the question is asking for a count.
Respond with 'full_dream_question' if the question is asking for the full text of a dream, like "full dream", "return dream", "what is the full dream about".
Respond with 'qa_question' if it's a general question requiring a detailed answer.

Just answer 'count_question', 'full_dream_question', or 'qa_question'.
"""
question_type_prompt = ChatPromptTemplate.from_template(question_type_prompt_template)

# Prompt piped into the chat model; invoked with {"question": ...}
question_type_chain = question_type_prompt | chatgpt


# --- Topic Extraction Chain for Count Questions ---
# Asks the model for the bare topic to count, or 'unknown_topic' when unclear
topic_extraction_prompt_template = """
User question: {question}

Identify the topic that the user wants to count in their dream journal from the question above.
Even if the question includes negation (like 'not about X'), identify 'X' as the topic.
Just return the core topic, do not include any extra words or explanations, and do not include the negation words.
If the question is too vague or it's not clear what to count, respond with 'unknown_topic'.
"""
topic_extraction_prompt = ChatPromptTemplate.from_template(topic_extraction_prompt_template)
topic_extraction_chain = topic_extraction_prompt | chatgpt


# --- QA RAG Chain prompt ---
# Template with two placeholders: {context} (retrieved text) and {question}
qa_template = """
Use only the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know,
don't try to make up an answer. Keep the answer as concise as possible.

{context}

Question: {question}
"""
qa_prompt = ChatPromptTemplate.from_template(qa_template)


# Turns the retrieved documents into one context string for the LLM prompt
def format_docs(docs):
    """Return the page_content of every document, separated by blank lines."""
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)


# Create a QA RAG System Chain
# Inputs fed to the prompt, both derived from the incoming {"question": ...}:
#   context  - the question is sent to the retriever and the hits are
#              flattened into one string by format_docs
#   question - the user question, passed through unchanged
_qa_chain_inputs = {
    "context": itemgetter("question") | retriever | format_docs,
    "question": itemgetter("question"),
}

# Filled-in prompt is sent to the LLM for the response
qa_rag_chain = _qa_chain_inputs | qa_prompt | chatgpt


# --- Counting Dreams by Topic Function ---
def count_dreams_by_topic(document_chunks, topic, negate=False):
    """Count the chunks the LLM judges to be (or not to be) about *topic*.

    Every chunk is sent to the LLM in its own yes/no prompt, so the number of
    LLM calls equals the number of chunks.

    Args:
        document_chunks: Iterable of chunks exposing ``page_content`` and
            ``metadata``.
        topic: Topic text inserted into the prompt.
        negate: When True, count chunks that are NOT about the topic.

    Returns:
        ``(dream_count, source_documents_count)`` where the second item is a
        list of dicts with ``source``, ``page`` and a ``content`` snippet for
        every chunk that was counted.
    """
    dream_count = 0
    source_documents_count = []  # Source rows for the chunks that matched
    with st.spinner(f"Analyzing dreams for '{topic}'..."):
        for chunk in document_chunks:
            if negate:
                prompt = f"""Analyze the following dream journal entry and determine if it is NOT about a dream that includes the topic: '{topic}'.
                Respond with 'yes' if the dream is NOT about '{topic}', and 'no' if it IS about it. Just answer 'yes' or 'no'.

                Dream Entry:
                {chunk.page_content}
                """
            else:
                prompt = f"""Analyze the following dream journal entry and determine if it is about a dream that includes the topic: '{topic}'.
                Respond with 'yes' if the dream is about '{topic}', and 'no' if it is not. Just answer 'yes' or 'no'.

                Dream Entry:
                {chunk.page_content}
                """
            print(f"\n--- Analyzing Dream Chunk for topic '{topic}', Negate: {negate} ---") # Debug print
            print(f"Prompt sent to LLM:\n{prompt}") # Debug print of the prompt
            response = chatgpt.invoke(prompt)
            # Normalise the reply: models often answer "Yes." or "'yes'", which
            # a bare equality check would silently count as "no".
            llm_response_content = response.content.strip().lower().strip("'\"`.,! ")
            print(f"LLM Response (stripped, lowercased): {llm_response_content}") # Debug print of LLM response
            if llm_response_content != "yes":
                continue

            dream_count += 1
            metadata = chunk.metadata
            content = chunk.page_content
            # Only mark the snippet as truncated when text was actually cut
            snippet = content[:200] + ("..." if len(content) > 200 else "")
            source_documents_count.append({
                # .get() keeps counting working for chunks lacking these keys
                "source": metadata.get("source"),
                "page": metadata.get("page"),
                "content": snippet
            })
    return dream_count, source_documents_count

# --- Function to display source documents table ---
def display_source_documents_table(source_documents):
    """Render the given source rows as a table; do nothing when there are none."""
    if not source_documents:
        return
    st.markdown("__Dreams related to the topic (Sources):__" + "\n")
    sources_frame = pd.DataFrame(source_documents)
    st.dataframe(data=sources_frame, width=1000)


# --- Function to retrieve and display full dream text ---
def retrieve_full_dream_text(user_question, document_chunks, retriever):
    """Return the text of the chunks retrieved for *user_question*.

    Retrieved chunks are grouped by their ``(source, page)`` metadata. Groups
    appear in the order their first chunk was retrieved, chunks keep their
    retrieval order inside a group, and everything is joined with blank lines.

    Args:
        user_question: Query passed to ``retriever.get_relevant_documents``.
        document_chunks: Unused; kept so existing callers keep working.
        retriever: Object exposing ``get_relevant_documents(query)``.

    Returns:
        The combined text, or a fixed "not found" message when nothing was
        retrieved.
    """
    relevant_docs = retriever.get_relevant_documents(user_question)
    if not relevant_docs:
        return "No dream found related to your question."

    # dicts preserve insertion order, so groups stay in retrieval order.
    # Chunk order within a page is not available from the metadata, so the
    # retrieval order is kept as-is inside each group.
    dream_pages = {}
    for doc in relevant_docs:
        key = (doc.metadata.get("source"), doc.metadata.get("page"))
        dream_pages.setdefault(key, []).append(doc.page_content)

    # Separate the groups too; previously text from different pages was glued
    # together with no whitespace in between.
    page_texts = ["\n\n".join(parts) for parts in dream_pages.values()]
    return "\n\n".join(page_texts).strip()


# Conversation history, stored under the "langchain_messages" key
streamlit_msg_history = StreamlitChatMessageHistory(key="langchain_messages")


# Seed an empty history with the greeting shown when the app starts
if not streamlit_msg_history.messages:
    streamlit_msg_history.add_ai_message("Please ask questions about your dream journal, including count-related questions, general questions, or requests to retrieve the full text of a dream.")


# Replay every stored message as a chat bubble of the matching type
for past_message in streamlit_msg_history.messages:
    st.chat_message(past_message.type).write(past_message.content)

# Callback handler for QA questions: remembers which documents the retriever
# returned and, once the LLM has finished, shows the top 3 as a sources table
class PostMessageHandler(BaseCallbackHandler):
    """Collect retrieved sources and display them after the LLM response."""

    def __init__(self, msg: st.write):
        super().__init__()
        # Stored on the instance; not read by any method of this class
        self.msg = msg
        # One dict per unique (source, page), in retrieval order
        self.sources = []

    def on_retriever_end(self, documents, *, run_id, parent_run_id, **kwargs):
        """Record one entry per distinct (source, page) among *documents*."""
        seen_keys = []
        for document in documents:
            record = {
                "source": document.metadata["source"],
                "page": document.metadata["page"],
                "content": document.page_content[:200]
            }
            key = (record["source"], record["page"])
            if key in seen_keys:
                continue  # this page is already listed
            seen_keys.append(key)
            self.sources.append(record)

    def on_llm_end(self, response, *, run_id, parent_run_id, **kwargs):
        """Show the first three recorded sources, if any were recorded."""
        if not self.sources:
            return
        st.markdown("__Sources:__" + "\n")
        top_sources = pd.DataFrame(self.sources[:3])  # Top 3 sources
        st.dataframe(data=top_sources, width=1000)


# Handle user input: classify the question, route it to the matching handler,
# show the reply, then store the exchange in the chat history
if user_prompt := st.chat_input():
    st.chat_message("human").write(user_prompt)

    # --- Classify Question Type ---
    question_type_response = question_type_chain.invoke({"question": user_prompt})
    # The model may wrap the label in quotes, capitalise it or add a period;
    # normalise so such replies still route to the right branch.
    question_type = question_type_response.content.strip().strip("'\"`. ").lower()

    if question_type == "count_question":
        # --- Handle Count Question ---
        topic_extraction_response = topic_extraction_chain.invoke({"question": user_prompt})
        topic_to_count = topic_extraction_response.content.strip().strip("'\"`")

        print(f"User prompt: {user_prompt}") # Debug print
        print(f"Extracted topic from LLM: {topic_to_count}") # Debug print

        if topic_to_count.lower() == "unknown_topic":
            ai_reply = "Sorry, I couldn't understand what topic you want to count. Please be more specific."
            st.warning(ai_reply)

        else:
            negation_words = {"not", "no", "without", "excluding"} # Extend as needed
            # Compare whole words only: a substring test treats "now", "know",
            # "snow" or "another" as negations because they contain "no"/"not".
            prompt_words = "".join(
                ch if ch.isalnum() else " " for ch in user_prompt.lower()
            ).split()
            negated_question = any(word in negation_words for word in prompt_words)

            dream_count, source_documents_count = count_dreams_by_topic(doc_chunks, topic_to_count, negate=negated_question) # Pass negate flag
            if negated_question:
                ai_reply = f"I have analyzed your dream journal and found **{dream_count} dreams that are NOT about '{topic_to_count}'**."
            else:
                ai_reply = f"I have analyzed your dream journal and found **{dream_count} dreams about '{topic_to_count}'**."

            st.chat_message("ai").write(ai_reply)
            display_source_documents_table(source_documents_count) # Display sources for count questions

    elif question_type == "qa_question":
        # --- Handle QA Question (Normal RAG flow) ---
        with st.chat_message("ai"):
            # Initializing an empty data stream
            stream_handler = StreamHandler(st.empty())
            # UI element to write RAG sources after LLM response
            # NOTE(review): st.write presumably returns None, so the handler
            # gets no real container here; it does not use it - confirm.
            sources_container = st.write("")
            pm_handler = PostMessageHandler(sources_container)
            config = {"callbacks": [stream_handler, pm_handler]}
            # Get LLM response
            response = qa_rag_chain.invoke({"question": user_prompt}, config)

        ai_reply = response.content # Store AI message without sources in history

    elif question_type == "full_dream_question":
        # --- Handle Full Dream Retrieval ---
        with st.chat_message("ai"):
            full_dream_text = retrieve_full_dream_text(user_prompt, doc_chunks, retriever)
            # Keep only the text before the first '/endDream' marker
            cleaned_dream_text = full_dream_text.split('/endDream')[0].strip()
            st.markdown(cleaned_dream_text) # Display full dream text

        ai_reply = cleaned_dream_text

    else:
        ai_reply = "Sorry, I could not understand the type of question. Please rephrase."
        st.chat_message("ai").write(ai_reply)

    # Store the exchange in one place and always question-first; the count
    # branch used to store the AI reply before the user message, so the
    # history replayed out of order.
    streamlit_msg_history.add_user_message(user_prompt)
    streamlit_msg_history.add_ai_message(ai_reply)

Overwriting app.py


# Starting the Streamlit App

In [None]:
!streamlit run app.py --server.port=8989 &>/./logs.txt &

# Setting Up ngrok Tunnel

In [None]:
from getpass import getpass

# Prompt for the ngrok token without echoing it into the cell output
ngrok_auth_token = getpass('Enter ngrok API Key: ')

In [None]:
from pyngrok import ngrok
import yaml

# Terminate open tunnels if exist
ngrok.kill()

# Authenticate ngrok with the token entered at the getpass prompt above
!ngrok config add-authtoken {ngrok_auth_token}

# Open an HTTPS tunnel to port 8989, the port passed to `streamlit run` via --server.port (the app's startup output is redirected to the logs.txt file)
ngrok_tunnel = ngrok.connect(8989)
print("Streamlit App:", ngrok_tunnel.public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit App: https://15c9-34-125-59-185.ngrok-free.app
