# File QA RAG Chatbot App with ChatGPT, LangChain and Streamlit

Here we will implement an advanced RAG System with ChatGPT, LangChain and Streamlit to build a File QA UI-based chatbot with the following features:

PDF Document Upload and Indexing

RAG System for query analysis and response

Result streaming capabilities (Real-time output)

Show document sources of the answer from RAG system

Install Dependencies

In [None]:
!pip install langchain==0.1.12
!pip install langchain-openai==0.0.8
!pip install langchain-community==0.0.29
!pip install streamlit==1.32.2
!pip install PyMuPDF==1.24.0
!pip install chromadb==0.4.24
!pip install pyngrok==7.1.5

!pip install PyPDF2 spacy pandas matplotlib wordcloud
!python -m spacy download en_core_web_sm

# Load OpenAI API Credentials

Here we enter the key through a hidden prompt (`getpass`) so we don't expose the credentials on the internet by mistake

In [None]:
from getpass import getpass

# Read the key from an interactive getpass prompt instead of hard-coding it in a cell
OPENAI_KEY = getpass('Enter Open AI API Key: ')

# Set Environment Variable

In [3]:
import os

# Export the key as an environment variable; app.py builds its OpenAI clients
# without passing a key explicitly (presumably they read it from here)
os.environ['OPENAI_API_KEY'] = OPENAI_KEY

# Write App Code Header

In [None]:
%%writefile app.py
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain.callbacks.base import BaseCallbackHandler
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter  # Changed to CharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
from operator import itemgetter
from os.path import basename

import streamlit as st
import tempfile
import os
import pandas as pd
from io import BytesIO
import spacy
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

# Customize initial app landing page: browser tab title/icon, the main
# heading, and the heading of the sidebar that hosts the upload widget
st.set_page_config(page_title="Dream Journal QA Chatbot", page_icon="🌙")
st.title("Dream Journal QA Chatbot 🌙")
st.sidebar.header("Document Upload and Dream Analysis")

@st.cache_resource(ttl="1h")  # Stores uploaded documents for 1h in cache
def configure_retriever(uploaded_files):
    """
    Index the uploaded PDFs and return a retriever over them.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``name``
            and ``getvalue()`` (as returned by the sidebar file uploader).

    Returns:
        A ``(retriever, doc_chunks)`` tuple: the vector-store retriever and
        the list of document chunks that were embedded.
    """
    # Read documents
    docs = []
    # The loader is given a file path, so every upload is first written to a
    # temporary directory. The context manager removes that directory as soon
    # as all files have been loaded instead of relying on garbage collection.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files:
            # basename() keeps a crafted upload name from escaping temp_dir
            temp_filepath = os.path.join(temp_dir, basename(file.name))
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            loader = PyMuPDFLoader(temp_filepath)
            docs.extend(loader.load())

    # Split into document chunks using a custom separator
    # NOTE(review): CharacterTextSplitter is expected to merge neighbouring
    # pieces up to chunk_size - confirm that short dreams are not combined
    # into a single chunk.
    text_splitter = CharacterTextSplitter(
        separator="/end",  # Split specifically by /end marker
        chunk_overlap=200,
        keep_separator=True,  # Keep the separator in the chunks
        chunk_size=2000      # Chunk size limit, but /end is primary separator
    )
    doc_chunks = text_splitter.split_documents(docs)

    # Create document embeddings and store in a Vector DB
    embeddings_model = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(doc_chunks, embeddings_model)

    # Define retriever object
    retriever = vectordb.as_retriever()
    return retriever, doc_chunks  # Return both retriever and doc_chunks

# Manages live updates to a Streamlit app's display by appending new text tokens
class StreamHandler(BaseCallbackHandler):
    """Callback handler that re-renders the accumulated LLM output on every new token."""

    def __init__(self, container, initial_text=""):
        """
        Args:
            container: Object with a ``markdown(text)`` method that receives
                the text accumulated so far.
            initial_text: Text shown before the first token arrives.
        """
        # Initialise the base handler, matching PostMessageHandler
        super().__init__()
        self.container = container
        self.text = initial_text

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the running text and redraw the container."""
        self.text += token
        self.container.markdown(self.text)

# Create UI element to accept PDF uploads (sidebar widget, several files allowed)
uploaded_files = st.sidebar.file_uploader(
    label="Upload Dream Journal PDF files", type=["pdf"],
    accept_multiple_files=True
)

# Everything below needs documents: show a hint and end this script run
# until at least one file has been uploaded
if not uploaded_files:
    st.info("Please upload your Dream Journal PDF documents to continue.")
    st.stop()

# Create retriever object based on uploaded PDFs
retriever, doc_chunks = configure_retriever(uploaded_files)  # Get both retriever and doc_chunks

# Load a connection to ChatGPT LLM; this single client is shared by the
# classifier, topic-extraction, counting and QA steps defined below
chatgpt = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.1, streaming=True)

# --- Question Type Classifier Chain ---
# Routes each user question to one of four handlers. The model is asked to
# reply with exactly one of the four labels; the conversation code compares
# the reply against those label strings.
question_type_prompt_template = """
Determine the type of question being asked. Choose from the following categories: count_question, full_dream_question, analysis_question, or qa_question.

Question: {question}

Respond with 'count_question' if the question asks for a count of dreams related to a specific topic (e.g., "how many dreams about cats?").
Respond with 'full_dream_question' if the question asks for the complete text of a dream (e.g., "show me the full dream", "what is the dream about?").
Respond with 'analysis_question' if the question is a general request to analyze ALL dreams in the journal and provide an overall analysis or visualization, like "analyze my dreams", "dream analysis", "plot dream words", "show me a word frequency plot of my dreams". This is for a holistic analysis of the entire dream journal.
Respond with 'qa_question' if the question is a general question asking for specific information or details ABOUT the content of the dreams, requiring a detailed answer from the dream journal (e.g., "what happens in dreams about cats?", "tell me about dreams involving dogs"). This is for question-answering about dream content.

Just answer 'count_question', 'full_dream_question', 'analysis_question', or 'qa_question'.
"""
question_type_prompt = ChatPromptTemplate.from_template(question_type_prompt_template)
# Prompt piped into the shared chat model; invoked with {"question": ...}
question_type_chain = question_type_prompt | chatgpt

# --- Topic Extraction Chain for Count Questions ---
# Pulls the bare topic out of a count question. Negation words are
# deliberately excluded from the topic; the sentinel 'unknown_topic' signals
# that nothing countable was found.
topic_extraction_prompt_template = """
User question: {question}

Identify the topic that the user wants to count in their dream journal from the question above.
Even if the question includes negation (like 'not about X'), identify 'X' as the topic.
Just return the core topic, do not include any extra words or explanations, and do not include the negation words.
If the question is too vague or it's not clear what to count, respond with 'unknown_topic'.
"""
topic_extraction_prompt = ChatPromptTemplate.from_template(topic_extraction_prompt_template)
# Prompt piped into the shared chat model; invoked with {"question": ...}
topic_extraction_chain = topic_extraction_prompt | chatgpt

# --- QA RAG Chain ---
# Answer-generation prompt: {context} receives the retrieved chunk text and
# {question} the user's question. The model is told to rely on the context
# only and to admit when it does not know.
qa_template = """
Use only the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know,
don't try to make up an answer. Keep the answer as concise as possible.

{context}

Question: {question}
"""
qa_prompt = ChatPromptTemplate.from_template(qa_template)

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

# Retrieval-augmented QA pipeline, invoked with {"question": ...}:
#   1. "context": the question is sent to the retriever and the returned
#      documents are flattened into one string by format_docs
#   2. "question": the question is passed through unchanged
#   3. both values fill qa_prompt, which is then sent to the chat model
qa_rag_chain = (
    {
        "context": itemgetter("question") | retriever | format_docs,
        "question": itemgetter("question")
    }
    | qa_prompt
    | chatgpt
)

# --- Helper Function: Generate Topic Variations ---
def get_topic_variations(topic, nlp_model):
    """
    Build the surface forms used to match a topic in dream text.

    The topic is lower-cased and stripped, then run through ``nlp_model``.
    For every token the lemma is added together with a naive plural or
    singular counterpart (a trailing "s" is removed if present, appended
    otherwise). The normalised topic itself is always included.

    Args:
        topic: Topic string extracted from the user's question.
        nlp_model: Callable that turns text into an iterable of tokens
            exposing a ``lemma_`` attribute (e.g. a loaded spaCy pipeline).

    Returns:
        Alphabetically sorted list of unique, non-empty variations. Sorting
        keeps the text built from this list identical from run to run.
    """
    topic = topic.lower().strip()
    # Always include the original topic as provided
    variations = {topic}
    for token in nlp_model(topic):
        lemma = token.lemma_
        if not lemma.strip():
            continue  # whitespace-only tokens carry no topic information
        # Add the lemma (base form)
        variations.add(lemma)
        # Add plural/singular variations:
        if lemma.endswith("s"):
            variations.add(lemma[:-1])
        else:
            variations.add(lemma + "s")
    # An empty topic, or the lemma "s" with its "s" removed, yields ""
    variations.discard("")
    return sorted(variations)

# --- Updated Counting Dreams by Topic Function ---
def count_dreams_by_topic(document_chunks, topic, negate=False):
    """
    Count the pages whose dream text the LLM judges to match a topic.

    Each chunk is sent to the chat model with a yes/no prompt listing the
    topic variations from ``get_topic_variations``. For ``negate=True`` the
    prompt asks whether the dream is NOT about any of them. A page
    (file name + page number) is counted at most once: after a "yes" for one
    of its chunks, the remaining chunks of that page are skipped.

    Uses the module-level ``nlp_model`` and ``chatgpt`` objects and shows a
    Streamlit spinner while running.

    Args:
        document_chunks: Chunks exposing ``metadata`` (dict) and ``page_content``.
        topic: Topic to look for.
        negate: When True, count pages that are NOT about the topic.

    Returns:
        ``(dream_count, source_documents_count)`` where the second item is a
        list of ``{"source", "page", "content"}`` dicts, one per counted page.
    """
    dream_count = 0
    source_documents_count = []  # To store source document info for count questions
    counted_pages = set()        # To avoid counting the same page twice

    # Generate topic variations using spaCy
    topic_variations = get_topic_variations(topic, nlp_model)
    variations_text = ", ".join(topic_variations)

    with st.spinner(f"Analyzing dreams for '{topic}'..."):
        for chunk in document_chunks:
            source = chunk.metadata.get("source", "unknown")
            page = chunk.metadata.get("page", "unknown")
            page_id = (basename(source), page)
            if page_id in counted_pages:
                continue  # Skip if this page has already been counted

            if negate:
                # For negated queries, use simpler wording
                prompt = f"""Analyze the following dream journal entry and determine if it is NOT about a dream that includes any of the following topics: {variations_text}.
Respond with 'yes' if the dream is NOT about any of these topics, and 'no' if it is about one or more of them. Just answer 'yes' or 'no'.

Dream Entry:
{chunk.page_content}
"""
            else:
                prompt = f"""Analyze the following dream journal entry and determine if it is about a dream that includes any of the following topics: {variations_text}.
Respond with 'yes' if the dream is about one or more of these topics, and 'no' if it is not. Just answer 'yes' or 'no'.

Dream Entry:
{chunk.page_content}
"""
            response = chatgpt.invoke(prompt)
            # Judge by the first word only, ignoring case and punctuation, so
            # replies such as "Yes." or "yes, it is" still count as a match
            reply_words = response.content.strip().lower().split()
            verdict = reply_words[0].strip(".,!:;'\"") if reply_words else ""
            if verdict == "yes":
                dream_count += 1
                counted_pages.add(page_id)
                source_documents_count.append({
                    "source": basename(source),
                    "page": page,
                    "content": chunk.page_content[:200] + "..."
                })
    return dream_count, source_documents_count

# --- Function to display source documents table ---
def display_source_documents_table(source_documents):
    """Render the matched source entries as a table; do nothing when there are none."""
    if not source_documents:
        return
    st.markdown("__Dreams related to the topic (Sources):__" + "\n")
    sources_frame = pd.DataFrame(source_documents)
    st.dataframe(data=sources_frame, width=1000)

# --- Function to retrieve and display full dream text ---
def retrieve_full_dream_text(user_question, document_chunks, retriever):
    """
    Return the text of the retrieved chunks most relevant to a question.

    Retrieved chunks are grouped by (file name, page) in the order the
    retriever returned them. All chunk texts are joined with blank lines,
    both within a page and between pages.

    Args:
        user_question: Question passed to the retriever.
        document_chunks: Not used; kept so existing callers keep working.
        retriever: Object exposing ``get_relevant_documents(question)``.

    Returns:
        The joined, stripped text, or a fixed "not found" message when the
        retriever returns nothing.
    """
    relevant_docs = retriever.get_relevant_documents(user_question)
    if not relevant_docs:
        return "No dream found related to your question."

    # Group chunk texts by source and page, preserving retrieval order
    dream_pages = {}
    for doc in relevant_docs:
        key = (
            basename(doc.metadata.get("source", "unknown")),
            doc.metadata.get("page", "unknown"),
        )
        dream_pages.setdefault(key, []).append(doc.page_content)

    # Separate pages with a blank line too; previously the text of one page
    # ran straight into the next
    page_texts = ["\n\n".join(parts) for parts in dream_pages.values()]
    return "\n\n".join(page_texts).strip()

# --- NLP Analysis and Plotting Functions ---
@st.cache_resource
def load_spacy_model():
    """Load the ``en_core_web_sm`` spaCy pipeline, downloading it first if loading fails with OSError."""
    model_name = "en_core_web_sm"
    try:
        pipeline = spacy.load(model_name)
    except OSError:
        # Loading failed: fetch the model once, then load it again
        print("Downloading en_core_web_sm model...")
        spacy.cli.download(model_name)
        pipeline = spacy.load(model_name)
    print("spaCy model loaded.")
    return pipeline

# Module-level pipeline used by the counting and analysis helpers
nlp_model = load_spacy_model()

def analyze_dreams_spacy(dream_texts, nlp):
    """
    Count word occurrences across all dream texts.

    The texts are joined with single spaces and parsed once by ``nlp``.
    Stop words, punctuation, tokens of two characters or fewer and the
    markers "/end", "dream" and "enddream" are left out. Words are
    lower-cased before counting.

    Returns:
        A ``Counter`` mapping each remaining word to its frequency.
    """
    ignored_words = ("/end", "dream", "enddream")
    parsed = nlp(" ".join(dream_texts))
    frequencies = Counter()
    for tok in parsed:
        if tok.is_stop or tok.is_punct:
            continue
        if len(tok.text) <= 2:
            continue
        word = tok.text.lower()
        if word in ignored_words:
            continue
        frequencies[word] += 1
    return frequencies

def plot_horizontal_word_frequencies(sorted_word_freq, top_n=50):
    """
    Render the most frequent words as a horizontal bar chart.

    Args:
        sorted_word_freq: Sequence of ``(word, count)`` pairs, most frequent
            first.
        top_n: Maximum number of words to draw.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the chart as PNG.

    Raises:
        ValueError: If there is nothing to plot.
    """
    top_items = sorted_word_freq[:top_n]
    if not top_items:
        raise ValueError("sorted_word_freq must contain at least one (word, count) pair")
    words, frequencies = zip(*top_items)

    # Draw on an explicit figure instead of pyplot's implicit "current
    # figure", so the chart does not depend on shared global state
    fig, ax = plt.subplots(figsize=(10, 12))
    try:
        ax.barh(words, frequencies, color='skyblue')
        ax.set_xlabel('Frequency', fontsize=12)
        ax.set_ylabel('Words', fontsize=12)
        # Report the number of bars actually drawn, which can be below top_n
        ax.set_title(f'Top {len(top_items)} Word Frequencies in Dreams', fontsize=14)
        ax.invert_yaxis()  # most frequent word at the top
        fig.tight_layout()
        buffer = BytesIO()
        fig.savefig(buffer, format='png')
    finally:
        # Release the figure even when drawing or saving fails
        plt.close(fig)
    buffer.seek(0)
    return buffer

def analyze_and_plot_dreams(document_chunks, top_n):
    """
    Produce a word-frequency chart for all chunks.

    Returns:
        ``(plot_buffer, None)`` on success, or ``(None, message)`` when there
        is no text or no countable word.
    """
    texts = [piece.page_content for piece in document_chunks]
    if len(texts) == 0:
        return None, "No dream texts found to analyze."

    frequencies = analyze_dreams_spacy(texts, nlp_model)
    # Highest count first
    ranked = sorted(frequencies.items(), key=itemgetter(1), reverse=True)
    if len(ranked) == 0:
        return None, "No significant words found for plotting."

    return plot_horizontal_word_frequencies(ranked, top_n=top_n), None

# --- PostMessageHandler: Ensure Unique Source Documents ---
class PostMessageHandler(BaseCallbackHandler):
    """Callback handler that collects unique retrieved sources and shows them once the LLM finishes."""

    def __init__(self, msg: st.write):
        super().__init__()
        # Kept as passed in by the caller; nothing in this class reads it
        self.msg = msg
        self.sources = []  # Unique source metadata dicts, in retrieval order

    def on_retriever_end(self, documents, *, run_id, parent_run_id, **kwargs):
        """Record one entry per (file name, page) among the retrieved documents."""
        # Build the lookup set once per call instead of once per document
        seen = {(s["source"], s["page"]) for s in self.sources}
        for d in documents:
            # Use basename to remove temporary directory differences;
            # missing metadata falls back to "unknown", as in
            # count_dreams_by_topic
            src = basename(d.metadata.get("source", "unknown"))
            page = d.metadata.get("page", "unknown")
            idx = (src, page)
            if idx in seen:
                continue
            seen.add(idx)
            self.sources.append({
                "source": src,
                "page": page,
                "content": d.page_content[:200]
            })

    def on_llm_end(self, response, *, run_id, parent_run_id, **kwargs):
        """Show the first three collected sources as a table, if there are any."""
        if self.sources:
            st.markdown("__Sources:__" + "\n")
            st.dataframe(data=pd.DataFrame(self.sources[:3]), width=1000)

# --- Conversation Handling ---
# Chat history keyed by "langchain_messages"; it is replayed below on each run
streamlit_msg_history = StreamlitChatMessageHistory(key="langchain_messages")

# Words that mark a count question as negated ("dreams NOT about X")
NEGATION_WORDS = ("not", "no", "without", "excluding")


def _record_exchange(user_text, ai_text):
    """Append one turn to the history: the user message first, then the AI reply."""
    streamlit_msg_history.add_user_message(user_text)
    streamlit_msg_history.add_ai_message(ai_text)


def _is_negated(question):
    """Return True when the question contains a negation word as a whole word."""
    # Whole-word matching: "snow" or "know" must not count as "no"
    words = {w.strip(".,!?;:'\"()") for w in question.lower().split()}
    return not words.isdisjoint(NEGATION_WORDS)


def _first_dream(text, marker="/end"):
    """Return the first non-empty segment of ``text`` delimited by ``marker``."""
    # Text that starts with the marker has an empty first segment, so skip
    # empty segments instead of blindly taking element 0
    for segment in text.split(marker):
        segment = segment.strip()
        if segment:
            return segment
    return ""


# Two-step analysis flow: after an analysis request the next input is read as
# the number of words to plot
if "awaiting_top_n" not in st.session_state:
    st.session_state.awaiting_top_n = False
if "analysis_prompt" not in st.session_state:
    st.session_state.analysis_prompt = None

# Seed an empty history with a greeting
if not streamlit_msg_history.messages:
    streamlit_msg_history.add_ai_message("Please ask questions about your dream journal, including count-related questions, general questions, requests to retrieve the full text of a dream or ask for analysis and plots.")

# Replay the stored conversation
for msg in streamlit_msg_history.messages:
    st.chat_message(msg.type).write(msg.content)

if user_prompt := st.chat_input():
    st.chat_message("human").write(user_prompt)

    if st.session_state.awaiting_top_n:
        # --- Second step of the analysis flow: read the word count ---
        # Only the conversion is guarded, so errors raised while plotting are
        # not misreported as invalid input
        try:
            top_n_words = int(user_prompt)
        except ValueError:
            top_n_words = None

        if top_n_words is None:
            # Stay in the awaiting state so the user can try again
            reply = "Invalid input. Please enter a number."
            st.error(reply)
        elif top_n_words <= 0:
            reply = "Please enter a positive number for the number of top words."
            st.error(reply)
        else:
            st.session_state.awaiting_top_n = False
            plot_buffer, error_message = analyze_and_plot_dreams(doc_chunks, top_n_words)
            with st.chat_message("ai"):
                if plot_buffer:
                    st.image(plot_buffer, caption=f'Top {top_n_words} Word Frequency Plot of Dreams')
                    st.markdown(f"Below is a word frequency plot of the top {top_n_words} most common words in your dream journal. This visualization helps to identify recurring themes and patterns in your dreams.")
                    reply = f"Here is a word frequency plot of your dreams showing top {top_n_words} words:"
                else:
                    reply = error_message or "Failed to generate dream analysis plot."
                    st.error(reply)
        # Exactly one user/AI pair per turn, in conversational order
        _record_exchange(user_prompt, reply)

    else:
        # --- Classify Question Type ---
        question_type_response = question_type_chain.invoke({"question": user_prompt})
        # Tolerate quotes, a trailing period or different casing around the label
        question_type = question_type_response.content.strip().strip("'\"`. ").lower()

        if question_type == "count_question":
            # --- Handle Count Question ---
            topic_extraction_response = topic_extraction_chain.invoke({"question": user_prompt})
            topic_to_count = topic_extraction_response.content.strip().strip("'\"")

            if topic_to_count.lower().rstrip(".") == "unknown_topic":
                reply = "Sorry, I couldn't understand what topic you want to count. Please be more specific."
                st.warning(reply)
                _record_exchange(user_prompt, reply)
            else:
                negated_question = _is_negated(user_prompt)

                dream_count, source_documents_count = count_dreams_by_topic(doc_chunks, topic_to_count, negate=negated_question)
                if negated_question:
                    answer_text = f"I have analyzed your dream journal and found **{dream_count} dreams that are NOT about '{topic_to_count}'**."
                else:
                    answer_text = f"I have analyzed your dream journal and found **{dream_count} dreams about '{topic_to_count}'**."
                st.chat_message("ai").write(answer_text)
                display_source_documents_table(source_documents_count)
                _record_exchange(user_prompt, answer_text)

        elif question_type == "qa_question":
            # --- Handle QA Question (Normal RAG flow) ---
            with st.chat_message("ai"):
                # stream_handler redraws the answer token by token;
                # pm_handler shows the sources after the answer completes
                stream_handler = StreamHandler(st.empty())
                sources_container = st.write("")
                pm_handler = PostMessageHandler(sources_container)
                config = {"callbacks": [stream_handler, pm_handler]}
                response = qa_rag_chain.invoke({"question": user_prompt}, config)
            _record_exchange(user_prompt, response.content)

        elif question_type == "full_dream_question":
            # --- Handle Full Dream Retrieval ---
            with st.chat_message("ai"):
                full_dream_text = retrieve_full_dream_text(user_prompt, doc_chunks, retriever)
                cleaned_dream_text = _first_dream(full_dream_text)
                st.markdown(cleaned_dream_text)
            _record_exchange(user_prompt, cleaned_dream_text)

        elif question_type == "analysis_question":
            # --- Handle Dream Analysis and Plot ---
            # First step of the analysis flow: ask for the word count; the
            # next input is handled by the awaiting_top_n branch above
            st.session_state.awaiting_top_n = True
            st.session_state.analysis_prompt = user_prompt
            reply = "How many top words would you like to see in the plot?"
            with st.chat_message("ai"):
                st.write(reply)
            _record_exchange(user_prompt, reply)

        else:
            reply = "Sorry, I could not understand the type of question. Please rephrase."
            st.chat_message("ai").write(reply)
            _record_exchange(user_prompt, reply)


# Starting the Streamlit App

In [5]:
# Run the app in the background; stdout and stderr go to logs.txt in the working directory
!streamlit run app.py --server.port=8989 &>./logs.txt &

# Setting Up ngrok Tunnel

In [None]:
from getpass import getpass

# Read the ngrok token from an interactive getpass prompt instead of hard-coding it in a cell
ngrok_auth_token = getpass('Enter ngrok API Key: ')

In [None]:
from pyngrok import ngrok
import yaml

# Terminate open tunnels if exist
ngrok.kill()

# Register the token entered above through pyngrok's API, so the secret is
# not interpolated into a shell command line
ngrok.set_auth_token(ngrok_auth_token)

# Open a tunnel to port 8989, the port the Streamlit app was started on
ngrok_tunnel = ngrok.connect(8989)
print("Streamlit App:", ngrok_tunnel.public_url)