# FILE QA RAG CHATBOT APP

In [None]:
# Install pinned versions of every package the notebook and app.py use:
# the LangChain stack, Streamlit (UI), PyMuPDF (PDF loading), Chroma (vector
# store) and pyngrok (public tunnel to the local Streamlit server).
!pip install langchain==0.1.12
!pip install langchain-openai==0.0.8
!pip install langchain-community==0.0.29
!pip install streamlit==1.32.2
!pip install PyMuPDF==1.24.0
!pip install chromadb==0.4.24
!pip install pyngrok==7.1.5


Collecting langchain==0.1.12
  Downloading langchain-0.1.12-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.1.12)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-community<0.1,>=0.0.28 (from langchain==0.1.12)
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2.0,>=0.1.31 (from langchain==0.1.12)
  Downloading langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain==0.1.12)
  Downloading langchain_text_splitters-0.0.2-py3-none-any.whl.metadata (2.2 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.1.12)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting tenacity<9.0.0,>=8.1.0 (from langchain==0.1.12)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain==

Collecting PyMuPDF==1.24.0
  Downloading PyMuPDF-1.24.0-cp311-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.0 (from PyMuPDF==1.24.0)
  Downloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.0-cp311-none-manylinux2014_x86_64.whl (3.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.9/3.9 MB 30.9 MB/s eta 0:00:00
Downloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 30.8/30.8 MB 13.8 MB/s eta 0:00:00
Installing collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.0 PyMuPDFb-1.24.0
Collecting chromadb==0.4.24
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb==0.4.24)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting

Collecting pyngrok==7.1.5
  Downloading pyngrok-7.1.5-py3-none-any.whl.metadata (7.6 kB)
Downloading pyngrok-7.1.5-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.5


In [None]:
import locale

# Force UTF-8 as the preferred encoding for the rest of this session.
# The replacement keeps the stdlib signature (optional ``do_setlocale``) so
# callers that use ``locale.getpreferredencoding(False)`` do not hit a
# TypeError from a zero-argument lambda.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
import yaml

# Read the API credentials from a local YAML file so the secret is not
# hard-coded in the notebook; the parsed mapping is kept in `api_creds`.
with open('chat_gpt_api.yaml', 'r') as file:
    api_creds = yaml.safe_load(file)

In [None]:
# Sanity check: display only the credential names, not the secret values.
api_creds.keys()

dict_keys(['OPENAI_API_KEY'])

In [None]:
import os

# Publish the key through the process environment.
# NOTE(review): presumably the Streamlit app launched from this notebook
# inherits this variable and the OpenAI clients read it from there — confirm.
os.environ['OPENAI_API_KEY'] = api_creds['OPENAI_API_KEY']

In [None]:
%%writefile app.py
import html
import os
import tempfile
from operator import itemgetter

import pandas as pd
import streamlit as st
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain_core.callbacks.base import BaseCallbackHandler
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma

# ------------------------------
# Page Configuration and Styling
# ------------------------------
# Set the browser-tab title and icon and use the wide page layout.
st.set_page_config(page_title="RAG-Based QA ChatBOT", page_icon="🤖", layout="wide")
# Inject the custom stylesheet used by the hand-written chat markup below
# (classes: chat-message ai/human, app-title, chat-container).
# NOTE(review): `.css-1d391kg` looks like an auto-generated Streamlit class
# name; confirm it still matches the sidebar in the pinned Streamlit version.
# NOTE(review): confirm the `body` background is actually visible and not
# covered by Streamlit's own app container.
st.markdown(
    """
    <style>
    /* General background */
    body {
        background-color: #f5f5f5;
    }

    /* Sidebar styling */
    .css-1d391kg {
        background-color: #ffffff;
        border-right: 2px solid #e6e6e6;
    }

    /* Chat message styling */
    .chat-message.ai {
        background-color: #e0f7fa;
        color: #006064;
        border-radius: 10px;
        padding: 10px;
        margin-bottom: 5px;
    }
    .chat-message.human {
        background-color: #fce4ec;
        color: #880e4f;
        border-radius: 10px;
        padding: 10px;
        margin-bottom: 5px;
        text-align: right;
    }

    /* Title styling */
    .app-title {
        font-size: 2.5rem;
        font-weight: 700;
        text-align: center;
        margin-bottom: 20px;
    }

    /* Chat container styling */
    .chat-container {
        background-color: #ffffff;
        padding: 15px;
        border-radius: 10px;
        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# ------------------------------
# App Header
# ------------------------------
# Title rendered as raw HTML so the `.app-title` CSS class applies to it.
st.markdown('<div class="app-title">🤖 RAG-Based QA ChatBOT</div>', unsafe_allow_html=True)
st.markdown("### Ask your questions based on your uploaded PDF documents below!")

# ------------------------------
# Sidebar File Upload
# ------------------------------
with st.sidebar:
    st.header("Upload Documents")
    st.markdown("Upload your PDF files that contain the context for the chatbot.")
    # Only PDFs are accepted; several files may be selected at once.
    uploaded_files = st.file_uploader("Choose PDFs", type=["pdf"], accept_multiple_files=True)

# Everything below needs documents to index, so stop this script run here
# until at least one file has been uploaded.
if not uploaded_files:
    st.info("Please upload at least one PDF document to continue.")
    st.stop()

# ------------------------------
# Function to Configure Retriever
# ------------------------------
@st.cache_resource(ttl="1h")
def configure_retriever(uploaded_files):
    """Build a retriever over the content of the uploaded PDF files.

    Each upload is written to a temporary directory so ``PyMuPDFLoader`` can
    read it from a file path. The loaded documents are split into overlapping
    chunks, embedded with ``OpenAIEmbeddings`` and indexed in a Chroma vector
    store.

    Args:
        uploaded_files: Uploaded file objects exposing ``name`` and
            ``getvalue()``.

    Returns:
        The retriever produced by ``Chroma.as_retriever()``.
    """
    docs = []
    # The context manager removes the temporary copies deterministically once
    # every file has been loaded, instead of relying on finalizer cleanup.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files:
            # Keep only the final path component so an unusual upload name
            # cannot point outside the temporary directory.
            temp_filepath = os.path.join(temp_dir, os.path.basename(file.name))
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            loader = PyMuPDFLoader(temp_filepath)
            docs.extend(loader.load())

    # Splitting documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    doc_chunks = text_splitter.split_documents(docs)

    # Create embeddings and vector store
    embeddings_model = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(doc_chunks, embeddings_model)
    return vectordb.as_retriever()

# ------------------------------
# Initialize Retriever and LLM
# ------------------------------
# Build the retriever for the current uploads (the function is wrapped in
# st.cache_resource with a one-hour TTL).
retriever = configure_retriever(uploaded_files)
# Chat model used to answer questions; streaming is enabled so tokens can be
# forwarded to callback handlers while the answer is generated.
chatgpt = ChatOpenAI(temperature=0.1, model_name="gpt-4o-mini", streaming=True)

# ------------------------------
# Define Prompt Template and Chain
# ------------------------------
# Prompt that restricts the model to the supplied context and gives it a fixed
# fallback sentence; `{context}` and `{question}` are the two template inputs.
qa_template = """
Use only the following pieces of context to answer the question at the end.
If you don't know the answer, just say that "The information you are asking for is not available in your files".
Keep the answer as concise as possible.

Context:
{context}

Question: {question}
"""

qa_prompt = ChatPromptTemplate.from_template(qa_template)

def format_docs(docs):
    """Return the ``page_content`` of every document, joined by blank lines."""
    contents = (document.page_content for document in docs)
    return "\n\n".join(contents)

# Pipeline: take the "question" from the input dict, send it through the
# retriever and format_docs to build "context", pass the question through
# unchanged, then fill the prompt and call the chat model.
qa_rag_chain = (
    {"context": itemgetter("question") | retriever | format_docs, "question": itemgetter("question")}
    | qa_prompt
    | chatgpt
)

# ------------------------------
# Custom Callback Handlers
# ------------------------------
class StreamlitCallbackHandler(BaseCallbackHandler):
    """Callback handler that renders LLM tokens into a container as they arrive."""

    def __init__(self, container, initial_text=""):
        """Store the render target and the text accumulated so far.

        Args:
            container: Object exposing ``markdown(text)`` that displays the
                partial answer.
            initial_text: Text to show before the first token arrives.
        """
        # Initialise the base class as well, matching PostMessageHandler.
        super().__init__()
        self.container = container
        self.text = initial_text

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the running text and re-render all of it."""
        self.text += token
        self.container.markdown(self.text)

class PostMessageHandler(BaseCallbackHandler):
    """Callback handler that records retrieved sources and shows them after the answer."""

    def __init__(self, container):
        """Store the render target and start with no recorded sources.

        Args:
            container: Streamlit element (placeholder or container) in which
                the source table is rendered.
        """
        super().__init__()
        self.container = container
        self.sources = []
        # (source, page) pairs already recorded. Kept on the instance so
        # duplicates are also filtered across repeated retriever calls, and
        # stored as a set for constant-time membership checks.
        self._seen_sources = set()

    def on_retriever_end(self, documents, *, run_id, parent_run_id, **kwargs):
        """Record one entry per distinct (source, page) among ``documents``."""
        for d in documents:
            source = d.metadata.get("source", "N/A")
            page = d.metadata.get("page", "N/A")
            key = (source, page)
            if key in self._seen_sources:
                continue
            self._seen_sources.add(key)
            self.sources.append(
                {
                    "source": source,
                    "page": page,
                    # Short preview of the chunk text
                    "content": d.page_content[:200],
                }
            )

    def on_llm_end(self, response, *, run_id, parent_run_id, **kwargs):
        """Render a "Sources" header and a table of the first three sources."""
        if not self.sources:
            return
        # Put header and table in a nested container: a single-element
        # placeholder (st.empty) would otherwise keep only the last element
        # written to it, dropping the header.
        block = self.container.container()
        block.markdown("**Sources:**")
        block.dataframe(pd.DataFrame(self.sources[:3]), width=1000)

# ------------------------------
# Chat History and UI Display
# ------------------------------
# Using StreamlitChatMessageHistory to persist conversation
streamlit_msg_history = StreamlitChatMessageHistory(key="langchain_messages")
# Seed a new conversation with a greeting from the assistant.
if not streamlit_msg_history.messages:
    streamlit_msg_history.add_ai_message("How can I help you?")

# Replay the stored conversation using the custom chat CSS classes.
# NOTE(review): the opening and closing chat-container tags are emitted by
# separate st.markdown calls — confirm they really wrap the messages between.
with st.container():
    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
    for msg in streamlit_msg_history.messages:
        css_role = "human" if msg.type == "human" else "ai"
        # Message text comes from the user or the model and is injected into
        # raw HTML, so escape it to keep markup in the text from being rendered.
        safe_content = html.escape(str(msg.content))
        st.markdown(
            f'<div class="chat-message {css_role}">{safe_content}</div>',
            unsafe_allow_html=True,
        )
    st.markdown('</div>', unsafe_allow_html=True)

# ------------------------------
# Chat Input and Processing
# ------------------------------
user_prompt = st.chat_input("Type your question here...")
if user_prompt:
    # Show the question immediately. It is untrusted text placed inside raw
    # HTML, so it is escaped first.
    st.markdown(
        f'<div class="chat-message human">{html.escape(user_prompt)}</div>',
        unsafe_allow_html=True,
    )

    with st.container():
        ai_response_placeholder = st.empty()  # Receives the streamed tokens
        sources_container = st.empty()        # Receives the sources table

        # One handler streams tokens, the other collects and shows sources.
        stream_handler = StreamlitCallbackHandler(ai_response_placeholder)
        pm_handler = PostMessageHandler(sources_container)

        # Run the chain with both handler instances attached.
        config = {"callbacks": [stream_handler, pm_handler]}
        response = qa_rag_chain.invoke({"question": user_prompt}, config)

        # Replace the streamed text with the final styled answer in the SAME
        # placeholder; adding a new element would show the answer twice.
        ai_response_placeholder.markdown(
            f'<div class="chat-message ai">{html.escape(str(response.content))}</div>',
            unsafe_allow_html=True,
        )

    # Persist both turns so they are replayed on the next script run.
    streamlit_msg_history.add_user_message(user_prompt)
    streamlit_msg_history.add_ai_message(response.content)


Writing app.py


In [None]:
# Start the Streamlit app in the background on port 8989; stdout and stderr
# are redirected to logs.txt.
!streamlit run app.py --server.port 8989 &>./logs.txt &

In [None]:
from pyngrok import ngrok
import yaml

# NOTE(review): presumably stops any ngrok process left from an earlier run
# so that a fresh tunnel can be opened — confirm against pyngrok docs.
ngrok.kill()

# Read the ngrok auth token from a local YAML file ('ngrok_key' entry).
with open('ngrok_key.yaml', 'r') as file:
    ngrok_token = yaml.safe_load(file)

ngrok.set_auth_token(ngrok_token['ngrok_key'])


# Open a tunnel to port 8989 (the port given to `streamlit run`) and display
# its public URL.
ngrok_tunnel = ngrok.connect(8989)
ngrok_tunnel.public_url



'https://27b5-34-56-201-141.ngrok-free.app'