In [None]:
# Mount Google Drive inside the Colab VM; a later cell reads the ngrok
# credentials file from /content/drive/MyDrive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Standard library
import os
import tempfile
from operator import itemgetter

# Third-party
import pandas as pd
import streamlit as st

# LangChain imports
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain_core.callbacks.base import BaseCallbackHandler
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Page-level settings (browser tab title and icon), followed by the on-page heading.
st.set_page_config(
    page_title="File QA Chatbot",
    page_icon="*",
)
st.title(body="Welcome to File QA RAG Chatbot")

@st.cache_resource(ttl="1h")
def configure_retriever(uploaded_files):
    """Build a Chroma-backed retriever from the uploaded PDF files.

    Each upload is written to a temporary directory, parsed with
    ``PyMuPDFLoader``, split into overlapping chunks, embedded with
    ``OpenAIEmbeddings`` and indexed in a Chroma vector store.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing ``.name``
            and ``.getvalue()``. ``None`` or an empty iterable is accepted.

    Returns:
        The object returned by ``vectordb.as_retriever()``, or ``None`` when
        no documents could be loaded or splitting produced no chunks.
    """
    docs = []
    # The context manager deletes the temporary copies as soon as they have
    # been parsed, instead of leaving cleanup to the garbage collector.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file in uploaded_files or []:
            # basename() keeps an upload name containing path separators
            # from being written outside the temporary directory.
            temp_filepath = os.path.join(temp_dir, os.path.basename(file.name))
            with open(temp_filepath, "wb") as f:
                f.write(file.getvalue())
            loader = PyMuPDFLoader(temp_filepath)
            docs.extend(loader.load())

    if not docs:  # Nothing could be loaded from the uploads
        return None

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    doc_chunks = text_splitter.split_documents(docs)

    if not doc_chunks:  # Splitting produced no chunks to index
        return None

    embeddings_model = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(doc_chunks, embeddings_model)

    return vectordb.as_retriever()

class StreamHandler(BaseCallbackHandler):
    """Callback handler that re-renders the growing LLM answer token by token.

    Attributes:
        container: Object exposing ``markdown(text)``; it is called with the
            full accumulated text after every new token.
        text: Text accumulated so far, starting from ``initial_text``.
    """

    def __init__(self, container, initial_text=""):
        self.text = initial_text
        self.container = container

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the running text and redraw the container."""
        accumulated = self.text + token
        self.text = accumulated
        self.container.markdown(accumulated)

# Chat history is created unconditionally so it can be rendered even before
# any file has been uploaded.
streamlit_msg_history = StreamlitChatMessageHistory(key="langchain_messages")


uploaded_files = st.sidebar.file_uploader(
    label="Upload PDF files", type=["pdf"],
    accept_multiple_files=True
)

# Build the retriever only when at least one file was uploaded; otherwise it
# is None. No st.info() is shown here because the retriever-is-None handling
# further down already displays the upload prompt, and showing it here as
# well put two near-identical banners on the page.
retriever = configure_retriever(uploaded_files) if uploaded_files else None

# Without a retriever there is nothing to query, so ask for uploads; with one,
# assemble the question-answering chain.
if retriever is None:
    st.info("Please upload PDF files to start the chat.")
    # Seed an empty history once with the same prompt
    if not streamlit_msg_history.messages:
        streamlit_msg_history.add_ai_message("Please upload PDF files to start the chat.")

else:
    chatgpt = ChatOpenAI(
        model_name='gpt-3.5-turbo',
        temperature=0.1,
        streaming=True,
    )

    qa_template = """
        Use only the following pieces of context to answer the question at the end.
        If you don't know the answer, just say that you don't know,
        don't try to make up an answer. Keep the answer as concise as possible.

        {context}

        Question: {question}
        """

    qa_prompt = ChatPromptTemplate.from_template(qa_template)

    def format_docs(docs):
        """Join the page contents of the given documents, separated by blank lines."""
        return "\n\n".join(d.page_content for d in docs)

    # The question is routed through the retriever and flattened into the
    # prompt's {context}; the same question fills {question}.
    qa_rag_chain = (
        {
            "context": itemgetter("question") | retriever | format_docs,
            "question": itemgetter("question"),
        }
        | qa_prompt
        | chatgpt
    )


# Render whatever the history currently holds, whether or not a retriever exists.
for msg in streamlit_msg_history.messages:
    st.chat_message(msg.type).write(msg.content)

2025-05-22 05:36:19.568 Session state does not function when running a script without `streamlit run`


In [None]:
class PostMessageHandler(BaseCallbackHandler):
    """Callback handler that collects retrieved sources and lists them after the answer.

    Attributes:
        msg: Value handed in by the caller; stored but not used by the handler itself.
        sources: De-duplicated list of dicts with keys ``source``, ``page``
            and ``content`` (first 200 characters of the chunk).
    """

    def __init__(self, msg: st.write):
        BaseCallbackHandler.__init__(self)
        self.msg = msg
        self.sources = []

    def on_retriever_end(self, documents, *, run_id=None, parent_run_id=None, **kwargs):
        """Record one entry per distinct (source, page) among ``documents``.

        The keyword is ``parent_run_id``: the previous spelling
        ``present_run_id`` was a required keyword the callback manager never
        supplies, so the hook failed before collecting anything.
        """
        # Seeded from what is already stored so repeated retriever runs on
        # the same handler do not add duplicates either.
        seen = {(entry["source"], entry["page"]) for entry in self.sources}
        for d in documents:
            metadata = {
                # .get() tolerates loaders that omit one of these fields
                "source": d.metadata.get("source"),
                "page": d.metadata.get("page"),
                "content": d.page_content[:200],
            }
            idx = (metadata["source"], metadata["page"])
            if idx not in seen:
                seen.add(idx)
                self.sources.append(metadata)

    def on_llm_end(self, response, *, run_id=None, parent_run_id=None, **kwargs):
        """Show up to three collected sources as a table once the LLM has finished."""
        if self.sources:
            st.markdown("__Sources:__ "+"\n")
            st.dataframe(data=pd.DataFrame(self.sources[:3]),
                         width=1000)


# Handle one chat turn: echo the question, then stream the answer and sources.
if user_prompt := st.chat_input():
    st.chat_message("human").write(user_prompt)

    if retriever is None:
        # qa_rag_chain only exists once a retriever has been built; without
        # this guard a question asked before uploading raised NameError.
        st.chat_message("ai").write("Please upload PDF files to start the chat.")
    else:
        with st.chat_message("ai"):
            stream_handler = StreamHandler(st.empty())
            sources_container = st.write("")
            pm_handler = PostMessageHandler(sources_container)
            config = {"callbacks": [stream_handler, pm_handler]}
            response = qa_rag_chain.invoke({"question": user_prompt}, config)

        # Persist the turn so the history loop can re-render it on the next
        # rerun; the history was displayed but never written to before.
        streamlit_msg_history.add_user_message(user_prompt)
        streamlit_msg_history.add_ai_message(response.content)



In [None]:
# Start the Streamlit app from app.py in the background on port 8989,
# sending its output to ./logs.txt.
# NOTE(review): no visible cell writes app.py — confirm the file exists
# (e.g. via a %%writefile cell) before running this.
!streamlit run app.py --server.port=8989 &>./logs.txt &

In [None]:
# Ensure you have the necessary library installed
!pip install pyngrok pyyaml

from pyngrok import ngrok
import yaml

ngrok.kill()

# Make sure 'ngrok_credentials.yml' exists in the same directory as this notebook
# and contains your ngrok auth token in the format:
# ngrok_key: your_ngrok_auth_token
try:
    with open("/content/drive/MyDrive/ngrok_credentials.yml", "r") as file:
        NGROK_AUTH_TOKEN = yaml.safe_load(file)
    ngrok.set_auth_token(NGROK_AUTH_TOKEN['ngrok_key'])

    # Connect to the port where your Streamlit app is running (8989)
    ngrok_tunnel = ngrok.connect(8989)
    print("Streamlit App", ngrok_tunnel.public_url)

except FileNotFoundError:
    print("Error: ngrok_credentials.yml not found. Please create this file and add your ngrok auth token.")
    print("The file should contain: ngrok_key: your_ngrok_auth_token")
except KeyError:
    print("Error: ngrok_credentials.yml is missing the 'ngrok_key'. Please ensure the file is in the correct format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Streamlit App https://28e0-34-16-210-140.ngrok-free.app
