# In [None]:

import streamlit as st
import re
import time
from io import BytesIO
from typing import Any, Dict, List
import openai
from langchain import LLMChain, OpenAI
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader

@st.cache_data
def parse_pdf(file: BytesIO) -> List[str]:
    """Extract the text of every page of a PDF and tidy up its line breaks.

    Args:
        file: Binary file-like object holding the PDF; it is handed straight
            to ``PdfReader``.

    Returns:
        One cleaned string per page, in page order.
    """
    reader = PdfReader(file)
    cleaned_pages = []
    for pdf_page in reader.pages:
        raw_text = pdf_page.extract_text()
        # Glue back together words that were hyphenated across a line break.
        joined = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw_text)
        # Turn a newline into a space unless it is directly preceded by
        # "newline + whitespace" or directly followed by "whitespace + newline".
        unwrapped = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", joined.strip())
        # Squash any whitespace-only gap between two newlines to one blank line.
        cleaned_pages.append(re.sub(r"\n\s*\n", "\n\n", unwrapped))
    return cleaned_pages


@st.cache_data
def text_to_docs(text: str) -> List[Document]:
    """Split page text into chunk-sized ``Document`` objects with metadata.

    Args:
        text: Either a single string (treated as one page) or a list of
            strings, one per page.

    Returns:
        A flat list of ``Document`` chunks. Each chunk's metadata holds
        ``page`` (1-based position of the page in ``text``), ``chunk``
        (0-based position of the chunk within that page) and ``source``
        (the string ``"<page>-<chunk>"``).
    """
    page_texts = [text] if isinstance(text, str) else text

    # The splitter configuration does not depend on the page, so build it once
    # instead of once per page.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_number, page_text in enumerate(page_texts, start=1):
        for chunk_index, chunk in enumerate(splitter.split_text(page_text)):
            # Distinct names for page and chunk: the page number is taken from
            # the outer loop rather than read back off a rebound variable.
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_number,
                        "chunk": chunk_index,
                        "source": f"{page_number}-{chunk_index}",
                    },
                )
            )
    return doc_chunks


@st.cache_data
def _build_index(api_key: str, chunk_texts: List[str], _docs: List[Document]) -> FAISS:
    """Embed ``_docs`` with OpenAI and return a FAISS index (cached).

    The cache key is ``api_key`` plus ``chunk_texts``. ``_docs`` carries the
    same content as ``chunk_texts``; its leading underscore tells Streamlit
    not to hash it.
    """
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)

    with st.spinner("Processing..."):
        index = FAISS.from_documents(_docs, embeddings)
    st.success("Embeddings generated successfully.", icon="✅")
    return index


def test_embed() -> FAISS:
    """Return a FAISS index for the currently uploaded document.

    Reads the module-level ``pages`` (list of ``Document`` chunks) and ``api``
    (OpenAI API key); both must be set before this is called.

    The cached work is keyed on the API key and the chunk texts. Caching a
    zero-argument function would keep returning the first index ever built,
    even after a different PDF is uploaded or a different key is entered.

    Returns:
        The FAISS vector store built from ``pages``.
    """
    chunk_texts = [doc.page_content for doc in pages]
    return _build_index(api, chunk_texts, pages)
# ---------------------------------------------------------------------------
# Streamlit page: upload a PDF, enter an OpenAI key, then chat with the PDF.
# Only `#` comments are used below; bare expressions and strings at module
# level would be rendered on the page by Streamlit.
# ---------------------------------------------------------------------------
st.title(" PDF Chatgpt ")
st.sidebar.markdown(
    """
    ### Order:
    1. Upload PDF File that you would like to talk to:-
    2. Enter Your openAI Key
    3. Ask your pdf questions

    """
)

# Defined up front so the chat section can tell "nothing usable uploaded yet"
# apart from "ready", instead of failing with NameError.
pages = None
name_of_file = None

uploaded_file = st.file_uploader(
    "**Upload the PDF File you would like to have a chat**", type=["pdf"]
)
if uploaded_file:
    name_of_file = uploaded_file.name
    page_texts = parse_pdf(uploaded_file)
    pages = text_to_docs(page_texts)
    if pages:
        with st.expander("Show Page Content", expanded=False):
            # The selector indexes into the chunk list returned by text_to_docs.
            page_sel = st.number_input(
                label="Select Page", min_value=1, max_value=len(pages), step=1
            )
            st.write(pages[page_sel - 1])

api = st.text_input(
    "**Enter OpenAI API Key**",
    type="password",
    placeholder="sk-",
    help="https://platform.openai.com/account/api-keys",
)

if api and not pages:
    # Key entered but there is nothing to index: no upload yet, or the PDF
    # produced no text chunks.
    st.warning("Please upload a PDF containing extractable text before asking questions.")
elif api:
    index = test_embed()

    # Retrieval chain that answers a question from the indexed chunks.
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(openai_api_key=api),
        chain_type="stuff",
        retriever=index.as_retriever(),
    )
    tools = [
        Tool(
            name="State of Union QA System",
            func=qa.run,
            description="Useful for when you need to answer questions about the aspects asked. Input may be a partial or fully formed question.",
        )
    ]

    # Prompt text is sent to the model verbatim, including its whitespace.
    prefix = """Have a conversation with a human, answering the following questions as best you can based on the context and memory available. 
                You have access to a single tool:"""
    # NOTE(review): the double quote right after "Begin!" is part of the
    # prompt; confirm whether it is intentional.
    suffix = """Begin!"
        {chat_history}
        Question: {input}
            {agent_scratchpad}"""
    prompt = ZeroShotAgent.create_prompt(
        tools,
        prefix=prefix,
        suffix=suffix,
        input_variables=["input", "chat_history", "agent_scratchpad"],
    )

    # Keep the conversation memory in session state so it survives reruns.
    if "memory" not in st.session_state:
        st.session_state.memory = ConversationBufferMemory(memory_key="chat_history")

    llm_chain = LLMChain(
        llm=OpenAI(temperature=0, openai_api_key=api, model_name="gpt-3.5-turbo"),
        prompt=prompt,
    )
    agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
    agent_chain = AgentExecutor.from_agent_and_tools(
        agent=agent, tools=tools, verbose=True, memory=st.session_state.memory
    )

    query = st.text_input(
        "**Ask me anything?**",
        placeholder=f"Ask me anything from {name_of_file}",
    )

    if query:
        with st.spinner(f"Generating Answer to your Query : `{query}` "):
            res = agent_chain.run(query)
            st.info(res, icon="🤖")

    with st.expander("History/Memory"):
        st.write(st.session_state.memory)