In [2]:
!pip install streamlit pypdf langchain openai chromadb tiktoken huggingface_hub pyngrok

Collecting streamlit
  Downloading streamlit-1.26.0-py2.py3-none-any.whl (8.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf
  Downloading pypdf-3.15.3-py3-none-any.whl (271 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m271.9/271.9 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.273-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-0.27.9-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.7-py3-none-any.whl (415 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.5/415.5 kB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m

In [None]:
# this is if you don't want to use openAI embeddings so that you won't be charged
# from hugging_face mostly if you have a GPU
!pip install InstructorEmbedding sentence_transformers

In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [1]:
%%writefile my_chat_app.py
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from htmlTemplates import css, user_template, bot_template
from PyPDF2 import PdfReader
from dotenv import load_dotenv, find_dotenv
import os
import openai


def get_pdf_document(files):
  #loader = PyPDFLoader(files)
  #pages = loader.load()
  text = ""
  for filex in files:
    pdf_reader = PdfReader(filex)
    for page in pdf_reader.pages:
      text += page.extract_text()
  return text

def split_and_get_chunks(documents,chunk_size=150, chunk_overlap=20):
  splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
  # splitter.split_documents
  # splitter.create_documents
  splitted_documents = splitter.split_text(documents)
  return splitted_documents


def create_vectore_stores(splitted_documents, directory="embeddings_vec/chroma/"):
  embeddings = OpenAIEmbeddings()
  PERSIST_DIRECTORY = directory
  # Chroma.from_documents
  vector_db = Chroma.from_texts(texts = splitted_documents,
                                    embedding=embeddings,
                                    persist_directory=PERSIST_DIRECTORY
                                    )
  return vector_db

def conversationQa(vector_store, search_type="mmr", k=4, fetch_k=20, chain_type = "stuff"):
  memory = ConversationBufferMemory(memory_key = "chat_history",
                                      return_messages = True)

  llm = ChatOpenAI(temperature=0)
  conversation_chain = ConversationalRetrievalChain.from_llm(
      llm,
      memory = memory,
      retriever = vector_store.as_retriever(search_type= search_type, search_kwargs = {"k":k, "fetch_k":fetch_k}),
      chain_type = chain_type
  )
  return conversation_chain

def question_and_answering(user_input):
  answer = st.session_state.conversations({"question": user_input})
  st.session_state.chat_history = answer["chat_history"]
  for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)


def main():
  _ = load_dotenv(find_dotenv())

  openai.api_key  = os.environ['OPENAI_API_KEY']
  st.set_page_config(page_title = "Chat WIth Your Data", page_icon=":books:")
  st.write(css, unsafe_allow_html = True)

  if "conversations" not in st.session_state:
    st.session_state.conversations= None
  if "chat_history" not in st.session_state:
    st.session_state.chat_history = None

  st.header("CHAT WITH YOUR DATA :books:")
  user_question = st.text_input("Ask a question about your documents")
  if user_question:
    question_and_answering(user_question)

  with st.sidebar:
    st.subheader("Your Documents")
    pdf_documents = st.file_uploader("upload your pdf files and click on 'Load' ", accept_multiple_files=True)
    if st.button("Load"):
      with st.spinner("loading"):
        # get pdf
        docs = get_pdf_document(pdf_documents)
        # createchunks
        chunks = split_and_get_chunks(docs)
        # vector store
        vectors = create_vectore_stores(chunks)
        # retrieve and chat
        st.session_state.conversations = conversationQa(vectors)


if __name__ == "__main__":
  main()

Writing my_chat_app.py


In [None]:
!ngrok authtoken  os.environ[NGROK_AUTHTOKEN]

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip

--2023-08-24 11:15:36--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 18.205.222.128, 54.237.133.81, 52.202.168.65, ...
Connecting to bin.equinox.io (bin.equinox.io)|18.205.222.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13921656 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2023-08-24 11:15:37 (54.2 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13921656/13921656]



In [None]:
!unzip ngrok-stable-linux-amd64.zip

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [None]:
get_ipython().system_raw('./ngrok http 8501 &')

In [None]:
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

http://480b-34-148-102-204.ngrok-free.app


In [None]:
!streamlit run /content/my_chat_app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.148.102.204:8501[0m
[0m


In [None]:
! pip install streamlit -q


In [None]:
!wget -q -O - ipv4.icanhazip.com

35.245.165.160


In [None]:
! streamlit run my_chat_app.py & npx localtunnel --port 8501

[..................] | fetchMetadata: sill resolveWithNewModule localtunnel@2.0[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.245.165.160:8501[0m
[0m
[K[?25hnpx: installed 22 in 2.396s
your url is: https://light-guests-check.loca.lt
[34m  Stopping...[0m
^C
