In [3]:
import os
import re
from langchain import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains.summarize import load_summarize_chain

from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.question_answering import load_qa_chain

from dotenv import load_dotenv
import gradio as gr


# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI calls below — confirm.
load_dotenv()

True

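**Summary:** This notebook loads every PDF in a tax-law folder, embeds the resulting chunks with OpenAI embeddings into a FAISS vector store, and answers free-text questions with a LangChain `RetrievalQAWithSourcesChain`. The chatbot is exposed through a simple Gradio text interface and lists the source documents behind each answer.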


In [4]:
def source_name(docs):
    """
    Set docs.metadata['law'] to the bare filename of docs.metadata['source'].

    The directory part and the final extension (e.g. ".pdf") are removed, so
    "/some/folder/irs.pub.17.pdf" becomes "irs.pub.17".

    Args:
        docs: an object with a ``metadata`` dict that contains a 'source'
            path string. The dict is updated in place.

    Returns:
        None. The result is stored under docs.metadata['law'].

    Raises:
        KeyError: if docs.metadata has no 'source' entry.
    """
    # Split on both separators so Windows-style paths are reduced to the
    # filename as well, whatever platform the notebook runs on.
    filename = re.split(r'[\\/]', docs.metadata['source'])[-1]
    # splitext drops only the last extension; splitting on the first "."
    # would truncate names that contain dots themselves.
    docs.metadata['law'] = os.path.splitext(filename)[0]


### Loads every PDF in the folder and splits it into separate document chunks. Then runs the `source_name` function above on each chunk.

In [5]:
# Folder of PDFs to index. Set TAX_FOLDER (e.g. in the .env file loaded above)
# to point the notebook at another location; the default is the original path.
TAX_FOLDER = os.environ.get("TAX_FOLDER", "/Users/gv658da/Documents/tax_folder")

# Note: `documents` is the loader object; `texts` holds the loaded-and-split chunks.
documents = PyPDFDirectoryLoader(TAX_FOLDER)
texts = documents.load_and_split()

# Fail fast with a clear message: an empty list gives the later embedding /
# vector-store step nothing to index.
if not texts:
    raise ValueError(f"No PDF content could be loaded from {TAX_FOLDER!r}")

# Tag every chunk with the bare filename it came from (metadata['law']).
for text in texts:
    source_name(text)

### Creates a vector store of OpenAI embeddings from the documents, and sets up the ability to retrieve those documents from unstructured queries.


*TODO: explain why FAISS was chosen instead of another vector store.*

In [6]:
# Embed every chunk with OpenAI embeddings, index the vectors in FAISS,
# then expose the index through the retriever interface.
embedding_model = OpenAIEmbeddings()
vector_store = FAISS.from_documents(texts, embedding_model)
retriever = vector_store.as_retriever()

### Sets up an OpenAI language model (through LangChain's `OpenAI` wrapper) as the large language model used in the analysis. A temperature of 0 means the model's responses will be as deterministic as possible (minimal randomness).

In [7]:
# Instantiate LangChain's OpenAI LLM wrapper with the sampling temperature fixed at 0.
llm = OpenAI(temperature=0)

### Builds the question-answering chain: the retriever finds the document chunks whose embeddings best match the query, and the LLM answers from those chunks while reporting their sources.

*TODO: add more detail about how the QA chain works.*

In [8]:
# Combine the LLM and the retriever from the earlier cells into a RetrievalQAWithSourcesChain.
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

### Runs the chain on a query and returns the answer as readable text, followed by a line listing the source documents the answer was drawn from.

In [9]:
def chatty(query):
    """
    Ask the module-level ``chain`` a question and format its reply as text.

    Args:
        query: the question, passed to the chain under the "question" key.

    Returns:
        The chain's 'answer', a newline, and then a line naming the chain's
        'sources'.
    """
    response = dict(chain({"question": query}, return_only_outputs=True))
    answer = response['answer']
    sources = response['sources']
    return answer + "\n" + "Here's where I found this information: " + sources

### Creates a convenient interface for the `chatty` function using Gradio. The user enters a query on the left-hand side, and chatty's response appears on the right-hand side. `demo.launch()` starts a local server and serves the interface at the URL printed below.

In [10]:
# Wrap chatty in a Gradio interface with one text input and one text output.
demo = gr.Interface(
    fn=chatty,
    inputs=["text"],
    outputs=["text"],
)
# Start the Gradio app; the local URL is printed in the cell output.
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


