In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import nltk

<h1> Convert PDF to Document Type </h1>

In [88]:
# PDF parser: build a loader for the book's PDF.
# The path is relative, so the file must sit in the notebook's working directory.
loader = UnstructuredPDFLoader("7_Habits.pdf")

In [89]:
# Run the loader. `data` is a sequence of documents, each exposing its text
# as `.page_content`.
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [90]:
# Sanity check: how many documents were loaded, and how long the first one is.
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(data[0].page_content)} characters in your document')

You have 1 document(s) in your data
There are 762522 characters in your document


<h1>Break up the data into smaller parts</h1>
The model can only accept a limited amount of input at once. Breaking the document into smaller chunks and passing along only the relevant ones lets us work around this constraint.

In [91]:
# Split the loaded document into chunks of at most 1000 characters each,
# with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [92]:
# Report how many chunks the splitter produced.
print(f'Now you have {len(texts)} documents')

Now you have 963 documents


<h1> Create embeddings </h1>
Embeddings convert the text of each document chunk into a numeric vector, so that the chunks most relevant to a question can be found by similarity search.

In [93]:
import chromadb

# In-memory Chroma client. Chroma's startup notice states that the data is
# transient, so the index has to be rebuilt after every kernel restart.
client = chromadb.Client()

# Use get_or_create_collection instead of create_collection so that re-running
# this cell does not fail when a collection named "book" already exists.
# No embedding_function is passed, so Chroma falls back to its default one.
collection = client.get_or_create_collection("book")

Using embedded DuckDB without persistence: data will be transient
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


In [94]:
# Index every chunk. Ids are positional ("id0", "id1", ...), so an id maps
# straight back to the chunk's position in `texts`.
chunk_texts = [chunk.page_content for chunk in texts]
chunk_ids = [f"id{position}" for position in range(len(texts))]
collection.add(documents=chunk_texts, ids=chunk_ids)

<h1>Use Langchain to connect embeddings to GPT </h1> 

In [95]:
from langchain.llms import OpenAI
import os
from langchain.chains.question_answering import load_qa_chain

In [96]:
# Set up the model and the question-answering chain.
# The API key is read from the OPENAI_API_KEY environment variable rather than
# being hardcoded. Author's note on the model used: text-davinci-003.
llm = OpenAI(
    openai_api_key=os.getenv('OPENAI_API_KEY'),
    temperature=0,
)
chain = load_qa_chain(llm, chain_type="stuff")

In [100]:
# Ask the collection for the top 2 matching chunks for the question.
# NOTE(review): the question reads "put things first things first"; it probably
# meant "put first things first". Left unchanged because the recorded outputs
# were produced with this exact text.
query = "Why should I put things first things first?"
results = collection.query(n_results=2, query_texts=[query])
results

{'ids': [['id460', 'id311']],
 'embeddings': None,
 'documents': [['The first generation of time management does not even recognize the concept of priority. It gives  us  notes  and  “to  do”  lists  that  we  can  cross  off,  and  we  feel  a  temporary  sense  of accomplishment every time we check something off, but no priority is attached to items on the list.  In  addition,  there  is  no  correlation  between  what’s  on  the  list  and  our  ultimate  values  and purposes in life. We simply respond to whatever penetrates our awareness and apparently needs to be done.\n\nMany people manage from this first-generation paradigm. It’s the course of least resistance. There’s  no  pain  or  strain;  it’s  fun  to  “go  with  the  flow.”  Externally  imposed  disciplines  and schedules give people the feeling that they aren’t responsible for results.',
   'To  the  extent  to  which  we  understand  the  principle  of  two  creations  and  accept  the responsibility for both, we act wit

In [101]:
from langchain.docstore.document import Document

# results['documents'] holds one list of matched texts per query text.
# Flatten it and wrap each text in a Document so the QA chain can consume it.
documents = [
    Document(page_content=matched_text)
    for matches_for_query in results['documents']
    for matched_text in matches_for_query
]

documents

[Document(page_content='The first generation of time management does not even recognize the concept of priority. It gives  us  notes  and  “to  do”  lists  that  we  can  cross  off,  and  we  feel  a  temporary  sense  of accomplishment every time we check something off, but no priority is attached to items on the list.  In  addition,  there  is  no  correlation  between  what’s  on  the  list  and  our  ultimate  values  and purposes in life. We simply respond to whatever penetrates our awareness and apparently needs to be done.\n\nMany people manage from this first-generation paradigm. It’s the course of least resistance. There’s  no  pain  or  strain;  it’s  fun  to  “go  with  the  flow.”  Externally  imposed  disciplines  and schedules give people the feeling that they aren’t responsible for results.', metadata={}),
 Document(page_content='To  the  extent  to  which  we  understand  the  principle  of  two  creations  and  accept  the responsibility for both, we act within and en

In [102]:
# Answer the question using only the retrieved chunks as input documents.
chain.run(input_documents=documents, question=query)

' Putting first things first helps you to take charge of your life and be responsible for the results. It allows you to create your life by design, rather than by default, and to act within and enlarge your Circle of Influence.'