In [9]:
# pip install langchain --upgrade
# Version: 0.0.164

# !pip install pypdf

In [10]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

Load your data

In [11]:
# Location of the PDF to index, held in a variable so it is easy to swap out.
pdf_path = "../content/machine_learning/thebook.pdf"
loader = PyPDFLoader(pdf_path)

# Alternative loaders, left here for reference:
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [12]:
# Run the loader and keep what it returns in `data`; the later cells count,
# inspect and split these documents.
data = loader.load()

In [13]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print(f'You have {len(data)} document(s) in your data')

# Report the size of one sample document. The index is clamped to the last
# available entry so PDFs with fewer than 31 pages no longer raise IndexError,
# and the line is skipped entirely when nothing was loaded.
if data:
    sample_doc = data[min(30, len(data) - 1)]
    print(f'There are {len(sample_doc.page_content)} characters in your document')

You have 234 document(s) in your data
There are 2163 characters in your document


Chunk your data up into smaller documents

In [14]:

# Note: with PyPDFLoader the data has already been split by page, so this is a
# second split. It is optional; try it out on your own data.

splitter_options = {"chunk_size": 2000, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(data)

In [15]:
# Show how many documents exist after the splitter has run.
print (f'Now you have {len(texts)} documents')

Now you have 292 documents


Create embeddings of your documents to get ready for semantic search

In [16]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [17]:
# Check to see if there is an environment variable with your API key; if not,
# fall back to the default given as the second argument below.
# Prefer exporting OPENAI_API_KEY in your shell over pasting a key into the notebook.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')

# PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'YourAPIKey')
# PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-east1-gcp') # You may need to switch with your env

In [18]:
# Create the OpenAI embeddings object, authenticated with OPENAI_API_KEY;
# it is handed to the vector store when the chunks are uploaded.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [22]:
 # initialize pinecone
# Name of your Pinecone index. Defined once, before first use, so the index
# handle below and the later upload cell always refer to the same index.
index_name = 'prueba1'

# Take the credentials from the environment instead of typing them into the
# notebook. The defaults reproduce the previously hard-coded values, so the
# cell behaves as before when the variables are not set.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', ''),
    environment=os.environ.get('PINECONE_API_ENV', 'us-west1-gcp-free'),
)
index = pinecone.Index(index_name)

In [25]:
# Embed the chunks and upload them to the Pinecone index named by `index_name`.
# from_documents sends the same page_content as before but also forwards each
# chunk's metadata, which the previous from_texts call on bare strings dropped.
# NOTE(review): re-running this cell uploads the chunks again — confirm whether
# the index should be cleared first.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [32]:

# Ask a question and retrieve the stored chunks that the vector store's
# similarity search returns for it.
query = "What is the Central Limit Theorem?"
docs = docsearch.similarity_search(query)

In [33]:
# Preview the first 450 characters of the first document that came back
first_hit = docs[0]
preview = first_hit.page_content[:450]
print(preview)

40 2 Density Estimation
101102103123456
Fig. 2.3. Five instantiations of a running average over outcomes of a toss of a dice.
Note that all of them converge to the mean 3 .5. Moreover note that they all are
well contained within the upper and lower envelopes given by µ±√
VarX[x]/m.
The central limit theorem answers this question exactly by addressing a
slightly more general question, namely whether the sum over a number of
independent random vari


Query those docs to get your answer back

In [34]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [35]:
# LLM used to answer questions: temperature 0, authenticated with OPENAI_API_KEY.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

# Question-answering chain built on that LLM with the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [36]:
# Second question: fetch the chunks the similarity search returns for it so
# they can be passed to the QA chain.
# NOTE(review): this question looks like a leftover aimed at the
# field-guide-to-data-science PDF from the commented-out loaders, not the
# machine-learning PDF actually loaded — that would explain the
# "There is no collect stage" answer. Confirm and replace with an on-topic question.
query = "What is the collect stage of data maturity?"
docs = docsearch.similarity_search(query)

In [37]:
# Run the QA chain with the retrieved documents and the question; as the
# cell's last expression, the returned answer is displayed as the cell output.
chain.run(input_documents=docs, question=query)

' There is no collect stage of data maturity.'