In [20]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load your data

In [21]:
# Load the data from a local PDF file
loader = UnstructuredPDFLoader(
    "../data/field-guide-to-data-science.pdf"
)

# Alternative: load the same PDF from the web instead of from disk
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [22]:
# Run the loader configured above; the result is kept in `data`
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [23]:
# Number of documents loaded (one per PDF)
doc_count = len(data)
print(f"You have {doc_count} document(s) in your data")

# Number of characters in the first document
char_count = len(data[0].page_content)
print(f"There are {char_count} characters in your document")

You have 1 document(s) in your data
There are 201014 characters in your document


### Chunk your data up into smaller documents

In [24]:
# Configure how the data is split (chunk_size=1000, no overlap between chunks)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Split the loaded data into smaller documents
texts = text_splitter.split_documents(data)

In [25]:
# Report how many documents the splitter produced
chunk_count = len(texts)
print(f"Now you have {chunk_count} documents")

Now you have 240 documents


### Create embeddings of your documents to get ready for semantic search

In [26]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

# Pinecone is a cloud-hosted database where we will store our embeddings
import pinecone

In [31]:
# Read the credentials from environment variables instead of typing them into
# the notebook, so secrets are never saved in the file or leaked when sharing it.
# Each value falls back to "" (the previous placeholder) when the variable is unset.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "")

In [32]:
# Create the OpenAI embeddings object, authenticated with the OpenAI API key
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [33]:
# Initialize the Pinecone client
pinecone.init(
    environment=PINECONE_API_ENV,  # shown next to the API key in the console
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
)

# Name of the Pinecone index used by the rest of the notebook
index_name = "langchain"

In [34]:
# Upload the document chunks, together with their embeddings, to the Pinecone index.
# `from_documents` is used instead of `from_texts` on a list of `page_content`
# so that each chunk's metadata is stored alongside its text rather than dropped.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [35]:
# Ask a question
query = "What are examples of good data science teams?"

# Find the documents whose meaning is most similar to the question (using the embeddings)
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

### Query those docs to get your answer back

In [36]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [37]:
# Load the LLM
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

# Start a question-answering chain of type "stuff" with the LLM: the documents
# retrieved by the similarity search are placed into the prompt together with
# the question.
chain = load_qa_chain(llm, chain_type="stuff")

In [38]:
# Question
query = "What is the collect stage of data maturity?"

# Documents most similar to the question
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [39]:
# Answer produced by the LLM from the retrieved documents
chain.run(
    input_documents=docs,
    question=query,
)

' The collect stage of data maturity is the first stage of data science maturity. It focuses on collecting internal or external datasets. An example of this stage is gathering sales records and corresponding weather data.'

In [41]:
# Question
query = "what are the steps in cleaning the data?"

# Documents most similar to the question
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [42]:
# Answer produced by the LLM from the retrieved documents
chain.run(
    input_documents=docs,
    question=query,
)

' The steps in cleaning the data include outlier removal, Gaussian filter, exponential smoothing, median filter, distribution fitting, feature hashing, wrapper methods, sensitivity analysis, self organizing maps, deduplication, normalization, format conversion, fast Fourier transform (FFT), discrete wavelet transform, coordinate transform.'

In [43]:
# Question
query = "what is tokenization POS staging?"

# Documents most similar to the question
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

# Answer produced by the LLM from the retrieved documents
chain.run(
    input_documents=docs,
    question=query,
)

' Tokenization POS staging is a part-of-speech tagging process that eliminates words other than nouns and verbs and uses raw term counts instead of TF/IDF weighted terms.'