In [1]:
import getpass
import os

import magic
import nltk
from langchain import OpenAI, VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Never hardcode the OpenAI API key in the notebook. Reuse the value already
# exported in the environment, and only prompt for it (without echoing) when it
# is missing or empty, so the secret is never written into the saved notebook.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

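# One-time setup (run once per environment, not on every execution):
#
# Run once if the NLTK tagger data has not been downloaded yet:
# nltk.download('averaged_perceptron_tagger')
#
# Packages to install:
# pip install unstructured
# pip install python-magic-bin
# pip install chromadb
# Other dependencies to install: https://langchain.readthedocs.io/en/latest/modules/document_loaders/examples/unstructured_file.html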

In [2]:
# Configure a loader for our documents in a local directory; the glob selects
# the .txt files to pick up. Nothing is read until the loader is asked to load.
loader = DirectoryLoader(
    "../data/PaulGrahamEssaySmall/",
    glob="**/*.txt",
)

In [3]:
# Load the files selected by the loader and keep their content in `documents`
documents = loader.load()

In [4]:
# Configure how our documents will be split: chunks of size 1000 with no
# overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)

In [5]:
# Split our documents with the configured splitter; `texts` holds the resulting chunks
texts = text_splitter.split_documents(documents)

In [6]:
# Create the OpenAI embeddings object, authenticated with the API key taken
# from the environment. ("Embeddings" are vectors that capture the semantic
# meaning of our document chunks.)
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [7]:
# Build a "vector store" from the document chunks, indexed by their embeddings.
# NOTE: per the log message printed by this cell, the store uses embedded DuckDB
# without persistence, so its data is transient.
docsearch = Chroma.from_documents(texts, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [8]:
# Create a "stuff" chain that searches our vector store for the document
# chunks that are useful to our LLM when answering a question.
qa = VectorDBQA.from_chain_type(
    llm=OpenAI(),
    vectorstore=docsearch,
    chain_type="stuff",
)



In [9]:
# Question to ask
query = "What did McCarthy discover?"

# Run the chain on the question; its answer is shown as the cell output
qa.run(query)

' John McCarthy published a paper in 1960 in which he showed how, given a handful of simple operators and a notation for functions, you can build a whole programming language, which he called Lisp.'

### Sources

In [10]:
# Create a "stuff" chain that searches our vector store for the documents that
# are useful to our LLM. Setting return_source_documents=True makes the chain
# also report which document chunks were used.
qa = VectorDBQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    vectorstore=docsearch,
    return_source_documents=True,
)

# Store the question in `query`
query = "What did McCarthy discover?"

# Call the chain with the question; `result` is a dictionary holding the
# answer ('result') and the chunks used ('source_documents')
result = qa(
    {"query": query}
)

In [11]:
# Answer text returned by the chain
result['result']

' McCarthy discovered that, given a handful of simple operators and a notation for functions, you can build a whole programming language. He called this language Lisp, for "List Processing," because one of his key ideas was to use a simple data structure called a list for both code and data.'

In [12]:
# Document chunks that were used to produce the answer.
result['source_documents']

[Document(page_content='May 2001\n\n(I wrote this article to help myself understand exactly\n\nwhat McCarthy discovered.  You don\'t need to know this stuff\n\nto program in Lisp, but it should be helpful to\n\nanyone who wants to\n\nunderstand the essence of Lisp \x97 both in the sense of its\n\norigins and its semantic core.  The fact that it has such a core\n\nis one of Lisp\'s distinguishing features, and the reason why,\n\nunlike other languages, Lisp has dialects.)In 1960, John\n\nMcCarthy published a remarkable paper in\n\nwhich he did for programming something like what Euclid did for\n\ngeometry. He showed how, given a handful of simple\n\noperators and a notation for functions, you can\n\nbuild a whole programming language.\n\nHe called this language Lisp, for "List Processing,"\n\nbecause one of his key ideas was to use a simple\n\ndata structure called a list for both\n\ncode and data.It\'s worth understanding what McCarthy discovered, not\n\njust as a landmark in the histo