## Prepare environment

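Open a terminal in the project root (the notebook reads from `../data/001-alice`, so the `data` folder must sit one level above the notebook's folder) and run the following commands: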
```
mkdir -p data/001-alice
```

```
curl https://www.gutenberg.org/cache/epub/11/pg11.txt > data/001-alice/alice.txt
```

In [10]:
# Load variables from a .env file (python-dotenv) so later cells can read the
# API keys through os.environ / os.getenv. The call's return value is the cell output.
from dotenv import load_dotenv
load_dotenv()

True

## Load data

In [25]:
from langchain.document_loaders import DirectoryLoader

def load_docs(directory):
  """Load the documents that langchain's DirectoryLoader finds in `directory`.

  Args:
    directory: Path passed straight to `DirectoryLoader`.

  Returns:
    The result of `DirectoryLoader(directory).load()`.
  """
  return DirectoryLoader(directory).load()

In [26]:
# Folder holding the source text, relative to the notebook's working directory.
dirToLoad = "../data/001-alice"
documents = load_docs(directory=dirToLoad)
# Display how many documents were loaded.
len(documents)

1

In [None]:
#import os
#import openai
#import pinecone
#import langchain

#from langchain.embeddings.openai import OpenAIEmbeddings
#from langchain.vectorstores import Pinecone
#from langchain.document_loaders import DirectoryLoader
#from langchain.document_loaders import UnstructuredFileLoader

#from langchain.chat_models import ChatOpenAI
#from langchain.chains.question_answering import load_qa_chain


## Split text

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  """Split `documents` into chunks with a RecursiveCharacterTextSplitter.

  Args:
    documents: Documents to split (as returned by `load_docs`).
    chunk_size: Forwarded to the splitter's `chunk_size` argument.
    chunk_overlap: Forwarded to the splitter's `chunk_overlap` argument.

  Returns:
    The result of the splitter's `split_documents(documents)` call.
  """
  return RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  ).split_documents(documents)


In [28]:
# Chunk the loaded documents using split_docs' default size and overlap.
texts = split_docs(documents=documents)
print(len(texts))

172


## Get embeddings

In [30]:
import os
from langchain.embeddings import OpenAIEmbeddings
# os.environ[...] raises KeyError when OPENAI_API_KEY is not set, so a missing key fails in this cell.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])

## Create index

In [33]:
import pinecone

# The variable names were originally spelled "PINECODE_*" (likely a typo for
# "PINECONE_*"). The legacy spelling is still read first so an existing .env
# keeps working; the conventional "PINECONE_*" spelling is accepted as a fallback.
pinecone.init(
    api_key=os.getenv('PINECODE_API_KEY') or os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECODE_ENVIRONMENT') or os.getenv('PINECONE_ENVIRONMENT')
)

  from tqdm.autonotebook import tqdm


In [40]:
# Print the index names returned by the initialised Pinecone client.
print(pinecone.list_indexes())

['ai-repo-reader-1']


In [41]:
# Name of the Pinecone index that the following cells delete, recreate and fill.
index_name = "ai-repo-reader-1"

In [42]:
# Destructive: removes the existing index so it can be recreated empty.
# Guarded so that a fresh run, where the index does not exist yet, does not fail.
if index_name in pinecone.list_indexes():
  pinecone.delete_index(index_name)

In [43]:
# NOTE(review): 1536 presumably matches the vector size produced by `embeddings` (OpenAI) — confirm.
pinecone.create_index(index_name, dimension=1536)

## Search

In [44]:
from langchain.vectorstores import Pinecone

# Build the vector store from the raw text of each chunk; only page_content is
# passed, so the chunks' metadata is not handed to from_texts.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [45]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# The LLM is created with the same OPENAI_API_KEY lookup used for the embeddings.
llm = OpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    temperature=0,
)
# Question-answering chain built on that LLM with the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [46]:
query = "What did Alice found in the little glass box?"
# Retrieve the chunks most similar to the question from the vector store ...
docs = docsearch.similarity_search(query, include_metadata=True)
# ... and have the QA chain answer the question from them (the result is the cell output).
chain.run(question=query, input_documents=docs)

' Alice found a very small cake with the words "EAT ME" marked on it in currants.'