In [29]:
import os
from dotenv import load_dotenv, find_dotenv
# Load the variables from the .env file located by find_dotenv() into the
# process environment; override=True lets values from the file replace
# variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [30]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Read the speech explicitly as UTF-8. Without an explicit encoding, open()
# falls back to the platform default, which can garble non-ASCII characters
# in the text (for example the ligature in "manœuvre").
with open('churchill.txt', encoding='utf-8') as f:
    churchill = f.read()

# Splitter used to cut the speech into small, slightly overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,       # maximum size of a chunk, as measured by length_function
    chunk_overlap=20,     # maximum overlap between two consecutive chunks
    length_function=len,  # measure size in characters
)

In [31]:
# Split the whole speech into Document chunks with the splitter configured above.
chunks = text_splitter.create_documents(texts=[churchill])

# Inspect the result: one chunk as a full Document repr, one as plain text,
# followed by the total number of chunks produced.
print(chunks[2])
print(chunks[10].page_content)
chunk_count = len(chunks)
print(f'Now you have {chunk_count}')

page_content='which we boast when an absolute guarantee against invasion, still less against serious raids, could'
ourselves for every kind of novel stratagem and every kind of brutal and treacherous manÅ“uvre. I
Now you have 36


In [32]:
#### Embedding cost

In [33]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and the estimated embedding cost of some documents.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string, such as
            the chunks returned by ``create_documents``.
        model: Name of the embedding model whose tokenizer is used to count
            tokens.
        price_per_1k_tokens: Price in USD charged per 1,000 tokens.
            NOTE(review): the default is the value that used to be hard-coded
            here; confirm it against the provider's current pricing.

    Returns:
        None. Two lines are printed: the total number of tokens and the
        estimated cost in USD, formatted with six decimals.
    """
    # Imported lazily so that defining the function does not require tiktoken.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a temporary list of per-document counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Report the token count and estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total Tokens: 743
Embedding Cost in USD: 0.000297


In [34]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client, created with its default settings; it is used below to
# embed single chunks and to populate the vector store.
embeddings = OpenAIEmbeddings()

In [35]:
# Embed the first chunk to see what a single embedding vector looks like.
# `embeddings` is the OpenAIEmbeddings instance created in the previous cell.
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

[-0.006603145742401956, -0.01851298968227042, -0.007637615981493388, -0.010378290538400467, -0.007308466105967239, 0.016041008198059502, -0.013542158058504748, -0.02201944140557277, -0.027487357123037553, 0.003761710701157228, 0.018499556285921044, 0.008222024602043779, 0.021213361256161745, -0.008504152933734401, 0.008806433222594171, 0.015772316056685883, 0.03415095687430187, 0.023671907481378204, 0.010841787415935967, 0.0017918506014457998, -0.04057273500218149, 0.0011511841621624605, 0.01549018679367272, -0.021038710339814096, 0.011654585194844223, -0.004393140632712511, -0.0012393493240234386, -0.025915498875908716, 0.0049036585884222675, 0.006364680139798247, 0.013575745274668358, 0.0002716324430548115, -0.006593069763817382, -0.014684105712939153, -0.02505567955316493, -0.028535262621123443, -0.0056190553962333955, -0.028105352959751553, 0.004621530256731646, -0.011009720702786385, 0.01414671863622429, -0.007704789482498063, 0.004268869609287734, -0.014348239139238316, 0.0028666

In [36]:
#### Inserting the embeddings into a Pinecone index

In [37]:
pip install pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [38]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Configure the Pinecone client from environment variables. os.environ.get
# returns None when a variable is unset, so a missing PINECONE_API_KEY or
# PINECONE_ENV is passed on as None rather than raising here.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

In [39]:
# WARNING: destructive. Every index returned by list_indexes() is deleted,
# not only the one created by this notebook.
indexes = pinecone.list_indexes()
for existing_index in indexes:
    # Name the index in the message so the output shows what was removed.
    print(f'Deleting index {existing_index} ...', end='')
    pinecone.delete_index(existing_index)
    print('Done')


Deleting all indexes ...Done


In [40]:
# Name of the index that will hold the speech embeddings.
index_name = 'churchill'

# Create the index only when it is not already listed.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    print(f'Creating index {index_name} ...')
    # NOTE(review): dimension=1536 presumably matches the size of the vectors
    # returned by the embedding model used above; confirm before changing models.
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric='cosine',
    )
    print('Done!')

Creating index churchill ...
Done!


In [41]:
# Build the vector store from the chunks, passing the embeddings client and the
# name of the target index; the returned store is queried in the cells below.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [42]:
# Ask the vector store for the chunks most similar to the question.
query = 'where should we fight?'
result = vector_store.similarity_search(query=query)

# Show the raw list of returned documents.
print(result)

[Document(page_content='be. We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the'), Document(page_content='we shall fight on the seas and oceans, we shall fight with growing confidence and growing strength'), Document(page_content='growing strength in the air, we shall defend our island, whatever the cost may be. We shall fight'), Document(page_content='shall fight in the fields and in the streets, we shall fight in the hills; we shall never')]


In [46]:
# Print the text of every returned document, each followed by a divider line.
separator = '-' * 50
for doc in result:
    print(doc.page_content, separator, sep='\n')

be. We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the
--------------------------------------------------
we shall fight on the seas and oceans, we shall fight with growing confidence and growing strength
--------------------------------------------------
growing strength in the air, we shall defend our island, whatever the cost may be. We shall fight
--------------------------------------------------
shall fight in the fields and in the streets, we shall fight in the hills; we shall never
--------------------------------------------------


In [53]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model that writes the final answer.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=1,
)

# Retriever over the vector store: similarity search, k set to 3.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs=dict(k=3),
)

# Question-answering chain that combines the retriever with the chat model.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [54]:
# Run the question through the retrieval QA chain; the value returned by
# chain.run is kept in `answer` (it is not printed in this cell).
query = 'Where should we fight?'
answer = chain.run(query)