# Splitting and Embedding Txt Using LangChain

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
with open('../files/churchill_speech.txt') as f:
    churchill_speech = f.read()
    
    
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [6]:
!pwd

/Users/walkyz/code/Walky-Z/langchain/notebooks


In [8]:
chunks = text_splitter.create_documents([churchill_speech])

In [13]:
print(f'Now you have {len(chunks)}')

Now you have 281


## Embedding Cost

In [16]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0004:.6f}')
    
print_embedding_cost(chunks)

Total Tokens: 5642
Embedding Cost in USD: 0.002257


In [31]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

In [32]:
vector = embeddings.embed_query(chunks[0].page_content)
vector

[0.048597493583205226,
 0.01679310620242673,
 0.07459263703946342,
 0.040762339882407045,
 -0.01440106557255407,
 0.018892245874713973,
 -0.03343976378870595,
 -0.04500943170707363,
 -0.031853203586161,
 -0.03790653226288421,
 0.07659414616363086,
 -0.006117401808932943,
 0.015670312989531955,
 0.003981650448270125,
 0.005897724693759978,
 -0.01974654623271458,
 0.029046217759956257,
 0.035636534009113,
 0.016024236066490706,
 0.04188513495794702,
 0.0014294278027803995,
 0.03858997403940085,
 -0.02179686783697412,
 -0.004262349217155118,
 -0.022382673477435357,
 0.023102727407988882,
 0.015011280247029161,
 -0.05901996204863207,
 0.023651919031768043,
 -0.0652197393415028,
 0.032561053465368896,
 -0.021076813906420595,
 0.015841171571015916,
 0.057067272855137546,
 0.009507144125407716,
 -0.063706407172322,
 0.011630690969063608,
 -0.019526869583202915,
 -0.033171267208521385,
 0.024994392619464883,
 0.017110418987993804,
 -0.05682318810293463,
 -0.01942923344714751,
 0.05823888413606

## Inserting Embedding into Pinecone Index

In [20]:
import pinecone
from langchain_community.vectorstores import Pinecone
pc = pinecone.Pinecone()

In [34]:
# Delete all index (for free plan!)

for i in pc.list_indexes().names():
    print('Deleting all indexes ...', end='')
    pc.delete_index(i)
    print('done')

Deleting all indexes ...done


In [35]:
index_name = 'churchill-speech'

In [36]:
def create_index(name, region='us-east-1'):
    if name not in pc.list_indexes().names():
        print(f'Creating index: {name}')
        pc.create_index(
            name=name,
            dimension=1536,
            metric='cosine',
            spec=pinecone.PodSpec(
                environment='gcp-starter'
            )
        )
        print('Index created! 😀')
    else:
        print(f'Index {name} already exists!')

In [37]:
create_index(index_name)

Creating index: churchill-speech
Index created! 😀


In [38]:
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [39]:
# Loading the vector store from an existing index
vector_store = Pinecone.from_existing_index(index_name='churchill-speech', embedding=embeddings)

## Asking Questions (Similarity Search

In [40]:
query = 'Where should we fight?'
result = vector_store.similarity_search(query)

print(result)

[Document(page_content='on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the'), Document(page_content='flag or fail. We shall go on to the end, we shall fight in France, we shall fight on the seas and'), Document(page_content='fields and in the streets, we shall fight in the hills; we shall never surrender, and even if,'), Document(page_content='in the air, we shall defend our Island, whatever the cost may be, we shall fight on the beaches, we')]


In [41]:
for r in result:
    print(r.page_content)
    print('-' * 50)

on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the
--------------------------------------------------
flag or fail. We shall go on to the end, we shall fight in France, we shall fight on the seas and
--------------------------------------------------
fields and in the streets, we shall fight in the hills; we shall never surrender, and even if,
--------------------------------------------------
in the air, we shall defend our Island, whatever the cost may be, we shall fight on the beaches, we
--------------------------------------------------


In [53]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model='gpt-4o', temperature=0)

retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [45]:
query = 'Where should we fight?'
answer = chain.run(query)

In [46]:
print(answer)

We shall fight in France, on the beaches, on the landing grounds, in the fields and in the streets, and in the hills.


In [54]:
query2 = 'Who was the king of Belgium at that time?'
answer2 = chain.run(query2)
print(answer2)

The king of Belgium at that time was King Leopold III.


In [49]:
query3 = 'What about the French Armies?'
answer3 = chain.run(query3)
print(answer3)

The French Armies have been weakened according to the context provided.


In [55]:
query3 = 'What about the French Armies?'
answer3 = chain.run(query3)
print(answer3)

The French Army has been weakened, and a large part of their fortified positions has been lost.
