# **Splitting and Embedding Text Using LangChain**

In [14]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [15]:
speech_path = os.getenv('CHURCHILL_SPEECH')
try:
    with open(speech_path) as f:
        churchill_speech = f.read()
        print('File Read Correctly!')
except Exception as e:
    print(f"Error opening file: {e}")

File Read Correctly!


In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter= RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [17]:
chunks= text_splitter.create_documents([churchill_speech])
print(chunks[2])

page_content='could have saved the British and French Armies who had entered Belgium at the appeal of the Belgian'


In [18]:
# Print Only Text
chunks= text_splitter.create_documents([churchill_speech])
print(chunks[2].page_content)

could have saved the British and French Armies who had entered Belgium at the appeal of the Belgian


In [20]:
print(f'Now you have: {len(chunks)}')

Now you have: 271


## **Embedding Cost**

In [21]:
def print_embedding_cost(texts):
    import tiktoken
    enc=tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens=sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens/1000*0.0004:.6f}')

print_embedding_cost(chunks)

Total Tokens: 5413
Embedding Cost in USD: 0.002165


In [22]:
from langchain.embeddings import OpenAIEmbeddings
embedding=OpenAIEmbeddings()

In [23]:
vector=embedding.embed_query('abc')
vector

[0.0026003900945243616,
 -0.011285445990905637,
 -0.00940453863619554,
 -0.03911722245794239,
 -0.03425231768417376,
 0.01206326514391234,
 -0.021227386595400844,
 -0.022853734632851876,
 0.018653512882996225,
 -0.0005285633100776219,
 0.0034188677622028355,
 0.0191484884536398,
 -0.002685243122667173,
 -0.004553776460640192,
 -0.018681797148100282,
 0.003090062481876243,
 0.02708224064781159,
 0.010521768970450963,
 0.010663190295971264,
 0.007410493289746676,
 -0.014382579539057895,
 0.01774841453702125,
 -0.006979157315587234,
 -0.01538667281289708,
 -0.02015258265880151,
 -0.0034754365252415874,
 0.009850016277245749,
 -0.020124298393697452,
 0.02624785389591929,
 -0.007353924759538556,
 0.007778189667421983,
 0.01358354765288441,
 -0.007452919687402766,
 -0.009941940604495207,
 -0.010352063379826604,
 -0.014255300346089626,
 -0.01364011618309253,
 -0.015782653455676445,
 0.010203570522369025,
 -0.0002733415023606986,
 0.024324520143553102,
 0.004642165021921011,
 0.013951243099237

In [24]:
vector= embedding.embed_query(chunks[0].page_content)
print(vector)

[-0.009225539907269376, 0.0010092550704136824, -0.003830427787906608, 0.01771953896719268, -0.011501443627396225, 0.03839233536024005, -0.025509091216240315, -0.038609087652003285, 0.010519282942698913, -0.033596679729107556, 0.014806923527366171, 0.029559658981791536, 0.012409095711799925, -0.02416793268566499, 0.0026382876220717296, 0.006394206576854081, -0.0022386496528106884, 0.023775069156844127, 0.00606230416443034, -0.034761723297334936, 0.005818456904874123, -0.02529233892447708, -0.0017357155530909089, 0.012395548693564722, 0.011054391094311977, 0.001491868642780659, 0.005906512989064226, -0.03695634770201831, -0.00919844587079897, -0.0024655626739116133, -0.0017001545138081808, -0.020605059439226203, -0.01701509215631701, -0.019832877537174522, -0.012185569910919089, -0.00886654299271394, -0.004744853381427909, -0.009300048507562986, 0.011162767240193593, -0.003759305709341152, 0.02717537818446049, -0.015226882023980017, -6.449400219883792e-06, 0.016771245828083373, -0.026267

## **Inserting the Embeddings Into Pinecone Index**

In [28]:
import os
import pinecone
from langchain.vectorstores import Pinecone
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

In [32]:
# Deleting All Indexes
indexes= pinecone.list_indexes()
for i in indexes:
    print('Deleting All Indexes...!', end='')
    pinecone.delete_index(i)
    print('Done!')
print('Done!')

Done!


In [33]:
index_name='churchill-speach'
if index_name not in pinecone.list_indexes():
    print(f'Creating Index {index_name}...!')
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating Index churchill-speach...!
Done!


In [34]:
from langchain.embeddings import OpenAIEmbeddings
embeddings=OpenAIEmbeddings()

In [35]:
vector_store= Pinecone.from_documents(chunks, embeddings, index_name=index_name)

## **Asking Questions (Similarity Search)**

In [47]:
# Define the index name (Ensure this is the correct name of the index you've created)
index_name = index_name

# Connect to the Pinecone vector index
vector_store = vector_store

In [42]:
try:
    query= 'Where should we fight?'
    result= vector_store.similarity_search(query)
    print(result)
except Exception:
    print(e)

[Document(page_content='on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the'), Document(page_content='fields and in the streets, we shall fight in the hills; we shall never surrender, and even if,'), Document(page_content='When we consider how much greater would be our advantage in defending the air above this Island'), Document(page_content='front, now on that, fighting on three fronts at once, battles fought by two or three divisions')]


In [43]:
for r in result:
    print(r.page_content)
    print('-' * 30)

on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the
------------------------------
fields and in the streets, we shall fight in the hills; we shall never surrender, and even if,
------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
------------------------------
front, now on that, fighting on three fronts at once, battles fought by two or three divisions
------------------------------


In [44]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [50]:
llm= ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
retriever= vector_store.as_retriever(search_type='similarity', search_kwards={'k':3})
chain= RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

In [52]:
# Define the index name (Ensure this is the correct name of the index you've created)
index_name = index_name

# Connect to the Pinecone vector index
vector_store = vector_store

In [53]:
query='Where should we fight?'
answer=chain.run(query)
print(answer)

We should fight on the beaches, landing grounds, fields, streets, and hills.


In [54]:
query='What about the French Armies?'
answer=chain.run(query)
print(answer)

The French Armies were involved in the fighting during a military disaster in France and Belgium. They were working to reopen communications to Amiens and had planned to advance across the Somme. However, beyond these details, I don't have any further information about the specific actions or outcomes of the French Armies.
