### Pinecone

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file, overriding any already set in the process.
load_dotenv(find_dotenv(), override=True)

# Never echo the key itself: cell outputs are saved with the notebook and would leak it.
# Only report whether the key was found.
print('PINECONE_API_KEY loaded:', bool(os.getenv('PINECONE_API_KEY')))

[REDACTED: an API key was printed here; rotate that key and clear this cell's output before sharing the notebook]


In [2]:
pip install -q pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install --upgrade -q pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip show pinecone-client

Name: pinecone-client
Version: 5.0.1
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: C:\Users\Jeet\AppData\Local\Programs\Python\Python312\Lib\site-packages
Requires: certifi, pinecone-plugin-inference, pinecone-plugin-interface, tqdm, typing-extensions, urllib3
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
from pinecone import Pinecone, ServerlessSpec

# Initialize the Pinecone client; the API key is read from the environment
# (keep it in .env rather than pasting it into the notebook).
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
)

# Define the index name and its properties
index_name = "langchain"
dimension = 1536
metric = "cosine"

# Create the index only if it doesn't exist, and report what actually happened:
# the success message must not be printed when nothing was created.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud='aws',   # Specify cloud provider
            region='us-east-1'  # Specify the region
        )
    )
    print("Index created successfully!")
else:
    print(f"Index {index_name} already exists")



Index created successfully!


### Working with Pinecone Indexes

In [6]:
# Show the first entry of the index listing (assumes at least one index exists)
pc.list_indexes()[0]

{'deletion_protection': 'disabled',
 'dimension': 1536,
 'host': 'langchain-ozgp58x.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'langchain',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [7]:
# Describe a single index by name
pc.describe_index('langchain')

{'deletion_protection': 'disabled',
 'dimension': 1536,
 'host': 'langchain-ozgp58x.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'langchain',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [8]:
# List only the names of the existing indexes
pc.list_indexes().names()

['langchain']

### Re-creating the index after deleting it from Pinecone

In [35]:
from pinecone import ServerlessSpec

index_name = "langchain"

# Report and stop if the index is already there; otherwise create it as a
# serverless index on AWS us-east-1.
if index_name in pc.list_indexes().names():
    print(f'Index {index_name} already exists')
else:
    print(f'Creating index:{index_name}')
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=serverless_spec,
    )
    print('Index created!')

Creating index:langchain
Index created!


### How to Delete an Index

In [34]:
index_name = 'langchain'

# Only attempt the delete when the index is actually present
if index_name not in pc.list_indexes().names():
    print(f'Index {index_name} does not exist!')
else:
    print(f'Deleting index {index_name}...')
    pc.delete_index(index_name)
    print('Done')

Deleting index langchain...
Done


In [18]:
# Show statistics for the selected index.
# NOTE(review): in the visible notebook `index` is first assigned further down
# (index = pc.Index(index_name) under "Working with Vectors"), so this cell only
# works after that cell has been run. Confirm the intended cell order.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Working with Vectors

In [36]:
# Let's generate a few random vectors with a dimension of 1536 using the random module
import random

# Select the index the vectors go into
index_name = 'langchain'
index = pc.Index(index_name)

# Five ids, one per random vector
ids = ['a', 'b', 'c', 'd', 'e']
vectors = [
    [random.random() for _component in range(1536)]
    for _row in range(5)
]

# Upsert (id, values) pairs
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [28]:
# To update a vector, call upsert again with the id of the vector and its new values;
# here vector 'c' is set to 1536 values of 0.5
index.upsert(vectors = [('c', [0.5]*1536)])

{'upserted_count': 1}

In [41]:
# Fetching vectors
# Select the index first if it has not been selected yet
#index.fetch(ids=['c','d'])

In [31]:
# Delete vectors by ID
index.delete(ids = ['b','c'])

{}

In [32]:
# Check the index statistics again after the delete
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3}},
 'total_vector_count': 3}

In [33]:
# Fetching an id that does not exist does not raise an error;
# the response simply contains an empty 'vectors' dict
index.fetch(ids = ['x'])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

In [37]:
# Query
# Build a random 1536-dimensional vector to search with
query_vector = [random.random() for _ in range(1536)]

# The query operation retrieves the IDs of the most similar vectors in the index
# along with their similarity scores
index.query(
    vector=query_vector,
    top_k=3,               # top 3 most similar vectors
    include_values=False,  # don't display the actual values of the vectors
)

{'matches': [{'id': 'd', 'score': 0.773781717, 'values': []},
             {'id': 'a', 'score': 0.757795453, 'values': []},
             {'id': 'e', 'score': 0.75589627, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

### Namespaces

In [39]:
# Index statistics before adding namespaced vectors
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [42]:
# Now insert 3 vectors into the namespace 'first-namespace'
ids = ['x', 'y', 'z']
vectors = [
    [random.random() for _component in range(1536)]
    for _row in range(3)
]
index.upsert(vectors=zip(ids, vectors), namespace='first-namespace')


{'upserted_count': 3}

In [44]:
# Now insert 2 vectors into the namespace 'second-namespace'
ids = ['q', 'p']
vectors = [
    [random.random() for _component in range(1536)]
    for _row in range(2)
]
index.upsert(vectors=zip(ids, vectors), namespace='second-namespace')

{'upserted_count': 2}

In [45]:
# The statistics report the vector count of each namespace separately
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 10}

In [47]:
# Suppose we want to fetch vector 'x'; note that no namespace is passed in this call
index.fetch(ids = ['x'])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

In [49]:
# We got an empty result because a vector stored in a namespace is only found when that namespace is specified explicitly
#index.fetch(ids=['x'], namespace = 'first-namespace')

In [51]:
# To delete a vector we also need to specify its namespace explicitly
index.delete(ids = ['x'], namespace = 'first-namespace')

{}

In [52]:
# Delete all vectors in a namespace
index.delete(delete_all = True, namespace = 'first-namespace')

{}

In [53]:
# Check the statistics again after deleting everything in 'first-namespace'
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 7}

### Splitting and Embedding Text using Langchain

In [54]:
import os
from dotenv import load_dotenv, find_dotenv

# Locate the .env file, then load it, overriding variables that are already set
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

True

In [60]:
# When dealing with large pieces of text, it is important to split them into chunks
# while keeping semantically related pieces of text together.
# We load the document available to us, a text file with Churchill's speech, and
# prepare a RecursiveCharacterTextSplitter to split it into chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory (uses the platform's default text encoding)
with open('files/churchill_speech.txt') as speech_file:
    churchill_speech = speech_file.read()

# chunk_overlap is the maximum overlap between chunks, needed to maintain
# continuity between them; length is measured with len
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [62]:
# Split the speech into document chunks and show the first one
chunks = text_splitter.create_documents([churchill_speech])
print(chunks[0])

page_content='Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940'


In [66]:
# To get only the text of a chunk, read its page_content attribute
print(chunks[10].page_content)

penetration were realized and when a new French Generalissimo, General Weygand, assumed


In [68]:
# Let's check how many chunks we have
print(f'We have {len(chunks)} chunks')

We have 300 chunks


### Embedding Cost

In [70]:
# We will embed with OpenAI's text-embedding-ada-002 model, so it is a good idea
# to estimate the embedding cost before making any API calls.
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the total token count of ``texts`` and the estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick the
            tokenizer.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps the
            rate this notebook was written with.
            NOTE(review): confirm against current OpenAI pricing.

    Returns:
        None. The two result lines are printed to stdout.
    """
    import tiktoken  # imported lazily so the cell only needs it when called

    enc = tiktoken.encoding_for_model(model)
    # Sum the token counts lazily instead of building an intermediate list
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)

    print(f'Total tokens : {total_tokens}')
    print(f'Embedding cost in USD : {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Estimate the embedding cost for all chunks of the speech
print_embedding_cost(chunks)

Total tokens : 4820
Embedding cost in USD : 0.001928


In [73]:
# Let's import and instantiate OpenAI's embeddings class
from langchain.embeddings import OpenAIEmbeddings

# The OpenAIEmbeddings class can be used to embed text into vectors
embedding = OpenAIEmbeddings()

# Turn the first chunk into a vector of floating point numbers
first_chunk_text = chunks[0].page_content
vector = embedding.embed_query(first_chunk_text)
vector


[-0.044567573656022125,
 -0.0378875395789288,
 -0.0029496059181736843,
 -0.007993097388439452,
 0.015743980127830393,
 0.022589743056483196,
 -0.028581378879143218,
 -0.009650358895679658,
 0.0010493331318405562,
 0.007336567129750246,
 0.007789127035119314,
 0.0327882728296835,
 0.00741305618686828,
 -0.011696439194526596,
 0.006374081108250191,
 -0.005386098268716172,
 0.013168851797818924,
 -0.0024986397385547204,
 0.013589540447814895,
 -0.01096341941305807,
 -0.008171572320709483,
 -0.026847628780446265,
 0.029626728860761727,
 -0.0038658801793480554,
 -0.014456415497163373,
 -0.018523080208145856,
 0.010835938116855967,
 -0.018612315811635728,
 0.00305477831350332,
 -0.014341682144316964,
 0.007081604071684753,
 -0.008560391112316214,
 -0.01650886976768816,
 0.005150257591345508,
 -0.01833185733252013,
 -0.023851811800438823,
 -0.022373024294146077,
 -0.008745239084941521,
 0.02267898052261821,
 -0.012671672693721062,
 0.013615037265848859,
 0.004605273699663786,
 0.0087516130566

### Inserting Embeddings into a Pinecone Index

In [75]:
# We'll embed each piece of text into a numeric vector and insert the vectors into a Pinecone index.
# NOTE: this import rebinds the name `Pinecone`, which an earlier cell imported from the
# `pinecone` package, to the LangChain vector store class; the client itself is therefore
# created through the module as `pinecone.Pinecone()`.
import pinecone
from langchain_community.vectorstores import Pinecone
pc = pinecone.Pinecone()

In [76]:
# The next step is to create a Pinecone index. We could reuse an existing index, but the
# free plan of Pinecone limits us to one index and one project, so we delete the
# existing indexes and create a new one.

for existing_index in pc.list_indexes().names():
    print('Deleting all indexes ...', end='')
    pc.delete_index(existing_index)
    print('Done')

Deleting all indexes ...Done


In [80]:
# Now let's create a new index for these embeddings, called churchill-speech
index_name = 'churchill-speech'

index_missing = index_name not in pc.list_indexes().names()
if index_missing:
    print(f'Creating index {index_name}')
    pod_spec = pinecone.PodSpec(environment='gcp-starter')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=pod_spec,
    )
    print('Done')
    

Creating index churchill-speech
Done


In [84]:
# Build the vector store from the chunks, using `embedding` and the index named by index_name
vector_store = Pinecone.from_documents(chunks, embedding, index_name=index_name)

In [86]:
# To load the vector store from an existing index instead
vector_store = Pinecone.from_existing_index(index_name='churchill-speech',embedding=embedding)

### Asking Questions (Similarity Search)


In [None]:
# So far we have split the text into chunks, embedded the chunks into vectors and inserted them into a Pinecone index.
# Now let's see how to ask questions and run similarity searches.
# The user defines a query, the query is embedded into a vector, and a similarity search is performed in the vector database;
# the text behind the most similar vectors is the answer to the user's question.



In [87]:
# Ask a question; the search returns the stored chunks most similar to the query
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='front, now on that, fighting'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940')]


In [88]:
# The search extracted the chunks relevant to the query.
# We can also iterate over the chunks and print only their text,
# with a dashed line after each one.
separator = '-' * 50
for doc in result:
    print(doc.page_content)
    print(separator)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
--------------------------------------------------


In [90]:
# These chunks contain the answer, but we can't hand them to users like this: users
# need the answer in natural language. That's where the LLM comes in: we retrieve the
# most similar chunks of text and feed them to the language model for the final answer.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Expose the index through a retriever interface; a retriever is a generic interface
# that makes it easy to combine documents with language models.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},  # return the 3 chunks most similar to the user query
)

llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# The default chain type "stuff" uses all of the text from the docs in the prompt
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [93]:
# Ask the chain a question. `invoke` replaces the deprecated `Chain.run`;
# RetrievalQA takes the question under 'query' and returns the answer text under 'result'.
query = 'Where should we fight?'
answer = chain.invoke({'query': query})['result']
print(answer)

We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and front, we shall fight in France, we shall fight on the seas and oceans.


In [95]:
# Ask another question. `invoke` replaces the deprecated `Chain.run`;
# RetrievalQA takes the question under 'query' and returns the answer text under 'result'.
query = 'Who was the king of Belgium?'
answer = chain.invoke({'query': query})['result']
print(answer)

The King of Belgium during the time when the country was invaded and called for aid was King Leopold.
