Pinecone


In [4]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [4]:
pip install -q pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install --upgrade -q pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip show pinecone-client

Name: pinecone-client
Version: 3.2.1
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: C:\Users\akinw\anaconda3\envs\Andela\Lib\site-packages
Requires: certifi, tqdm, typing-extensions, urllib3
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [5]:
from pinecone import Pinecone
pc = Pinecone()
pc.list_indexes()

{'indexes': [{'dimension': 3072,
              'host': 'langchain-slnc88x.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'langchain',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

Working with Pinecone indexes

In [11]:
pc.list_indexes().names()

['langchain']

In [13]:
index_name = 'langchain'
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

Working with Vectors


In [35]:
# inserting vectors
import random
vectors = [[random.random() for _ in range(3072)] for v in range(5)]
# print(vectors)
ids = list('abcde')
index_name = 'langchain'
index = pc.Index(index_name)
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [19]:
# updating vectors
index.upsert(vectors=[('c', [0.5] * 3072)])

{'upserted_count': 1}

In [20]:
# fetching vectors
index.fetch(ids=['c', 'd'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'c': {'id': 'c',
                   'values': [0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
             

In [33]:
# deleting vectors
index.delete(ids=['e'])

{}

In [38]:
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [24]:
index.fetch(ids=[ 'x'])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

In [39]:
#querying vectors
query_vector = [random.random() for _ in range(3072)]

index.query(
    vector = query_vector,
    top_k = 3,
    include_values=False
)

Namespaces


In [6]:
index = pc.Index('langchain')
import random
vectors = [[random.random() for _ in range(3072)] for v in range(5)]
# print(vectors)
ids = list('abcde')
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [7]:
vectors = [[random.random() for _ in range(3072)] for v in range(3)]
# print(vectors)
ids = list('xyz')
index.upsert(vectors=zip(ids, vectors), namespace='first-namespace')

{'upserted_count': 3}

In [8]:
vectors = [[random.random() for _ in range(3072)] for v in range(2)]
# print(vectors)
ids = list('qp')
index.upsert(vectors=zip(ids, vectors), namespace='second-namespace')

{'upserted_count': 2}

In [9]:
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0001,
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 10}

In [10]:
index.delete(delete_all=True, namespace='first-namespace')

{}

In [11]:
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 7e-05,
 'namespaces': {'': {'vector_count': 5},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 7}

In [13]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
with open('files/churchhill.txt') as f:
    churchhill_speech = f.read()
    
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 100,
    chunk_overlap = 20,
    length_function = len
)

In [17]:
chunks = text_splitter.create_documents([churchhill_speech])
print(chunks[5])

page_content='so many of us knew so well, our losses in men exceed 30,000 in killed, wounded and missing. I take'


In [18]:
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [19]:
vector = embedding.embed_query(chunks[0].page_content)
print(vector)

[-0.02743887036723576, -0.019429550047440007, -0.01677661988712055, -0.018178882519504145, 0.01263299708248429, 0.017837791206190258, -0.008824148429984647, 0.0017607239270438077, -0.004004660159897876, 0.01937901834595743, 0.009961118232600815, 0.025076499818747617, -0.004633151573131845, -0.00840094333459222, 0.01365627009110336, 0.017559865916713484, 0.017307205546655404, -0.01587967799485312, 0.02104657380810845, -0.018077819116538992, -0.011805535581263453, 0.0027050404536044033, 0.01900002779087031, -0.010820161814417613, -0.009095757489606745, -0.02968754319040421, 0.018368378728370303, -0.029460149602410016, 0.00756716513785541, -0.018974761008806424, 0.0037646331808717385, -0.014275286693894022, -0.00883046465983932, -0.015677549326277615, -0.03082451299302038, -0.015374357254736957, -0.017383004030201866, -0.004822646850675406, 0.03575138089592698, -0.010137980119112431, 0.0004113619606880982, 0.01430055254463531, 0.012190842366205245, 0.0006407298070693456, -0.01774936026293

Inserting the Embeddings into a Pinecode Index

In [20]:
import pinecone
from langchain_community.vectorstores import Pinecone
pc = pinecone.Pinecone()

In [21]:
for i in pc.list_indexes().names():
    pc.delete_index(i)

In [25]:
index_name = 'churchill-speech'
if index_name not in pc.list_indexes().names():
    print(f'Creating index {index_name} ...')
    pc.create_index(
        name = index_name,
        dimension = 1536,
        metric = 'cosine',
        spec = pinecone.PodSpec(
            environment = 'gcp-starter'
        )
    )
    print('Done')

In [28]:
vector_store = Pinecone.from_documents(chunks, embedding, index_name= index_name)

In [29]:
#loading the vector from an existing index
vector_store = Pinecone.from_existing_index(index_name='churchill-speech', embedding=embedding)

Asking Questions (Similarity Search)

In [30]:
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='In a long series of very fierce battles, now on this front, now on that, fighting on three fronts'), Document(page_content='a defensive war. We have our duty to our Allies.'), Document(page_content='in the air. We shall defend our island whatever the cost may be; we shall fight on beaches, landing'), Document(page_content='equal or sometimes larger number of the enemy, and fought very fiercely on old ground so many of us')]


In [31]:
for r in result:
    print(r.page_content)
    print('-' * 50)

In a long series of very fierce battles, now on this front, now on that, fighting on three fronts
--------------------------------------------------
a defensive war. We have our duty to our Allies.
--------------------------------------------------
in the air. We shall defend our island whatever the cost may be; we shall fight on beaches, landing
--------------------------------------------------
equal or sometimes larger number of the enemy, and fought very fiercely on old ground so many of us
--------------------------------------------------


In [38]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})
chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

In [42]:
query = 'What about the French armies?'
answer = chain.run(query)
print(answer)

The context provided mentions that the French Army has been weakened and that a colossal military disaster has occurred in France.


In [41]:
#query = 'Where should we fight?'
query = 'Who was the king of Belgium at that time'
answer = chain.invoke(query)
print(answer)

{'query': 'Who was the king of Belgium at that time', 'result': 'The King of Belgium at the time of the events described in the context provided was King Leopold III.'}
