# Pinecone

In [92]:
pip install -q -r ./requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [12]:
# authenticating to Pinecone. 
# the API KEY is in .env
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [None]:
pip install --upgrade -q pinecone-client

In [None]:
pip show pinecone-client

In [18]:
from pinecone import Pinecone,x ServerlessSpec

# Initilizing and authenticating the pinecone client
pc = Pinecone()
# pc = Pinecone(api_key='YOUR_API_KEY')

# checking authentication and read indexes in pinecone
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'langchain-jl6xhcm.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'langchain',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

## Working with Pinecone Indexes

In [19]:
pc.list_indexes().names()

['langchain']

In [44]:
#creating pinecone indexes with serveeless
from pinecone import ServerlessSpec
index_name = 'langchain'
if index_name not in pc.list_indexes().names():
    print(f"Creating index {index_name}")
    pc.create_index(
        name=index_name, 
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
            )
        )
    print('Index created! :D')
else:
    print(f"Index {index_name} already exists")

Creating index langchain
Index created! :D


In [43]:
#deleting pinecone indexes
index_name = 'langchain'
if index_name in pc.list_indexes().names():
    print(f'Deleting index {index_name}...')
    pc.delete_index(index_name)
    print('Done')
else: 
    print(f'Index {index_name} does not exist!')

Deleting index langchain...
Done


In [20]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Working with Vectors

In [34]:
#insering vectors
import random
vectors = [[random.random() for _ in range(1536)] for v in range(5)]
# print(vectors)
ids = list('abcde')

index_name = 'langchain'
index = pc.Index(index_name)

index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [24]:
#update vectors
index.upsert(vectors=[('c', [0.5] * 1536)])

{'upserted_count': 1}

In [27]:
# fetch vectors
# index = pc.Index(index_name)
index.fetch(ids=['c', 'd'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'c': {'id': 'c',
                   'values': [0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
             

In [29]:
# delete vectors
index.delete(ids=['b', 'c'])

{}

In [35]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [31]:
index.fetch(ids=['x'])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

In [39]:
# query 
query_vector = [random.random() for _ in range(1536)]
# print(query_vector)

In [37]:
# This retrieves the query_vectors of the most similar records in your index, along with their similarity scores.
index.query(
    vector=query_vector,
    top_k=3,
    include_values=False
)

{'matches': [{'id': 'e', 'score': 0.755557597, 'values': []},
             {'id': 'c', 'score': 0.747839808, 'values': []},
             {'id': 'b', 'score': 0.745738685, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

## Namespaces

In [45]:
# index.describe_index_stats()
index = pc.Index('langchain')

import random
vectors = [[random.random() for _ in range(1536)] for v in range(5)]
ids = list('abcde')
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [46]:
vectors = [[random.random() for _ in range(1536)] for v in range(3)]
ids = list('xyz')
index.upsert(vectors=zip(ids, vectors), namespace='first-namespace')

{'upserted_count': 3}

In [47]:
vectors = [[random.random() for _ in range(1536)] for v in range(3)]
ids = list('aq')
index.upsert(vectors=zip(ids, vectors), namespace='second-namespace')

{'upserted_count': 2}

In [49]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 10}

In [54]:
# get specific namespace
index.fetch(ids=['x'], namespace='first-namespace')

{'namespace': 'first-namespace', 'usage': {'read_units': 1}, 'vectors': {}}

In [53]:
# delete specific id in namespace
index.delete(ids=['x'], namespace='first-namespace')

{}

In [57]:
# delete namespace
index.delete(delete_all=True, namespace='first-namespace')

{}

## Splitting and Embedding Text Using LangChain

In [60]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [78]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
with open('files/churchill_speech.txt') as f:
    churchill_speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [83]:
chunks = text_splitter.create_documents([churchill_speech])
# print(chunks[0])
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)}')

Now you have 300


### Embeding Cost

In [86]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00255:.6f}')

print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.012291


### Creating embeddings

In [109]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

In [110]:
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

[0.009385457073265283, 0.012539638884992108, -0.010413296310865905, 0.014017157905458323, 0.007516074774200512, -0.06295515470002197, 0.010830856641425414, 0.03702790960558337, 0.007252690701810118, 0.029910122372971654, 0.010175608824775187, 0.08489952065328242, -0.0058201396596304624, 0.026492557885838267, 0.01111993636879293, -0.001238225119381979, -0.0387495413064531, 0.015558916761859257, -0.022342655357494346, -0.03733626305324416, -0.002853859856957032, 0.02772596422590097, -0.03674525320988354, -0.007644554446069951, -0.01654821228416356, 0.017987187589333455, -0.0027703479305435136, 0.01995293010796158, 0.02798292356963985, 0.016034291734040692, 0.01708782802360227, 0.005058896625915354, -0.0015570158989261537, -0.026800909470853944, 0.024051438532383593, -0.01807712354590657, 0.011582463746316444, 0.04016281955966204, -0.015764483864321336, 0.036385509383591065, 0.032325546537110485, -0.03895510654627007, -0.016522517094847717, -0.014736645558043271, 0.03150327440197194, 0.02

## Inserting the Embeddings into a Pinecone Index

In [97]:
import pinecone
from langchain_community.vectorstores import Pinecone
pc = pinecone.Pinecone()

In [106]:
for i in pc.list_indexes().names():
    print('Deleteing all indexes')
    pc.delete_index(i)
    print('Done')

Deleteing all indexes
Done


In [107]:
index_name = 'churchill-speech'
if index_name not in pc.list_indexes().names():
    print(f'Creating index {index_name} ...')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=pinecone.PodSpec(
            environment='gcp-starter'
        )
    )
    print('Done')

Creating index churchill-speech ...
Done


In [108]:
vectore_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [111]:
# loading the vector store from an existing index
vector_store = Pinecone.from_existing_index(index_name='churchill-speech', embedding=embeddings)