# Pinecone

In [92]:
pip install -q -r ./requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [12]:
# authenticating to Pinecone. 
# the API KEY is in .env
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [None]:
pip install --upgrade -q pinecone-client

In [None]:
pip show pinecone-client

In [18]:
from pinecone import Pinecone,x ServerlessSpec

# Initilizing and authenticating the pinecone client
pc = Pinecone()
# pc = Pinecone(api_key='YOUR_API_KEY')

# checking authentication and read indexes in pinecone
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'langchain-jl6xhcm.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'langchain',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

## Working with Pinecone Indexes

In [19]:
pc.list_indexes().names()

['langchain']

In [44]:
#creating pinecone indexes with serveeless
from pinecone import ServerlessSpec
index_name = 'langchain'
if index_name not in pc.list_indexes().names():
    print(f"Creating index {index_name}")
    pc.create_index(
        name=index_name, 
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
            )
        )
    print('Index created! :D')
else:
    print(f"Index {index_name} already exists")

Creating index langchain
Index created! :D


In [43]:
#deleting pinecone indexes
index_name = 'langchain'
if index_name in pc.list_indexes().names():
    print(f'Deleting index {index_name}...')
    pc.delete_index(index_name)
    print('Done')
else: 
    print(f'Index {index_name} does not exist!')

Deleting index langchain...
Done


In [20]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Working with Vectors

In [34]:
#insering vectors
import random
vectors = [[random.random() for _ in range(1536)] for v in range(5)]
# print(vectors)
ids = list('abcde')

index_name = 'langchain'
index = pc.Index(index_name)

index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [24]:
#update vectors
index.upsert(vectors=[('c', [0.5] * 1536)])

{'upserted_count': 1}

In [27]:
# fetch vectors
# index = pc.Index(index_name)
index.fetch(ids=['c', 'd'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'c': {'id': 'c',
                   'values': [0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
                              0.5,
             

In [29]:
# delete vectors
index.delete(ids=['b', 'c'])

{}

In [35]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [31]:
index.fetch(ids=['x'])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

In [39]:
# query 
query_vector = [random.random() for _ in range(1536)]
# print(query_vector)

In [37]:
# This retrieves the query_vectors of the most similar records in your index, along with their similarity scores.
index.query(
    vector=query_vector,
    top_k=3,
    include_values=False
)

{'matches': [{'id': 'e', 'score': 0.755557597, 'values': []},
             {'id': 'c', 'score': 0.747839808, 'values': []},
             {'id': 'b', 'score': 0.745738685, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

## Namespaces

In [45]:
# index.describe_index_stats()
index = pc.Index('langchain')

import random
vectors = [[random.random() for _ in range(1536)] for v in range(5)]
ids = list('abcde')
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [46]:
vectors = [[random.random() for _ in range(1536)] for v in range(3)]
ids = list('xyz')
index.upsert(vectors=zip(ids, vectors), namespace='first-namespace')

{'upserted_count': 3}

In [47]:
vectors = [[random.random() for _ in range(1536)] for v in range(3)]
ids = list('aq')
index.upsert(vectors=zip(ids, vectors), namespace='second-namespace')

{'upserted_count': 2}

In [49]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 10}

In [54]:
# get specific namespace
index.fetch(ids=['x'], namespace='first-namespace')

{'namespace': 'first-namespace', 'usage': {'read_units': 1}, 'vectors': {}}

In [53]:
# delete specific id in namespace
index.delete(ids=['x'], namespace='first-namespace')

{}

In [57]:
# delete namespace
index.delete(delete_all=True, namespace='first-namespace')

{}

## Splitting and Embedding Text Using LangChain

In [60]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [78]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
with open('files/churchill_speech.txt') as f:
    churchill_speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [83]:
chunks = text_splitter.create_documents([churchill_speech])
# print(chunks[0])
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)}')

Now you have 300


### Embeding Cost

In [86]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00255:.6f}')

print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.012291


### Creating embeddings

In [93]:
from langchain.embeddings import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [95]:
vector = embedding.embed_query(chunks[0].page_content)
print(vector)

[-0.04460720101393019, -0.03782302995241884, -0.002899531824250758, -0.00804663938375041, 0.01577447213986173, 0.022609653602899887, -0.02853942558454757, -0.009659792574953701, 0.0010847341349010796, 0.007338891948647185, 0.0077979711268880685, 0.032773156663462616, 0.007351644083367585, -0.011763905514029525, 0.006385664639607694, -0.005397369309333081, 0.013134767679883927, -0.002529717821883321, 0.01359384639246351, -0.011005149540044702, -0.008199665000395203, -0.026805127811992433, 0.02961061048899673, -0.003918113765524633, -0.014486500479504477, -0.018503444802511433, 0.010871251426988558, -0.018656470419156225, 0.0031115371699229877, -0.014371730801359582, 0.007077471557064444, -0.008499342261800444, -0.016539605811021305, 0.005212462191734037, -0.018350417323221437, -0.023884872662875437, -0.022469377792668988, -0.008754385887531033, 0.022660662141781484, -0.012669312201452194, 0.013683111801167606, 0.004654553387333432, 0.008843651296235131, 0.0029728569481390302, -0.0278508