## Pinecone

In [7]:
import json
import pinecone

# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)

api_key = param['PINECONE']['API']
env = param['PINECONE']['ENV']
print(api_key) 
pinecone.init(api_key = api_key, environment= env)

pinecone.info.version()

9d3f4ffd-cc84-49ce-a06a-1bf3598f34cb


VersionResponse(server='2.0.11', client='2.2.4')

In [6]:
# Create Pinecone index
index_name = "langchain-pinecone"
if index_name not in pinecone.list_indexes():
    print(f'Create index {index_name}')
    pinecone.create_index(index_name,dimension=1536, metric='cosine', pods=1, pod_type='p1.x2')
    print('Done')
else:
    print("Index exists")
    
pinecone.list_indexes()

Create index langchain-pinecone


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'Content-Length': '140', 'date': 'Sun, 03 Dec 2023 04:03:35 GMT', 'x-envoy-upstream-service-time': '3', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: Capacity Reached. Starter Projects support a single index. Create a new project to add more. Your Starter Project remains free post-upgrade.


In [None]:
# Delete Pinecone index

index_name = "langchain-pinecone"
if index_name in pinecone.list_indexes():
    print(f'Deleting index {index_name}')
    pinecone.delete_index(index_name)
    print('Index Deleted')
else:
    print(f'Index {index_name} does not exist!')
    

In [None]:
index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.describe_index_stats()

In [None]:
# Insert vectors
import random
vectors = [[random.random() for _ in range (1536)] for v in range (5)]
# Vectors
ids = list('abcde')

index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.upsert(vectors = zip(ids, vectors))

In [None]:
# Update vectors:
index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.upsert(vectors = [('c',[0.3]*1536)])


In [None]:
# Fetch a vector
index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.fetch(ids)


In [None]:
# Delete Index 
index.delete(ids = ['b','c'])
index.describe_index_stats()

## Splitting and Embedding Text Using LangChain


In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
import json

# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)

# Read the PDF file
pdf_reader = PyPDF2.PdfReader('../data/TroutStocking.pdf')

# Extract text from each page and concatenate it
full_text = ""
for page in pdf_reader.pages:
    full_text += page.extract_text() + "\n"

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

# Create chunks from the extracted text
chunks = text_splitter.create_documents([full_text])
print(chunks[0])
print(len(chunks))

page_content='Water Body County Stocking Schedule\nBoggs Creek Lumpkin 2023-04-05\nBoggs Creek Lumpkin 2023-04-12'
234


In [9]:
# Calculate embedding cost
import tiktoken
def cost(texts):
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens:{total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0004:.6f}')

print(cost(chunks))

Total Tokens:8037
Embedding Cost in USD: 0.003215
None


In [10]:
# Create embeddings
from langchain.embeddings import  OpenAIEmbeddings
# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)
api_key = param['GPT']['API']

embeddings = OpenAIEmbeddings(api_key = api_key)
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

[-0.023088439874252496, -0.006411140252561013, -0.009314559980158336, -0.017351060323643527, -0.005577622794516799, 0.01464212777009253, -0.023102333345285844, -0.008168473417139877, -0.023532982518542744, 0.0023546874592592143, -0.008710260114114604, 0.006515330109439533, -0.014586559473894933, 0.019170907588614033, -0.01071070294474335, 0.012933416166194587, -0.008501880400357563, -0.0013197363244826592, 0.0139544754597524, -0.01342658130248839, 0.011224705493619277, 0.04795505032566598, -0.013871123946778637, -0.015836835893791913, 0.010523160643568433, -0.008578286109137285, 0.019434853735923404, -0.043537407099539675, -0.016656461277786725, 0.026867054213364865, 0.0019396651812856177, -0.02590850902019869, 0.003122218380476085, -0.008953369780164484, -0.022421625907817124, -0.04084236801702202, 0.008147636004557753, -0.010259213564936429, 0.020976861382551188, -0.01468380352657941, 0.01748998013281489, 0.012280493808147299, 0.010286997713035227, -0.003363591362646797, -0.004455846

## Inserting the Embeddings into a Pinecone Index


In [11]:
import json
import pinecone
from langchain.vectorstores import Pinecone
# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)

api_key = param['PINECONE']['API']
env = param['PINECONE']['ENV']
print(api_key) 
pinecone.init(api_key = api_key, environment= env)

pinecone.info.version()

9d3f4ffd-cc84-49ce-a06a-1bf3598f34cb


VersionResponse(server='2.0.11', client='2.2.4')

In [16]:
# deleting all indexes
indexes = pinecone.list_indexes()
print(indexes)

for i in indexes:
    pinecone.delete_index(i)
    print("Indexes Deleted")


['fishing']
Indexes Deleted


In [17]:
# Create Pinecone index
index_name = "fishing"
if index_name not in pinecone.list_indexes():
    print(f'Create index {index_name}')
    pinecone.create_index(index_name,dimension=1536, metric='cosine', pods=1, pod_type='p1.x2')
    print('Done')
else:
    print("Index exists")
pinecone.list_indexes()

Create index fishing
Done


['fishing']

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
import json

# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)

# Read the PDF file
pdf_reader = PyPDF2.PdfReader('../data/TroutStocking.pdf')

# Extract text from each page and concatenate it
full_text = ""
for page in pdf_reader.pages:
    full_text += page.extract_text() + "\n"

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

# Create chunks from the extracted text
chunks = text_splitter.create_documents([full_text])
print(chunks[0])
print(len(chunks))

# Create embeddings
from langchain.embeddings import  OpenAIEmbeddings
# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)
api_key = param['GPT']['API']

embeddings = OpenAIEmbeddings(api_key = api_key)
vector = embeddings.embed_query(chunks[0].page_content)


page_content='Water Body County Stocking Schedule\nBoggs Creek Lumpkin 2023-04-05\nBoggs Creek Lumpkin 2023-04-12'
234


In [19]:
Pinecone.from_documents(chunks, embeddings,index_name = index_name)

<langchain.vectorstores.pinecone.Pinecone at 0x2acfed487f0>

In [14]:
## Asking Questions ( Similarity Search)
vector_store = Pinecone.from_documents(chunks,embeddings,index_name=index_name)
query = 'Bodies of watter in Lumpkin county'
results = vector_store.similarity_search(query)
print(results)

ValueError: Index 'langchain-pinecone' not found in your Pinecone project. Did you mean one of the following indexes: fishing

In [None]:
for r in results:
    print(r.page_content)
    print('-'*50)

In [None]:
# Use LLM to get better answer
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Initialize the LLM with your API key and model parameters
llm = ChatOpenAI(api_key=api_key, model_name='gpt-4-1106-preview', temperature=1)

# Assuming vector_store is a Pinecone object and has a method like as_retriever
# Adjust the method name and parameters according to the actual API
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Initialize the RetrievalQA chain
chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)


In [None]:
query = 'What is latest stocking schedule in lumpking county'
answer = chain.run(query)
print(answer)