## Pinecone Basics

In [2]:
import json
import pinecone
from dotenv import load_dotenv, find_dotenv
import os
# Read config file to get API_KEY and ORG_ID values
load_dotenv(find_dotenv(),  override=True)
api_key = os.getenv("PINECONE_API_KEY")
env = os.getenv("PINECONE_ENV")
print(api_key) 
pinecone.init(api_key = api_key, environment= env)
pinecone.info.version()

9d3f4ffd-cc84-49ce-a06a-1bf3598f34cb


VersionResponse(server='2.0.11', client='2.2.4')

In [7]:
# Create Pinecone index
index_name = "fishing"
if index_name not in pinecone.list_indexes():
    print(f'Create index {index_name}')
    pinecone.create_index(index_name,dimension=1536, metric='cosine', pods=1, pod_type='p1.x2')
    print('Done')
else:
    print("Index exists")
    
pinecone.list_indexes()

Create index fishing
Done


['fishing']

In [6]:
# Delete Pinecone index

index_name = "fishing"
if index_name in pinecone.list_indexes():
    print(f'Deleting index {index_name}')
    pinecone.delete_index(index_name)
    print('Index Deleted')
else:
    print(f'Index {index_name} does not exist!')
    

Deleting index fishing
Index Deleted


In [None]:
index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.describe_index_stats()

In [None]:
# Insert vectors
import random
vectors = [[random.random() for _ in range (1536)] for v in range (5)]
# Vectors
ids = list('abcde')

index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.upsert(vectors = zip(ids, vectors))

In [None]:
# Update vectors:
index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.upsert(vectors = [('c',[0.3]*1536)])


In [None]:
# Fetch a vector
index_name = "langchain-pinecone"
index = pinecone.Index(index_name)
index.fetch(ids)


In [None]:
# Delete Index 
index.delete(ids = ['b','c'])
index.describe_index_stats()

## Splitting and Embedding Text Using LangChain


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
import json

# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)

# Read the PDF file
pdf_reader = PyPDF2.PdfReader('../data/TroutStocking.pdf')

# Extract text from each page and concatenate it
full_text = ""
for page in pdf_reader.pages:
    full_text += page.extract_text() + "\n"

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

# Create chunks from the extracted text
chunks = text_splitter.create_documents([full_text])
print(chunks[0])
print(len(chunks))

In [None]:
# Calculate embedding cost
import tiktoken
def cost(texts):
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens:{total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.0004:.6f}')

print(cost(chunks))

In [None]:
# Create embeddings
from langchain.embeddings import  OpenAIEmbeddings
# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)
api_key = param['GPT']['API']

embeddings = OpenAIEmbeddings(api_key = api_key)
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

## Inserting the Embeddings into a Pinecone Index


In [None]:
import json
import pinecone
from langchain.vectorstores import Pinecone
# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)

api_key = param['PINECONE']['API']
env = param['PINECONE']['ENV']
print(api_key) 
pinecone.init(api_key = api_key, environment= env)

pinecone.info.version()

In [None]:
# deleting all indexes
indexes = pinecone.list_indexes()
print(indexes)

for i in indexes:
    pinecone.delete_index(i)
    print("Indexes Deleted")


In [None]:
# Create Pinecone index
index_name = "fishing"
if index_name not in pinecone.list_indexes():
    print(f'Create index {index_name}')
    pinecone.create_index(index_name,dimension=1536, metric='cosine', pods=1, pod_type='p1.x2')
    print('Done')
else:
    print("Index exists")
pinecone.list_indexes()

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
import json

# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)

# Read the PDF file
pdf_reader = PyPDF2.PdfReader('../data/TroutStocking.pdf')

# Extract text from each page and concatenate it
full_text = ""
for page in pdf_reader.pages:
    full_text += page.extract_text() + "\n"

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

# Create chunks from the extracted text
chunks = text_splitter.create_documents([full_text])
print(chunks[0])
print(len(chunks))

# Create embeddings
from langchain.embeddings import  OpenAIEmbeddings
# Read config file to get API_KEY and ORG_ID values
with open("../parameters/config.json") as config:
    param = json.load(config)
api_key = param['GPT']['API']

embeddings = OpenAIEmbeddings(api_key = api_key)
vector = embeddings.embed_query(chunks[0].page_content)


FileNotFoundError: [Errno 2] No such file or directory: '../parameters/config.json'

In [None]:
Pinecone.from_documents(chunks, embeddings,index_name = index_name)

In [None]:
## Asking Questions ( Similarity Search)
vector_store = Pinecone.from_documents(chunks,embeddings,index_name=index_name)
query = 'Bodies of watter in Lumpkin county'
results = vector_store.similarity_search(query)
print(results)

In [None]:
for r in results:
    print(r.page_content)
    print('-'*50)

In [None]:
# Use LLM to get better answer
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Initialize the LLM with your API key and model parameters
llm = ChatOpenAI(api_key=api_key, model_name='gpt-4-1106-preview', temperature=1)

# Assuming vector_store is a Pinecone object and has a method like as_retriever
# Adjust the method name and parameters according to the actual API
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Initialize the RetrievalQA chain
chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)


In [None]:
query = 'What is latest stocking schedule in lumpking county'
answer = chain.run(query)
print(answer)