# Pinecone Vector Database Integration

**References:**

    - [LangChain Pinecone](https://python.langchain.com/v0.2/docs/integrations/vectorstores/pinecone/)
    - [PineconeVectorStore API](https://api.python.langchain.com/en/latest/vectorstores/langchain_pinecone.vectorstores.PineconeVectorStore.html)

In [6]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from loaders.text_loaders import TextLoaderWithMetadata
from langchain_openai import AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore



In [2]:
# Load environment variables from ../.env (path is relative to the current working directory).
# Later cells read AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME, AZURE_OPENAI_API_VERSION,
# PINECONE_API_KEY and PINECONE_INDEX_NAME from os.environ.
load_dotenv("../.env")

True

In [13]:
# Load the BNF for Children text, split it into chunks, and build the
# embedding client that is later used to vectorise those chunks.

# Source file and the metadata handed to the loader
# (presumably attached to every loaded document — confirm in TextLoaderWithMetadata).
BNFC_TEXT_PATH = "/Users/tjosh/codes/axum/druginsights/data/BNFPaeds_prettified.txt"
BNFC_METADATA = {
    "title": "British National Formulary for Children",
    "link": "https://www.nice.org.uk/bnfc-uk-only",
    "references": [
        "NICE. (n.d.). BNFc is only available in the UK. https://www.nice.org.uk/bnfc-uk-only"
    ],
}

loader = TextLoaderWithMetadata(BNFC_TEXT_PATH)
documents = loader.load(metadata=BNFC_METADATA)

# Split with chunk_size=1024 and no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Azure OpenAI embedding settings come from the environment;
# a missing variable raises KeyError here.
embedding_settings = {
    "azure_deployment": os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"],
    "openai_api_version": os.environ["AZURE_OPENAI_API_VERSION"],
}
embeddings = AzureOpenAIEmbeddings(**embedding_settings)

# Last expression of the cell: show how many chunks were produced.
len(docs)

3693

In [14]:
# Display the first chunk as a quick sanity check of the split output.
docs[0]



In [3]:
# Prepare pinecone
# The API key and index name are read from environment variables;
# a missing variable raises KeyError.

PC = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
PC_INDEX_NAME = os.environ["PINECONE_INDEX_NAME"]
# Last expression of the cell: display the result of list_indexes() so the
# target index (name, dimension, metric, status) can be checked by eye.
PC.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'axum-druginsights-80ua1d8.svc.eastus2-5e25.prod-azure.pinecone.io',
              'metric': 'cosine',
              'name': 'axum-druginsights',
              'spec': {'serverless': {'cloud': 'azure', 'region': 'eastus2'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [8]:
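# run this only once, if the index is not created
# NOTE: the existing index listed above is serverless on azure/eastus2 (dimension 1536,
# cosine metric); adjust cloud/region below if a newly created index should match it.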

# PC.create_index(
#             name=PC_INDEX_NAME,
#             dimension=1536,
#             metric="cosine",
#             spec=ServerlessSpec(cloud="aws", region="us-east-1"),
#         )

In [15]:
# Add the chunked documents to the vector database: `docs` is embedded with
# `embeddings` and written to the index named by PC_INDEX_NAME.
# NOTE(review): nothing here clears the index first. Unless explicit ids are supplied,
# re-running presumably adds duplicate vectors rather than replacing the earlier ones —
# confirm against the installed langchain-pinecone version.
# If you want to add more documents to an existing store, you can use the .add_documents method instead.
# Run only once for the same documents.
vectorstore = PineconeVectorStore.from_documents(docs, embeddings, index_name=PC_INDEX_NAME)

# Or, load an existing vector store
# vectorstore = PineconeVectorStore(embedding=embeddings, index_name=PC_INDEX_NAME)


In [11]:
# Smoke-test retrieval: fetch the 3 nearest chunks for a sample query.
# The hits are bound to `results` rather than `docs` so that the chunk list
# named `docs`, which the upload cell passes to from_documents, is not replaced
# by a handful of search hits when cells are re-run out of order.

query = "ACE inhibitors"
results = vectorstore.similarity_search(query, k=3)
print(len(results))
if results:
    # Guard the [0] access: an empty result set would otherwise raise IndexError.
    print(results[0].page_content)

3


In [1]:
import json
import os

# Move the working directory up one level; the relative path "config.json" opened
# below is resolved from there, and presumably so is the `src....` import.
# NOTE(review): the parent folder is presumably the project root — confirm.
os.chdir("../")


In [3]:
from src.agents.prompts.qa_prompts_2 import CONTEXTUALIZE_Q_SYSTEM_PROMPT, QA_SYSTEM_PROMPT

In [4]:
# Inject the prompt templates into config.json under the keys
# "contextualize_q_system_prompt" and "qa_system_prompt".
# The encoding is given explicitly so reading and writing do not depend on the
# platform's default text encoding.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

config["contextualize_q_system_prompt"] = CONTEXTUALIZE_Q_SYSTEM_PROMPT
config["qa_system_prompt"] = QA_SYSTEM_PROMPT

# Serialise before opening for writing: mode "w" truncates the file immediately,
# so a serialisation error after that point would leave config.json empty or partial.
serialized_config = json.dumps(config)
with open("config.json", "w", encoding="utf-8") as f:
    f.write(serialized_config)
