# Transfer nodes (with embeddings) to Pinecone for Deployment Query

- [**WARNING**] The embedding dimension of the Chroma collection must be the same as the dimension of the Pinecone index.

In [1]:
# Directory of the on-disk ChromaDB store; passed to `chroma run --path` when the server is started.
chroma_path = '../processed_data/chromadb'

In [2]:
# WARNING: the embedding dimension of this collection must match the dimension of the Pinecone index.
# The same name is reused further down as the Pinecone namespace for the uploaded vectors.
collection_name = 'google_genai-docs'

In [3]:
import os
# Sanity check: the path is relative, so this also confirms the notebook runs from the expected working directory.
os.path.exists(chroma_path)

True

In [4]:
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the API keys are read from the environment in later cells.
load_dotenv()

True

In [5]:
import socket
import subprocess
import time

import chromadb

# Start a local Chroma server over the on-disk store and connect to it.
# The client must not be created before the server is listening, so the port
# is polled until it accepts a TCP connection (or the start-up budget runs out).
CHROMA_HOST = "localhost"
CHROMA_PORT = 8000  # Chroma's default port, now passed explicitly to both server and client
CHROMA_STARTUP_TIMEOUT_S = 30

process = subprocess.Popen(
    [
        "chroma", "run",
        "--path", chroma_path,
        "--host", CHROMA_HOST,
        "--port", str(CHROMA_PORT),
    ]
)

startup_deadline = time.monotonic() + CHROMA_STARTUP_TIMEOUT_S
while True:
    # Fail fast if the server process died (bad path, port already in use, ...).
    if process.poll() is not None:
        raise RuntimeError(
            f"chroma server exited during start-up with code {process.returncode}"
        )
    try:
        with socket.create_connection((CHROMA_HOST, CHROMA_PORT), timeout=1):
            break  # the server is accepting connections
    except OSError:
        if time.monotonic() >= startup_deadline:
            # Do not leave a half-started server behind.
            process.terminate()
            raise TimeoutError(
                f"chroma server did not start listening on {CHROMA_HOST}:{CHROMA_PORT} "
                f"within {CHROMA_STARTUP_TIMEOUT_S}s"
            )
        time.sleep(0.25)

chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


In [6]:
# Show the collections available on the Chroma server.
chroma_client.list_collections()

[Collection(name=google_genai-docs), Collection(name=google_genai-api)]

In [7]:
# Fetch a handle to the collection selected by collection_name.
chroma_collection = chroma_client.get_collection(collection_name)

In [8]:
# Number of records stored in the collection.
chroma_collection.count()

1016

In [9]:
# Read the records from the collection (no ids or filters are given), asking for
# metadata, document text and the stored embedding vectors alongside the ids.
nodes_info = chroma_collection.get(include=['metadatas', 'documents', 'embeddings'])

In [10]:
# Number of embedding vectors returned; compare with the collection count shown earlier.
len(nodes_info['embeddings'])

1016

In [11]:
# Dimensionality of the stored vectors, measured on the first embedding.
# This value is later compared against the dimension of the Pinecone index.
embedding_dimensions = len(nodes_info['embeddings'][0].tolist())
embedding_dimensions

768

In [12]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

In [13]:
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding

# Embedding model handle; the key is read with os.getenv, so it is None when
# GEMINI_API_KEY is not set in the environment.
embed_model = GoogleGenAIEmbedding(
    model_name="models/text-embedding-004",
    api_key=os.getenv("GEMINI_API_KEY"),
)

In [14]:
# Wrap the Chroma collection in a LlamaIndex vector store so nodes can be read back from it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [15]:
# Retrieve node objects for every id returned by the collection.get() call.
nodes = vector_store.get_nodes(node_ids=nodes_info['ids'])

In [16]:
# Number of nodes retrieved; compare with the collection count shown earlier.
len(nodes)

1016

In [18]:
# add the nodes to the docstore
# from llama_index.core.storage.docstore import SimpleDocumentStore
# docstore = SimpleDocumentStore()
# docstore.add_documents(nodes)
# docstore.persist('path_to_folder/docstore.json')

In [43]:
# Copy each stored Chroma embedding onto its node so the vectors can be uploaded as they are.
# Embeddings are matched to nodes by id rather than by list position: `nodes`
# comes from a separate get_nodes() call, and nothing here guarantees that it
# is ordered like nodes_info['embeddings'].
embedding_by_id = dict(zip(nodes_info['ids'], nodes_info['embeddings']))

nodes_without_embedding = [
    node.node_id for node in nodes if node.node_id not in embedding_by_id
]
if nodes_without_embedding:
    raise KeyError(
        f"No Chroma embedding found for {len(nodes_without_embedding)} node(s), "
        f"e.g. {nodes_without_embedding[0]!r}"
    )

for node in nodes:
    node.embedding = embedding_by_id[node.node_id].tolist()

In [44]:
# Close the chromadb process.
# terminate() only sends the signal; wait() reaps the child so it does not
# linger as a zombie, and kill() is the fallback if it ignores the request.
process.terminate()
try:
    process.wait(timeout=10)
except subprocess.TimeoutExpired:
    process.kill()
    process.wait()

In [45]:
from pinecone import Pinecone, ServerlessSpec

In [46]:
# Read the key with os.environ[...] so a missing PINECONE_API_KEY fails here with a KeyError.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=PINECONE_API_KEY)

In [47]:
index_name = 'documentation-agent'
# Create the serverless index only when it does not exist yet, so re-running
# this cell leaves an existing index untouched.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        vector_type="dense",
        # Use the dimension measured from the Chroma embeddings instead of a
        # literal, so a new index always fits the vectors being uploaded
        # (768 for gemini text-embedding-004).
        dimension=embedding_dimensions,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled",
        tags={
            "environment": "development",
            "model": "gemini/text-embedding-004"
        }
    )

In [None]:
# Look up the configuration of the target index.
# Both names are reset first so a value left over from an earlier run of this
# cell (for example with a different index_name) can never be reused.
index_dimension = None
index_type = None

indexes_info = pc.list_indexes()
for index_info in indexes_info:
    if index_info['name'] == index_name:
        index_dimension = index_info['dimension']
        # NOTE(review): create_index() and the index stats use 'vector_type';
        # confirm that 'type' is a valid key on the listing entries.
        index_type = index_info['type']
        break

if index_dimension is None:
    # Without this check the dimension comparison that follows would fail
    # with an unrelated NameError or compare against None.
    raise LookupError(f"Pinecone index {index_name!r} was not found in pc.list_indexes().")

In [None]:
# Stop before uploading if the Chroma vectors do not have the dimension the Pinecone index reports.
if embedding_dimensions != index_dimension:
    raise ValueError(f"Embedding dimensions are {embedding_dimensions}, but should be {index_dimension}.")

In [48]:
from llama_index.vector_stores.pinecone import PineconeVectorStore

In [49]:
# Handle to the target index, used to build the Pinecone vector store.
pinecone_index = pc.Index(name=index_name)

In [56]:
# Show the current index statistics (dimension, metric and per-namespace vector counts).
pinecone_index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'google_genai-api': {'vector_count': 1032},
                'google_genai-docs': {'vector_count': 1016}},
 'total_vector_count': 2048,
 'vector_type': 'dense'}

In [50]:
# Pinecone-backed vector store; the namespace is named after the source Chroma collection.
# NOTE: this rebinds `vector_store`, which earlier referred to the Chroma store.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index, namespace=collection_name)

In [51]:
# Upload the nodes to the Pinecone namespace and keep the value returned by add().
# NOTE(review): the nodes carry the embeddings copied from Chroma; presumably they are
# uploaded without re-embedding — confirm.
node_ids = vector_store.add(nodes)

Upserted vectors: 100%|██████████| 1032/1032 [00:07<00:00, 132.17it/s]
