In [1]:
import os
from CameraDocument import CameraDocument
from glob import glob

# Camera model names; each one is a sub-directory of `json_dir`.
camera_model_list = ["gfx100ii", "x-e4", "x-s20", "x-t5", "x100v"]
json_dir = "./data/json"
# Embedding model name; it is also the directory level below each camera model.
embedding_model = "text-embedding-3-small"

# Load every JSON file found under <json_dir>/<camera_model>/<embedding_model>/
# into a CameraDocument. Later cells read `metadata`, `embedding_result` and
# `parsing_result` from these objects.
documents = []
for camera_model in camera_model_list:
    detail_json_dir = os.path.join(json_dir, camera_model, embedding_model)
    json_path = os.path.join(detail_json_dir, "*.json")
    # glob() yields matches in a filesystem-dependent order; sort them so the
    # order of `documents` is the same on every run and every machine.
    json_list = sorted(glob(json_path))

    for path in json_list:
        document = CameraDocument()
        document.load_json_with_vector(path)
        documents.append(document)

In [2]:
# Sanity check: how many CameraDocument objects were loaded across all camera models.
len(documents)

1843

In [3]:
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv

# Read variables from the local .env file; override=True lets values from the
# file replace ones already present in the process environment.
load_dotenv(dotenv_path=".env", override=True)

# Fail fast with an actionable message instead of handing None to the client.
if not os.getenv("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to .env or export it before running this cell"
    )

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

spec = ServerlessSpec(cloud="aws", region="us-east-1")

index_name = 'camera-document'

# Upper bound (seconds) on how long to wait for a freshly created index.
index_ready_timeout_s = 300

# check if index already exists (it shouldn't if this is your first run)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-3-small
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized, but never spin forever: give up once
    # the deadline passes so a stalled creation surfaces as an error
    index_ready_deadline = time.monotonic() + index_ready_timeout_s
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() > index_ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after "
                f"{index_ready_timeout_s} seconds"
            )
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats (last expression, so the notebook displays it)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [5]:
def upload_to_pinecone(index, documents, batch_size=100, pause_seconds=0.1):
    """Upsert the documents' embeddings and metadata into a Pinecone index.

    One vector is built per document. Its id is ``"<model>_page<page>"``, its
    values are ``doc.embedding_result`` and its metadata carries the page,
    model, chapter, section, subsection and ``doc.parsing_result``.

    Args:
        index: Object exposing ``upsert(vectors=[...])``.
        documents: Iterable of objects exposing ``metadata`` (supports
            ``.get``), ``embedding_result`` and ``parsing_result``.
        batch_size: Maximum number of vectors sent per ``upsert`` call.
        pause_seconds: Time to sleep after each batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() fail obscurely and a negative step would
        # silently upload nothing, so reject both up front.
        raise ValueError("batch_size must be a positive integer")

    vectors = []
    for doc in documents:
        page = doc.metadata.get("page", "unknown")
        model = doc.metadata.get("model", "unknown")
        # NOTE(review): ids are derived from model + page only, so documents that
        # share both overwrite each other on upsert - confirm there is exactly
        # one document per (model, page).
        doc_id = f"{model}_page{page}"

        metadata = {
            "page": page,
            "model": model,
            "chapter": doc.metadata.get("chapter"),
            "section": doc.metadata.get("section"),
            "subsection": doc.metadata.get("subsection"),
            "parsing_result": doc.parsing_result,
        }
        # Pinecone does not support null metadata values; omit missing fields
        # rather than sending None.
        metadata = {key: value for key, value in metadata.items() if value is not None}

        vectors.append(
            {
                "id": doc_id,
                "values": doc.embedding_result,
                "metadata": metadata,
            }
        )

    # Send the vectors in fixed-size batches, pausing briefly between requests.
    for start in range(0, len(vectors), batch_size):
        batch = vectors[start:start + batch_size]
        index.upsert(vectors=batch)
        time.sleep(pause_seconds)


# Push every loaded document into the Pinecone index connected above.
upload_to_pinecone(index, documents)