In [1]:
%pip install -U voyageai


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import os
import voyageai
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time
from requests.exceptions import HTTPError
import numpy as np

def vectorise(collection_name: str):
    # Load environment variables from .env file
    load_dotenv()

    path = "data/website_data.txt"

    VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
    QDRANT_URL = os.getenv("QDRANT_URL")

    # Configure Voyage client
    vo = voyageai.Client(api_key=VOYAGE_API_KEY)

    # Initialize Qdrant client
    client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

    # Function to split text into chunks
    def make_chunks(inptext: str):
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n"],
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
        )
        chunks = text_splitter.create_documents([inptext])
        return chunks

    # Load texts from the provided file
    file_path = path
    with open(file_path, 'r', encoding='utf-8') as file:
        full_text = file.read()

    texts = make_chunks(full_text)

    # Retry mechanism for handling rate limits
    max_retries = 5
    retry_delay = 10  # seconds
    batch_size = 10  # Number of chunks per batch

    all_embeddings = []
    for attempt in range(max_retries):
        try:
            # Process text in batches
            for i in range(0, len(texts), batch_size):
                batch_texts = [chunk.page_content for chunk in texts[i:i+batch_size]]
                results = vo.embed(
                    batch_texts,
                    model="voyage-2",
                    input_type="document"
                ).embeddings
                
                all_embeddings.extend(results)  # Collect all embeddings

            break  # Exit loop if successful
        except HTTPError as e:
            if e.response.status_code == 429:  # Rate limit error
                print(f"Rate limit exceeded. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff
            else:
                raise  # Re-raise exception if not a rate limit error
        except Exception as e:
            print(f"An error occurred: {e}")
            raise  # Re-raise exception if it's not a rate limit error

    # Check if the collection already exists
    try:
        client.get_collection(collection_name=str(collection_name))
        print("Collection already exists.")
    except Exception as e:
        print("Creating collection...")
        client.create_collection(
            collection_name=str(collection_name),
            vectors_config=VectorParams(size=1024, distance=Distance.COSINE)  # Changed to match voyage-2 model
        )

    # Prepare points to be inserted into Qdrant
    points = [
        PointStruct(
            id=idx,
            vector=embedding,
            payload={"text": chunk.page_content},
        )
        for idx, (embedding, chunk) in enumerate(zip(all_embeddings, texts))
    ]

    # Insert the points into the Qdrant collection
    client.upsert(
        collection_name=str(collection_name),
        points=points
    )

    print("Collection created and points upserted.")

    return collection_name


In [17]:
collection_name = vectorise('voyager_rag')

An error occurred: You have not yet added your payment method in the billing page and will have reduced rate limits of 3 RPM and 10K TPM.  Please add your payment method in the billing page (https://dash.voyageai.com/billing/payment-methods) to unlock our standard rate limits (https://docs.voyageai.com/docs/rate-limits).  Even with payment methods entered, the free tokens (50M tokens per model) will still apply.


RateLimitError: You have not yet added your payment method in the billing page and will have reduced rate limits of 3 RPM and 10K TPM.  Please add your payment method in the billing page (https://dash.voyageai.com/billing/payment-methods) to unlock our standard rate limits (https://docs.voyageai.com/docs/rate-limits).  Even with payment methods entered, the free tokens (50M tokens per model) will still apply.