In [None]:
import pandas as pd
import os
import huggingface_hub
from transformers import AutoTokenizer
import voyageai

In [17]:
# Location of the processed Title 18 sections CSV. Set the TITLE18_CSV_PATH
# environment variable to read the file from somewhere else; when it is unset
# the original local path is used, so existing runs are unaffected.
TITLE18_CSV_PATH = os.getenv(
    "TITLE18_CSV_PATH",
    "/Users/npatel237/LawChatBot/Title18_CSV_Data/Title18_processed_sections.csv",
)

doc = pd.read_csv(TITLE18_CSV_PATH, encoding="utf-8")
# Handle on the text column exactly as it was loaded from disk.
processed_content = doc["Processed_Content"]

In [None]:
# Read the API key from the VOYAGE_API environment variable and register it on
# the voyageai module before the client is constructed (None if it is unset).
_voyage_api_key = os.environ.get("VOYAGE_API")
voyageai.api_key = _voyage_api_key

# Notebook-wide client used by the embedding cells below.
vo = voyageai.Client()

Tokenization:

In [19]:
# Hugging Face checkpoint whose tokenizer is used for counting tokens.
_TOKENIZER_CHECKPOINT = 'voyageai/voyage-2'
tokenizer = AutoTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
# Tokenizers already loaded by _get_tokenizer, keyed by checkpoint name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(name="voyageai/voyage-2"):
    """Return the tokenizer for checkpoint `name`, loading it only on first use."""
    if name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[name] = AutoTokenizer.from_pretrained(name)
    return _TOKENIZER_CACHE[name]


def chunk_text(text, max_tokens=4096, overlap=512, tokenizer=None):
    """
    Splits text into overlapping chunks of at most `max_tokens` tokens.

    Uses a sliding window: each chunk starts `max_tokens - overlap` tokens
    after the previous one. Windowing stops as soon as a chunk reaches the end
    of the text, so no trailing chunk made only of already-covered overlap
    tokens is produced.

    NOTE(review): the default tokenizer is the "voyageai/voyage-2" checkpoint,
    while the embedding cell uses the "voyage-law-2" model - confirm that the
    two count tokens the same way.

    Args:
        text (str): The input text to be chunked.
        max_tokens (int): Maximum tokens per chunk.
        overlap (int): Tokens shared between consecutive chunks; must satisfy
            0 <= overlap < max_tokens.
        tokenizer: Optional object providing
            `encode(text, add_special_tokens=False)` and `decode(tokens)`.
            When omitted, the "voyageai/voyage-2" tokenizer is loaded once and
            reused on later calls.

    Returns:
        list of str: List of text chunks (empty if the text has no tokens).

    Raises:
        ValueError: If `overlap` is negative or not smaller than `max_tokens`
            (the window would never advance, or would skip tokens).
    """
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens "
            f"(got overlap={overlap}, max_tokens={max_tokens})"
        )

    if tokenizer is None:
        # Loading from the hub/disk is slow; do it once, not once per row.
        tokenizer = _get_tokenizer()

    tokens = tokenizer.encode(text, add_special_tokens=False)
    stride = max_tokens - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        end = start + max_tokens
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end >= len(tokens):
            # This window already covers the tail of the text; another window
            # would only repeat overlap tokens.
            break

    return chunks

def _as_chunk_list(content):
    """Return `content` as a list of chunks; short content becomes a one-item list."""
    # The threshold is a character count (len of the value), used as a cheap
    # pre-filter before the token-based splitting done by chunk_text.
    if len(content) > 4096:
        return chunk_text(content)
    return [content]


# NOTE(review): this overwrites doc["Processed_Content"] with lists in place,
# so re-running the cell without reloading `doc` would wrap the lists again.
doc["Processed_Content"] = doc["Processed_Content"].apply(_as_chunk_list)

# One row per chunk, with a fresh 0..n-1 index.
_rows_per_chunk = doc.explode("Processed_Content")
df_exploded = _rows_per_chunk.reset_index(drop=True)

In [None]:
def get_embeddings(texts, model="voyage-law-2", batch_size=32, client=None):
    """
    Compute embeddings using the VoyageAI Python client in batches.

    The returned list is always index-aligned with `texts`: if a batch request
    fails, the error is printed and a `None` placeholder is stored for every
    text in that batch, so the result can be assigned directly as a DataFrame
    column and failed rows can be found and retried afterwards.

    Args:
        texts (list): Items to embed; each one is converted with `str()` first.
        model (str): The embedding model to use.
        batch_size (int): Number of texts per request; must be at least 1.
        client: Optional object exposing `embed(batch, model=...)` whose result
            has an `embeddings` attribute. Defaults to the notebook-level `vo`
            client.

    Returns:
        list: One entry per input text - the embedding vector, or `None` when
        the batch containing that text failed.

    Raises:
        ValueError: If `batch_size` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 (got {batch_size})")

    if client is None:
        client = vo  # notebook-level VoyageAI client

    texts = [str(text) for text in texts]
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        try:
            response = client.embed(batch, model=model)
            embeddings.extend(response.embeddings)
        except Exception as e:
            # Best effort: report the failure and keep going, but reserve the
            # slots so later embeddings do not shift onto the wrong texts.
            print(f"Error processing batch {i // batch_size + 1}: {e}")
            embeddings.extend([None] * len(batch))

    return embeddings

# Force the text column to str so that every value sent for embedding is text.
_content_as_str = df_exploded["Processed_Content"].astype(str)
df_exploded["Processed_Content"] = _content_as_str

# Embed every row's text and store the vectors alongside it.
_texts_to_embed = df_exploded["Processed_Content"].tolist()
df_exploded["Embedding"] = get_embeddings(_texts_to_embed)


In [None]:
# Persist the chunked text and its embeddings to a Parquet file in the
# current working directory, written with the pyarrow engine.
_embeddings_path = "embeddings_voyage.parquet"
df_exploded.to_parquet(_embeddings_path, engine="pyarrow")