In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
import voyageai

In [2]:
# Take the VoyageAI key from the VOYAGE_API environment variable instead of hardcoding it.
# NOTE(review): os.getenv returns None when the variable is unset — confirm the client
# then fails loudly rather than on the first embed call.
voyageai.api_key = os.getenv("VOYAGE_API")
# Shared client; get_embeddings calls vo.embed on it.
vo = voyageai.Client()

In [3]:
# Load the tokenizer published as 'voyageai/voyage-2'; chunk_text uses it to count and slice tokens.
# NOTE(review): get_embeddings defaults to the voyage-law-2 model — confirm this tokenizer
# produces the same token counts as that model.
tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage-2')

tokenizer_config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [4]:
# Load the stored document table; later cells read its 'Embedding' and 'Processed_Content' columns.
# NOTE(review): the relative path presumably assumes the kernel's working directory is the
# notebook's own folder — confirm.
doc = pd.read_parquet("../New_Embeddings_2025/embeddings_voyage.parquet", engine="pyarrow")

In [53]:
# Query text that the following cells chunk, embed and compare against the stored document vectors.
# NOTE(review): the wording contains the fragment "eac" — presumably a deliberately
# imperfect paraphrase used to probe retrieval; confirm before "fixing" it, since the
# string is embedded exactly as written.
sample = """
Health Care Service Fees. — (1)  Generally speaking.— The Director may evaluate and collect a fee for medical services rendered in conjunction with eac in compliance with this section and any regulations the Director may establish to implement this section.
"""

In [54]:
def chunk_text(text, max_tokens=4096, overlap=512):
    """
    Split text into overlapping token windows and decode each window back to text.

    The text is tokenized with the module-level ``tokenizer``. A window of
    ``max_tokens`` tokens slides forward by ``max_tokens - overlap`` tokens and
    stops as soon as a window reaches the end of the token sequence, so the
    final chunk is never a mere repeat of the previous chunk's tail.

    Args:
        text (str): The input text to be chunked.
        max_tokens (int): Maximum tokens per chunk. Must be positive.
        overlap (int): Tokens shared by consecutive chunks. Must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        list of str: Decoded chunks in document order. Empty when the text
        yields no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_tokens``.
    """
    # Validate before tokenizing: a stride of zero or less would never advance
    # the window, and a negative overlap would silently skip tokens.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )

    tokens = tokenizer.encode(text, add_special_tokens=False)
    stride = max_tokens - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokenizer.decode(tokens[start:start + max_tokens]))
        # This window already covers the last token; another window would only
        # contain tokens that are entirely inside the chunk just emitted.
        if start + max_tokens >= len(tokens):
            break

    return chunks

# Rebinds `sample` from the raw query string to the list of chunks returned by chunk_text.
# NOTE(review): because the name is reused, re-running this cell on its own passes that
# list back into chunk_text; re-run the cell that defines `sample` first.
sample = chunk_text(sample)

In [55]:
def get_embeddings(texts, model="voyage-law-2", batch_size=32):
    """
    Compute embeddings with the module-level VoyageAI client ``vo``, in batches.

    Args:
        texts (str or iterable): Text data to embed. A single string is treated
            as one text; every other item is converted with ``str()``.
        model (str): The embedding model to use.
        batch_size (int): Number of texts per request. Must be positive.

    Returns:
        list: Embedding vectors in input order. A batch whose request raises is
        reported on stdout and skipped, so the result can be shorter than
        ``texts`` and is then no longer aligned with it index by index.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # A bare string is itself iterable: without this guard it would be split
    # into characters and every character embedded as its own text.
    if isinstance(texts, str):
        texts = [texts]

    texts = [str(text) for text in texts]
    embeddings = []

    batch_starts = range(0, len(texts), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = texts[start:start + batch_size]

        try:
            response = vo.embed(batch, model=model)
            embeddings.extend(response.embeddings)
        except Exception as e:
            # Deliberately best-effort: report the failed batch and carry on
            # with the remaining ones.
            print(f"Error processing batch {batch_number}: {e}")

    return embeddings

# Embed the query chunks; the result is a list holding one vector per chunk whose batch succeeded.
query_embedding = get_embeddings(sample)


In [57]:
# Lay the embedded query out as a single row so it can be scored against every document
query_embedding = np.reshape(np.array(query_embedding), (1, -1))


# Stack the per-document vectors of the 'Embedding' column into one 2D matrix
embeddings = np.vstack(doc['Embedding'].to_numpy())

print(type(embeddings))  # Should print <class 'numpy.ndarray'>

# Score the query row against each document row
similarities = cosine_similarity(query_embedding, embeddings)
print(similarities.shape[1])

# Position of the best score
most_similar_index = similarities.argmax()

print("Most similar document index:", most_similar_index)

<class 'numpy.ndarray'>
1647
Most similar document index: 789


In [58]:
# Score the query row against every document row
similarities = cosine_similarity(query_embedding, embeddings)

# argsort is ascending, so stepping backwards from the end and stopping before the
# sixth-from-last entry yields the five best positions, best first
top_5_indices = np.argsort(similarities[0])[:-6:-1]

print("Top 5 most similar document indices:", top_5_indices)

Top 5 most similar document indices: [789 779 318 676 772]


[ 670  231  861 1179 1429]-----------[670 861 860 231 862]
[ 345 1580 1581 1573  203]-----------[ 345 1580 1581  203  508]
[1186 1183 1181 1151 1041]----------- [1186 1181 1151 1041 1183]
[789 779 772 318 250]----------------- [789 779 318 676 772]

In [41]:
# Show the stored text for entry 789, the value printed as the most similar document index.
# NOTE(review): 789 is hard-coded and looked up by index label, whereas the similarity
# cells produce row positions — confirm the frame's index is the default 0..n-1 range.
print(doc['Processed_Content'][789])

[['(a) Definitions.—In this section—\n\n(1) the term “account” means the trust fund account (or institutional equivalent) of a prisoner;\n\n(2) the term “Director” means the Director of the Bureau of Prisons;\n\n(3) the term “health care provider” means any person who is—\n\n(A) authorized by the Director to provide health care services; and\n\n(B) operating within the scope of such authorization;\n\n(4) the term “health care visit”—\n\n(A) means a visit, as determined by the Director, by a prisoner to an institutional or noninstitutional health care provider; and\n\n(B) does not include a visit initiated by a prisoner—\n\n(i) pursuant to a staff referral; or\n\n(ii) to obtain staff-approved follow-up treatment for a chronic condition; and\n\n(5) the term “prisoner” means—\n\n(A) any individual who is incarcerated in an institution under the jurisdiction of the Bureau of Prisons; or\n\n(B) any other individual, as designated by the Director, who has been charged with or convicted of an