In [6]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
import voyageai

In [7]:
# The Voyage credential is taken from the VOYAGE_API environment variable so
# that no key is stored in the notebook.
# NOTE(review): the lookup yields None when the variable is unset — confirm
# how voyageai.Client() behaves in that case.
voyageai.api_key = os.environ.get("VOYAGE_API")
vo = voyageai.Client()

In [8]:
# Module-level tokenizer that chunk_text uses to encode text into tokens and
# decode token windows back into strings.
# NOTE(review): this loads the 'voyageai/voyage-2' tokenizer, while
# get_embeddings defaults to model "voyage-law-2" — confirm both tokenize
# identically, or load the tokenizer that matches the embedding model.
tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage-2')

In [9]:
# Location of the pre-computed document embeddings. Set the EMBEDDINGS_PARQUET
# environment variable to run the notebook on another machine; the fallback is
# the original machine-specific path so existing runs are unaffected.
EMBEDDINGS_PARQUET = os.getenv(
    "EMBEDDINGS_PARQUET",
    "/Users/npatel237/LawChatBot/New_Embeddings_2025/embeddings_voyage.parquet",
)
# The code below reads the 'Embedding' and 'Processed_Content' columns of this frame.
doc = pd.read_parquet(EMBEDDINGS_PARQUET, engine="pyarrow")

In [17]:
# Free-text query for the similarity search: it is chunked, embedded and then
# compared against the stored document embeddings in the cells that follow.
sample = """
After the final decision, funds that have been given to or received by an official as a bribe and are received or submitted as evidence in any US court or before any of its officers shall

"""

In [18]:
def chunk_text(text, max_tokens=4096, overlap=512):
    """
    Split text into overlapping chunks measured in tokens.

    The text is encoded with the module-level ``tokenizer`` (without special
    tokens), cut into sliding windows of at most ``max_tokens`` tokens, and
    each window is decoded back into a string. Consecutive windows share
    ``overlap`` tokens. The last chunk is the first window that reaches the
    end of the token sequence, so no chunk consists solely of tokens already
    contained in the previous one.

    Args:
        text (str): The input text to be chunked.
        max_tokens (int): Maximum tokens per chunk. Must be positive.
            NOTE(review): the 4096 default was originally described as the
            voyage-law-2 limit — confirm against the model documentation.
        overlap (int): Tokens shared between consecutive chunks. Must satisfy
            0 <= overlap < max_tokens.

    Returns:
        list of str: Decoded text chunks in order; empty if the text encodes
        to no tokens.

    Raises:
        ValueError: If max_tokens is not positive, or overlap is negative or
            not smaller than max_tokens.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    # overlap >= max_tokens would make the window stand still or move
    # backwards (an endless loop); a negative overlap would skip tokens.
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    stride = max_tokens - overlap

    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(tokenizer.decode(tokens[start:start + max_tokens]))
        # Once a window has reached the end, any further window would only
        # repeat tokens that this chunk already contains.
        if start + max_tokens >= len(tokens):
            break
        start += stride

    return chunks

# Chunk the raw query text. `sample` is rebound from a str to a list of chunk
# strings here, so guard on the type: re-running this cell must not feed the
# already-chunked list back into chunk_text.
if isinstance(sample, str):
    sample = chunk_text(sample)

In [19]:
def get_embeddings(texts, model="voyage-law-2", batch_size=32, input_type=None, strict=False):
    """
    Compute embeddings using the VoyageAI Python client in batches.

    Each item is converted with ``str()`` and sent to the module-level client
    ``vo`` in slices of ``batch_size``. The embeddings of every successful
    batch are appended in input order.

    Args:
        texts (list of str | str): Text data to embed. A single string is
            treated as one text (not iterated character by character).
        model (str): The embedding model to use.
        batch_size (int): Number of texts per request. Must be positive.
        input_type (str | None): Forwarded to ``vo.embed`` as ``input_type``
            only when not None; the default leaves the request unchanged.
        strict (bool): If True, re-raise the first batch error after logging
            it. If False (default), a failed batch is logged and skipped.

    Returns:
        list: Embedding vectors. With ``strict=False`` a failed batch
        contributes nothing, so the result can be shorter than ``texts`` and
        no longer index-aligned with it.

    Raises:
        ValueError: If batch_size is not positive.
        Exception: Whatever ``vo.embed`` raised, only when ``strict`` is True.
    """
    if isinstance(texts, str):
        # Iterating a bare string would embed every character separately.
        texts = [texts]
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    texts = [str(text) for text in texts]
    # Only pass input_type when requested, so the default call is unchanged.
    extra_args = {} if input_type is None else {"input_type": input_type}

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        try:
            response = vo.embed(batch, model=model, **extra_args)
            embeddings.extend(response.embeddings)
        except Exception as e:
            print(f"Error processing batch {i // batch_size + 1}: {e}")
            if strict:
                raise

    return embeddings

# Embed the query chunks with get_embeddings' default model and batch size;
# the result is a list of embedding vectors.
query_embedding = get_embeddings(sample)


In [20]:
# One row per query chunk. Flattening several chunk embeddings into a single
# row (reshape(1, -1)) would yield a vector whose width no longer matches the
# document embeddings; atleast_2d gives the same (1, dim) result for a single
# chunk and keeps multiple chunks on separate rows.
query_embedding = np.atleast_2d(np.asarray(query_embedding))


# Convert the embeddings from a Pandas Series to a NumPy array
embeddings = np.vstack(doc['Embedding'].values)  # Stack into a 2D array

print(type(embeddings))  # Should print <class 'numpy.ndarray'>

# Fail with a clear message when the query has no usable embedding (for
# example because get_embeddings returned an empty list) or a different width.
if query_embedding.shape[1] != embeddings.shape[1]:
    raise ValueError(
        f"Query embedding width {query_embedding.shape[1]} does not match "
        f"document embedding width {embeddings.shape[1]}"
    )

# Compute cosine similarity: rows are query chunks, columns are documents
similarities = cosine_similarity(query_embedding, embeddings)
print(len(similarities[0]))

# Get the index of the most similar document. Each document is scored by its
# best-matching query chunk, so the result is always a document position
# (a flat argmax over the 2D matrix would not be when there are several rows).
most_similar_index = np.argmax(similarities.max(axis=0))

print("Most similar document index:", most_similar_index)

<class 'numpy.ndarray'>
1647
Most similar document index: 670


In [21]:
# Display the best-matching document. np.argmax yields a position, so look it
# up with .iloc, and use the computed index rather than a hard-coded row
# number that goes stale as soon as the query or the data changes.
doc['Processed_Content'].iloc[most_similar_index]

'[[\'Moneys received or tendered in evidence in any United States Court, or before any officer thereof, which have been paid to or received by any official as a bribe, shall, after the final disposition of the case, proceeding or investigation, be deposited in the registry of the court to be disposed of in accordance with the order of the court, to be subject, however, to the provisions of section 2042 of Title 28.\\n(June 25, 1948, ch. 645, 62 Stat. 840; May 24, 1949, ch. 139, §55, 63 Stat. 96; renumbered §3666, Pub. L. 98–473, title II, §212(a)(1), Oct. 12, 1984, 98 Stat. 1987.)\\n\\nHistorical and Revision Notes\\n1948 Act\\nBased on title 18, U.S.C., 1940 ed., §570 (Jan. 7, 1925, ch. 33, 43 Stat. 726).\\nChanges were made in phraseology.\\n\\n1949 Act\\nThis section [section 55] corrects section 3612 of title 18, U.S.C., so that the reference in such section will be to the correct section number in title 28, U.S.C., as revised and enacted in 1948.\\n\\nEditorial Notes\\nAmendments\