In [16]:
import pandas as pd
import os
import huggingface_hub
from transformers import AutoTokenizer


In [17]:
doc = pd.read_csv('/Users/npatel237/LawChatBot/Title18_CSV_Data/Title18_processed_sections.csv', encoding='utf-8')
processed_content = doc['Processed_Content']

In [18]:
import voyageai

voyageai.api_key = os.getenv("VOYAGE_API")
vo = voyageai.Client()

Tokenziation:

In [19]:
tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage-2')

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [25]:
def chunk_text(text, max_tokens=4096, overlap=512):
    """
    Splits text into chunks based on the token limit of voyage-law-2 tokenizer.
    Uses a sliding window approach with overlap.
    
    Args:
        text (str): The input text to be chunked.
        max_tokens (int): Maximum tokens per chunk (4096 for voyage-law-2).
        overlap (int): Overlapping tokens to maintain context between chunks.

    Returns:
        list of str: List of text chunks.
    """
    tokenizer = AutoTokenizer.from_pretrained("voyageai/voyage-2")
    tokens = tokenizer.encode(text, add_special_tokens=False)

    chunks = []
    start = 0
    while start < len(tokens):
        chunk = tokens[start:start + max_tokens]
        chunks.append(tokenizer.decode(chunk))
        start += max_tokens - overlap  # Sliding window approach

    return chunks

# Apply chunking to the dataset
doc["Processed_Content"] = doc["Processed_Content"].apply(lambda x: chunk_text(x) if len(x) > 4096 else [x])
df_exploded = doc.explode("Processed_Content").reset_index(drop=True)

In [26]:
df_exploded

Unnamed: 0,Section,Url,Content,Metadata,Processed_Content,Processed_Section,Processed_Metadata
0,103. Ã¢ÂÂFront Matter,https://uscode.house.gov/view.xhtml?req=granul...,18 USC Ch. 103: Front Matter Result...,"Amendments1992-Pub. L. 102Ã¢ÂÂ519, title I, ...",[[TITLE 18 / PART I / CHAPTER 103 / Front Matt...,103. Front Matter,"Amendments1992-Pub. L. 102519, title I, §101(c..."
1,2111. Special maritime and territorial jurisdi...,https://uscode.house.gov/view.xhtml?req=granul...,"Whoever, within the special maritime and terri...","Amendments1994-Pub. L. 103Ã¢ÂÂ322 inserted ""...","[[Whoever, within the special maritime and ter...",2111. Special maritime and territorial jurisdi...,"Amendments1994-Pub. L. 103322 inserted ""or att..."
2,2112. Personal property of United States,https://uscode.house.gov/view.xhtml?req=granul...,Whoever robs or attempts to rob another of any...,"Amendments1994-Pub. L. 103Ã¢ÂÂ322 inserted ""...",[[Whoever robs or attempts to rob another of a...,2112. Personal property of United States,"Amendments1994-Pub. L. 103322 inserted ""or att..."
3,2113. Bank robbery and incidental crimes,https://uscode.house.gov/view.xhtml?req=granul...,"(a) Whoever, by force and violence, or by inti...",References in TextSection 1(b) of the Internat...,"[[(a) Whoever, by force and violence, or by in...",2113. Bank robbery and incidental crimes,References in TextSection 1(b) of the Internat...
4,"2114. Mail, money, or other property of United...",https://uscode.house.gov/view.xhtml?req=granul...,(a) Assault.-A person who assaults any person ...,Amendments1996-Pub. L. 104Ã¢ÂÂ294 amended Pu...,[[(a) Assault.—A person who assaults any perso...,"2114. Mail, money, or other property of United...",Amendments1996-Pub. L. 104294 amended Pub. L. ...
...,...,...,...,...,...,...,...
1642,50. Prompt Disposition,https://uscode.house.gov/view.xhtml?req=granul...,Scheduling preference must be given to crimina...,,[[Scheduling preference must be given to crimi...,50. Prompt Disposition,
1643,51. Preserving Claimed Error,https://uscode.house.gov/view.xhtml?req=granul...,(a) Exceptions Unnecessary. Exceptions to ruli...,,[[(a) Exceptions Unnecessary. Exceptions to ru...,51. Preserving Claimed Error,
1644,52. Harmless and Plain Error,https://uscode.house.gov/view.xhtml?req=granul...,"(a) Harmless Error. Any error, defect, irregul...",,"[[(a) Harmless Error. Any error, defect, irreg...",52. Harmless and Plain Error,
1645,53. Courtroom Photographing and Broadcasting P...,https://uscode.house.gov/view.xhtml?req=granul...,Except as otherwise provided by a statute or t...,,[[Except as otherwise provided by a statute or...,53. Courtroom Photographing and Broadcasting P...,


In [29]:
def get_embeddings(texts, model="voyage-law-2", batch_size=32):
    """
    Compute embeddings using the VoyageAI Python client in batches.

    Args:
        texts (list of str): List of text data to embed.
        model (str): The embedding model to use.
        batch_size (int): Number of texts per batch.

    Returns:
        list: List of embedding vectors.
    """
    embeddings = []

    # Ensure texts is a list of strings (avoid nested lists or non-string values)
    texts = [str(text) for text in texts]  # Convert any non-string data to string

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]  # Get batch of texts
        
        try:
            response = vo.embed(batch, model=model)
            batch_embeddings = response.embeddings  # Extract embeddings
            embeddings.extend(batch_embeddings)  # Store results
        except Exception as e:
            print(f"Error processing batch {i // batch_size + 1}: {e}")

    return embeddings  # Returns a list of lists (each embedding is a list of floats)

# Ensure processed content is in a valid format before embedding
df_exploded["Processed_Content"] = df_exploded["Processed_Content"].astype(str)

# Generate embeddings
df_exploded["Embedding"] = get_embeddings(df_exploded["Processed_Content"].tolist())


In [None]:
df_exploded.to_parquet("embeddings_voyage.parquet", engine="pyarrow")