In [12]:
import pandas as pd
import os
import huggingface_hub
from transformers import AutoTokenizer
import voyageai
from tqdm import tqdm


In [13]:
# Read the Voyage API key from the VOYAGE_API environment variable.
# os.getenv returns None when the variable is unset, so nothing fails on this line.
voyageai.api_key = os.getenv("VOYAGE_API")
# Shared client used by get_embeddings.
# NOTE(review): presumably Client() picks up voyageai.api_key set above — confirm
# against the voyageai client documentation.
vo = voyageai.Client()

In [14]:
# Load the tokenizer published as 'voyageai/voyage-2'; chunk_text uses it to
# encode text to token ids and decode id windows back to text.
# NOTE(review): get_embeddings defaults to model="voyage-law-2" — confirm this
# tokenizer matches that model, otherwise chunk sizes may be off.
tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage-2')

In [4]:
def chunk_text(text, max_tokens=4096, overlap=512, tok=None):
    """
    Split text into overlapping, token-bounded chunks with a sliding window.

    The text is encoded to token ids, cut into windows of at most
    ``max_tokens`` ids whose starts are ``max_tokens - overlap`` apart, and
    each window is decoded back to a string.

    Args:
        text (str): The input text to be chunked.
        max_tokens (int): Maximum tokens per chunk. Must be positive.
        overlap (int): Tokens shared by consecutive chunks. Must satisfy
            ``0 <= overlap < max_tokens`` so the window always moves forward.
        tok: Optional object exposing ``encode(text, add_special_tokens=False)``
            and ``decode(ids)``. Defaults to the notebook-level ``tokenizer``.

    Returns:
        list of str: Text chunks in document order; an empty list when the
        text encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is
            outside ``[0, max_tokens)``.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        # overlap >= max_tokens would give a non-positive stride, i.e. a loop
        # that never reaches the end of the token list.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got {overlap}"
        )

    if tok is None:
        tok = tokenizer

    tokens = tok.encode(text, add_special_tokens=False)
    stride = max_tokens - overlap

    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunks.append(tok.decode(tokens[start:end]))
        if end >= len(tokens):
            # This window already reached the last token; another step would
            # only emit a chunk fully contained in this one.
            break
        start += stride

    return chunks

In [5]:
def get_embeddings(texts, model="voyage-law-2", batch_size=32, client=None):
    """
    Compute embeddings using the VoyageAI Python client in batches.

    The result is always index-aligned with ``texts``. When a batch fails, the
    error is printed and ``None`` is stored for every text in that batch, so
    the returned list can still be assigned as a DataFrame column and the
    failed rows can be found and retried.

    Args:
        texts (list of str): Text data to embed; every item is passed
            through ``str()`` first.
        model (str): The embedding model to use.
        batch_size (int): Number of texts per request. Must be positive.
            NOTE(review): the service may cap items/tokens per request —
            confirm that 32 chunks of up to 4096 tokens fit the model's limit.
        client: Optional object exposing ``embed(list_of_str, model=...)``
            whose result has an ``embeddings`` attribute. Defaults to the
            notebook-level ``vo`` client.

    Returns:
        list: One entry per input text, in input order: the embedding vector,
        or ``None`` if the batch containing that text failed.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if client is None:
        client = vo

    texts = [str(text) for text in texts]

    embeddings = []
    failed_batches = 0

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        try:
            batch_embeddings = list(client.embed(batch, model=model).embeddings)
            if len(batch_embeddings) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                )
        except Exception as e:
            print(f"Error processing batch {i // batch_size + 1}: {e}")
            # Keep positions aligned with `texts` instead of silently
            # shortening the output.
            batch_embeddings = [None] * len(batch)
            failed_batches += 1

        embeddings.extend(batch_embeddings)

    if failed_batches:
        print(f"{failed_batches} batch(es) failed; their entries are None.")

    return embeddings

---
Embedding per SECTIONS through ['Processed_Content']:


In [16]:
# Load Title18_processed_sections.csv into a DataFrame.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# with the same directory layout.
doc = pd.read_csv('/Users/npatel237/LawChatBot/Title18_CSV_Data/Title18_processed_sections.csv', encoding='utf-8')
# Handle on the text column; not referenced again in the cells visible here.
processed_content = doc['Processed_Content']

Tokenization:

In [None]:
# Split long sections into token-bounded chunks, producing one row per chunk.
#
# The chunk lists are built in a separate Series instead of overwriting
# doc["Processed_Content"], so re-running this cell starts from the raw text
# rather than wrapping already-chunked lists a second time.
#
# Non-string cells (e.g. NaN from empty CSV fields) pass through as a single
# "chunk" instead of raising TypeError on len().
#
# NOTE(review): 4096 below is a character count, while chunk_text's limit is in
# tokens — this assumes a text of <= 4096 characters never exceeds 4096 tokens;
# confirm for this tokenizer.
section_chunks = doc["Processed_Content"].apply(
    lambda x: chunk_text(x) if isinstance(x, str) and len(x) > 4096 else [x]
)
df_exploded = (
    doc.assign(Processed_Content=section_chunks)
    .explode("Processed_Content")
    .reset_index(drop=True)
)

Embedding:

In [None]:
# Coerce every chunk to str once, store it back, and embed those same values.
chunk_strings = df_exploded["Processed_Content"].astype(str)
df_exploded["Processed_Content"] = chunk_strings
df_exploded["Embedding"] = get_embeddings(chunk_strings.tolist())

In [None]:
# Persist the per-chunk rows (text plus Embedding column) to a parquet file in
# the current working directory, written with the pyarrow engine.
df_exploded.to_parquet("embeddings_voyage.parquet", engine="pyarrow")

---
Embedding per semantically chunked pages through ['chunk']:

In [21]:
# Rebind `doc` to the pre-chunked CSV (one text chunk per row in column 'chunk');
# the sections DataFrame loaded earlier is no longer reachable under this name.
# NOTE(review): absolute, user-specific path — only resolves on the author's machine.
doc = pd.read_csv('/Users/npatel237/LawChatBot/Title18_CSV_Data/chunked_title_18semchunk_pages.csv', encoding='utf-8')

In [22]:
# Preview the first five rows to check the loaded columns.
doc.head(5)

Unnamed: 0,chunk
0,6001\nImmunity of Witnesses\nV.\n5001\nCorrect...
1,"37\n756, 3058\n38\nT. 22 §465\n39\n5, 3241\n51..."
2,"79\n1003\n80\n287, 1001\n81\n289\n82\n641, 136..."
3,123\n912\n124\n211\n125\n543\n126\n541\n127\n1...
4,199\n205\n200\n204\n201\n1913\n202\n216\n203\n...


In [23]:
# Replace missing chunks with "" BEFORE casting to str: astype(str) first would
# turn NaN into the literal text "nan", leaving nothing for fillna to fill.
# NOTE(review): empty strings are then sent to the embedding API — confirm it
# accepts them, or drop empty rows before embedding.
doc['chunk'] = doc['chunk'].fillna("").astype(str)

In [24]:
# Embed every page chunk and store the vectors next to their text.
page_chunks = doc["chunk"].tolist()
doc["Embedding"] = get_embeddings(page_chunks)

# Save text and embeddings together so they can be reloaded without re-embedding.
doc.to_parquet("embeddings_voyage_per_pages_semchunked.parquet", engine="pyarrow")

In [25]:
# Reload the parquet file written in the previous cell, rebinding `doc` to the
# on-disk version (a round-trip check of the saved data).
doc = pd.read_parquet("embeddings_voyage_per_pages_semchunked.parquet", engine="pyarrow")

In [26]:
# Preview the reloaded frame, including the Embedding column.
doc.head()

Unnamed: 0,chunk,Embedding
0,6001\nImmunity of Witnesses\nV.\n5001\nCorrect...,"[-0.0461999773979187, -0.024502042680978775, 0..."
1,"37\n756, 3058\n38\nT. 22 §465\n39\n5, 3241\n51...","[-0.00995637383311987, 0.008323794230818748, 0..."
2,"79\n1003\n80\n287, 1001\n81\n289\n82\n641, 136...","[-0.011511960998177528, 0.031173910945653915, ..."
3,123\n912\n124\n211\n125\n543\n126\n541\n127\n1...,"[0.011239621788263321, 0.025644589215517044, 0..."
4,199\n205\n200\n204\n201\n1913\n202\n216\n203\n...,"[-0.015470130369067192, 0.024292832240462303, ..."
