In [12]:
import langchain
import pandas as pd
import vertexai
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_core.documents import Document
import os
import numpy as np
import tiktoken
from tqdm import tqdm

In [13]:
# Point Google client libraries at a service-account key file.
# setdefault() only fills the variable in when it is not already set, so a
# GOOGLE_APPLICATION_CREDENTIALS value exported by the shell / deployment is
# respected instead of being overwritten by this machine-specific fallback.
# NOTE(review): the fallback is an absolute path under one user's home
# directory — move it to local, untracked configuration before sharing.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/npatel237/LawChatBot/gcpservicekey.json",
)

# Project and region handed to vertexai.init() below.
PROJECT_ID = "lawrag"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [14]:
# Shared embedding client for the "text-embedding-005" model; get_embeddings()
# reads this module-level name to embed each batch.
embeddings_model = VertexAIEmbeddings(model_name="text-embedding-005")

In [7]:
def chunk_text(text, max_tokens=4096, overlap=512):
    """
    Split text into overlapping windows of at most ``max_tokens`` tokens.

    Tokens are counted with tiktoken's ``cl100k_base`` encoding. Each window
    starts ``max_tokens - overlap`` tokens after the previous one, so adjacent
    chunks share ``overlap`` tokens. Chunking stops with the first window that
    reaches the end of the text, so no trailing chunk made up solely of
    already-covered tokens is produced.

    Args:
        text (str): Text to split.
        max_tokens (int): Maximum number of tokens per chunk; must be positive.
        overlap (int): Number of tokens shared by consecutive chunks; must
            satisfy ``0 <= overlap < max_tokens``.

    Returns:
        list of str: Decoded chunks in document order; empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or if ``overlap`` is
            negative or not smaller than ``max_tokens`` (the window would
            skip text or never advance).
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, "
            f"max_tokens={max_tokens}"
        )

    # NOTE(review): cl100k_base is an OpenAI encoding, so these counts only
    # approximate how the Vertex AI embedding model tokenizes the text —
    # confirm max_tokens against that model's documented input limit.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    step = max_tokens - overlap  # Distance between consecutive window starts
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(tokenizer.decode(tokens[start:start + max_tokens]))
        # This window already reaches the last token; another one would only
        # repeat the overlap region.
        if start + max_tokens >= len(tokens):
            break
    return chunks

In [15]:
def get_embeddings(texts, batch_size=32):
    """
    Embed a list of texts batch by batch with the shared ``embeddings_model``.

    Args:
        texts (list of str): Texts to embed, in the order results are wanted.
        batch_size (int): How many texts are sent per ``embed_documents`` call.

    Returns:
        list: One embedding vector per input text, in input order.
    """
    batch_starts = range(0, len(texts), batch_size)
    vectors = []

    # One embed_documents() call per slice; tqdm reports progress per batch.
    for offset in tqdm(batch_starts, desc="Generating Embeddings"):
        vectors += embeddings_model.embed_documents(texts[offset:offset + batch_size])

    return vectors

---
Embedding per section:


In [16]:
# Load the processed-sections CSV; the following cells read its
# Processed_Content column.
# NOTE(review): absolute, user-specific path — this only resolves on the
# machine it was written on; consider a path relative to a configurable data dir.
doc = pd.read_csv('/Users/npatel237/LawChatBot/Processed_CSV_Data/Title18_processed_sections.csv', encoding='utf-8')

In [17]:
# Report the longest Processed_Content entry (measured in characters via len)
# and the number of rows loaded.
processed_content = doc['Processed_Content']
max_length = processed_content.apply(len).max()
print("Maximum text length in Processed_Content column:", max_length)
print("Total Rows:", doc.shape[0])

Maximum text length in Processed_Content column: 39390
Total Rows: 1647


In [18]:
# Chunking is currently disabled (the line below is commented out), so every
# Processed_Content value is still a plain string rather than a list of chunks.
# NOTE(review): the disabled guard compares len(x) — a character count —
# against 4096, while chunk_text's max_tokens counts tokens; reconcile the
# units before re-enabling it.
#doc["Processed_Content"] = doc["Processed_Content"].apply(lambda x: chunk_text(x) if len(x) > 4096 else [x])
# With no list-valued cells, explode() adds no rows (1647 rows loaded, 1647
# afterwards); presumably it is kept so the cell still works once chunking is
# switched back on.
# NOTE(review): the longest entry is 39390 characters and is embedded as one
# input — confirm it fits the embedding model's input limit and is not truncated.
df_exploded = doc.explode("Processed_Content").reset_index(drop=True)

In [19]:
# Embed every row's text and attach the vectors as a new "Embedding" column,
# one vector per row in row order.
section_texts = df_exploded["Processed_Content"].tolist()
df_exploded["Embedding"] = get_embeddings(section_texts)

Generating Embeddings: 100%|██████████| 52/52 [00:23<00:00,  2.19it/s]


In [20]:
# Sanity check: row count after the explode/embedding steps. The recorded
# output (1647) equals the number of rows loaded from the CSV.
len(df_exploded)

1647

In [21]:
# Persist the rows together with their "Embedding" column as Parquet (pyarrow
# engine). The file name is relative, so it is written to the kernel's current
# working directory.
df_exploded.to_parquet("embeddings_gemini_text-005.parquet", engine="pyarrow")

---
Embedding per Chapter and Pages:


In [19]:
# Load the page-level chunk file. This rebinds `doc`, replacing the
# per-section frame loaded earlier in the notebook.
# NOTE(review): the preview shows an "Unnamed: 0" column next to "chunk" —
# looks like a saved index; confirm whether it should be read with index_col=0.
doc = pd.read_csv("../Title18_CSV_Data/chunked_title_18semchunk_pages.csv", encoding="utf-8")
doc.head()

Unnamed: 0,chunk
0,6001\nImmunity of Witnesses\nV.\n5001\nCorrect...
1,"37\n756, 3058\n38\nT. 22 §465\n39\n5, 3241\n51..."
2,"79\n1003\n80\n287, 1001\n81\n289\n82\n641, 136..."
3,123\n912\n124\n211\n125\n543\n126\n541\n127\n1...
4,199\n205\n200\n204\n201\n1913\n202\n216\n203\n...


In [20]:
# Cast every chunk to str so the embedding call only ever receives strings.
# NOTE(review): astype(str) also turns missing values into the literal text
# "nan", which would then be embedded — confirm the column has no NaNs.
doc['chunk'] = doc['chunk'].astype(str)

In [21]:
# Each "chunk" cell is presumably a plain string at this point (the column is
# cast with astype(str) in the previous cell), in which case explode() leaves
# the rows unchanged. This rebinds `df_exploded` from the per-section run.
df_exploded = doc.explode("chunk").reset_index(drop=True)

# Embed every chunk and attach the vectors as a new "Embedding" column.
page_texts = df_exploded["chunk"].tolist()
df_exploded["Embedding"] = get_embeddings(page_texts)

Generating Embeddings: 100%|██████████| 68/68 [00:23<00:00,  2.88it/s]


In [22]:
# Persist the page-level chunks together with their "Embedding" column as
# Parquet (pyarrow engine); the relative file name is written to the kernel's
# current working directory.
df_exploded.to_parquet("embeddings_gemini_text-005_pages.parquet", engine="pyarrow")