In [1]:
import langchain
import pandas as pd
import vertexai
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_core.documents import Document
import os
import numpy as np
import tiktoken
from tqdm import tqdm

In [2]:
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcpservicekey.json"
PROJECT_ID = "lawrag"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [3]:
doc = pd.read_csv('../Title18_CSV_Data/Title18_processed_sections.csv', encoding='utf-8')

In [4]:
processed_content = doc['Processed_Content']
max_length = doc['Processed_Content'].apply(len).max()
print("Maximum text length in Processed_Content column:", max_length)
print("Total Rows:", len(doc))

Maximum text length in Processed_Content column: 39390
Total Rows: 1647


In [5]:
embeddings_model = VertexAIEmbeddings(model_name="text-embedding-005")

In [6]:
def chunk_text(text, max_tokens=1024, overlap=128):
    tokenizer = tiktoken.get_encoding("cl100k_base")  # Same tokenizer as text-embedding-005
    tokens = tokenizer.encode(text)

    chunks = []
    start = 0
    while start < len(tokens):
        chunk = tokens[start:start + max_tokens]
        chunks.append(tokenizer.decode(chunk))
        start += max_tokens - overlap  # Sliding window
    return chunks

doc["Processed_Content"] = doc["Processed_Content"].apply(lambda x: chunk_text(x) if len(x) > 32000 else [x])
df_exploded = doc.explode("Processed_Content").reset_index(drop=True)



In [7]:
def get_embeddings(texts, batch_size=32):
    """
    Compute embeddings in batches using VertexAIEmbeddings in LangChain.
    Args:
        texts (list of str): List of text data to embed.
        batch_size (int): Number of texts to process per batch.

    Returns:
        list: List of embedding vectors.
    """
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch = texts[i:i + batch_size]  # Get batch of texts
        batch_embeddings = embeddings_model.embed_documents(batch)  # Generate embeddings
        embeddings.extend(batch_embeddings)  # Store results

    return embeddings  # Returning list of lists (each embedding is a list of floats)

# Store embeddings as a list of lists in DataFrame
df_exploded["Embedding"] = get_embeddings(df_exploded["Processed_Content"].tolist())

Generating Embeddings: 100%|██████████| 52/52 [00:16<00:00,  3.25it/s]


In [8]:
df_exploded.to_parquet("embeddings_gemini_text-005.parquet", engine="pyarrow")