In [41]:
import langchain
import pandas as pd
import vertexai
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_core.documents import Document
import os
import numpy as np
import tiktoken
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity


In [36]:
# No key file is configured here, so authentication presumably falls back to
# Application Default Credentials — TODO confirm. To authenticate with a
# service-account key file instead, uncomment the next line.
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcpservicekey.json"
PROJECT_ID = "lawrag"  # Google Cloud project passed to vertexai.init
LOCATION = "us-central1"  # Region passed to vertexai.init
# Initialise the Vertex AI SDK with the project and region defined above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [37]:
# Embedding client used by get_embeddings. NOTE(review): the model name should
# match the model that produced the stored document embeddings — confirm.
embeddings_model = VertexAIEmbeddings(model_name="text-embedding-005")

In [48]:
# Load the document table; later cells read its 'Embedding' and
# 'Processed_Content' columns.
doc = pd.read_parquet("embeddings_gemini_text-005.parquet", engine="pyarrow")

In [57]:
# Query text that is chunked, embedded, and compared against the stored document embeddings.
sample = "After reviewing the individual's financial disclosure report submitted in accordance with chapter 131 of title 5, the official in charge of the employee's appointment certifies in writing that the need for the individual's services outweighs the possibility of a conflict of interest brought about by the financial interest involved."

In [58]:
def chunk_text(text, max_tokens=1024, overlap=128):
    """
    Split ``text`` into overlapping chunks of at most ``max_tokens`` tokens.

    Consecutive chunks start ``max_tokens - overlap`` tokens apart, so each
    chunk repeats the last ``overlap`` tokens of the previous one.

    Args:
        text (str): Text to split.
        max_tokens (int): Maximum number of tokens per chunk; must be positive.
        overlap (int): Number of tokens shared by consecutive chunks; must
            satisfy ``0 <= overlap < max_tokens``.

    Returns:
        list of str: Decoded chunks in document order. Empty if ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_tokens`` (the window could
            never advance, or would skip tokens).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    # NOTE(review): cl100k_base is an OpenAI encoding; it is unlikely to match
    # the embedding model's own tokenizer exactly, so treat max_tokens as an
    # approximate budget — confirm against the model's input limit.
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    stride = max_tokens - overlap  # Distance between consecutive window starts
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokenizer.decode(tokens[start:start + max_tokens]))
        # Stop once this window reaches the end of the text; a further window
        # would contain only tokens already present in this chunk.
        if start + max_tokens >= len(tokens):
            break
    return chunks

# `sample` is rebound from the raw string to its list of chunks. Only chunk
# while it is still a string, so re-running this cell does not feed the
# already-chunked list back into chunk_text.
if isinstance(sample, str):
    sample = chunk_text(sample)

In [59]:
def get_embeddings(texts, batch_size=32):
    """
    Compute embeddings in batches using the module-level ``embeddings_model``.

    Args:
        texts (str or iterable of str): Text data to embed. A single string is
            treated as one text rather than being sliced into substrings.
        batch_size (int): Number of texts to send per ``embed_documents`` call;
            must be at least 1.

    Returns:
        list: One embedding per input text, in input order, as returned by
        ``embeddings_model.embed_documents``. Empty if ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Slicing a bare string would yield substrings, not a list of texts, so
    # wrap it; materialise other iterables so they can be sliced by position.
    texts = [texts] if isinstance(texts, str) else list(texts)

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch = texts[i:i + batch_size]  # Texts for this request
        embeddings.extend(embeddings_model.embed_documents(batch))

    return embeddings

# Embed the chunked query text; get_embeddings returns one embedding per chunk.

query_embedding = get_embeddings(sample)

Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]


In [60]:
# Keep one row per query chunk, shape (n_chunks, dim). Flattening everything
# into a single row is only valid when the query produced exactly one chunk;
# with several chunks it would create an over-long vector that cannot be
# compared with the document embeddings.
query_embedding = np.atleast_2d(np.asarray(query_embedding, dtype=float))

# Convert the embeddings from a Pandas Series to a NumPy array
embeddings = np.vstack(doc['Embedding'].values)  # Stack into a 2D array

print(type(embeddings))  # Should print <class 'numpy.ndarray'>

assert query_embedding.shape[1] == embeddings.shape[1], (
    "query and document embeddings have different dimensions: "
    f"{query_embedding.shape[1]} vs {embeddings.shape[1]}"
)

# Compute cosine similarity: one row per query chunk, one column per document
similarities = cosine_similarity(query_embedding, embeddings)

# Best score per document across all query chunks, then the position of the
# top-scoring document (a row position in `doc`, not an index label).
most_similar_index = int(np.max(similarities, axis=0).argmax())

print("Most similar document index:", most_similar_index)

<class 'numpy.ndarray'>
Most similar document index: 246


In [61]:
# Look up the best match by row position: most_similar_index is a position in
# the stacked embedding matrix, so use .iloc rather than a hard-coded label.
doc['Processed_Content'].iloc[most_similar_index]

"(a) Except as permitted by subsection (b), whoever, being an officer or employee of the executive branch of the United States Government, or of any independent agency of the United States, a Federal Reserve bank director, officer, or employee, or an officer or employee of the District of Columbia, including a special Government employee, participates personally and substantially as a Government officer or employee, through decision, approval, disapproval, recommendation, the rendering of advice, investigation, or otherwise, in a judicial or other proceeding, application, request for a ruling or other determination, contract, claim, controversy, charge, accusation, arrest, or other particular matter in which, to his knowledge, he, his spouse, minor child, general partner, organization in which he is serving as officer, director, trustee, general partner or employee, or any person or organization with whom he is negotiating or has any arrangement concerning prospective employment, has a