In [1]:
import langchain
import pandas as pd
import vertexai
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_core.documents import Document
import os
import numpy as np
import tiktoken
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity


In [6]:
# Point the Google client libraries at the service-account key file.
# The path is assembled with os.path.join so it resolves on POSIX as well as
# Windows instead of relying on a hard-coded backslash separator.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(os.pardir, "gcpservicekey.json")
PROJECT_ID = "lawrag"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [7]:
# Vertex AI embedding model used to embed the query text.
EMBEDDING_MODEL_NAME = "text-embedding-005"
embeddings_model = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [10]:
# Load the precomputed document embeddings. Later cells read the 'Embedding'
# and 'Processed_Content' columns of this frame.
# The path is built with os.path.join so it is not tied to the Windows
# backslash separator.
doc = pd.read_parquet(
    os.path.join(os.pardir, "New_Embeddings_2025", "embeddings_gemini_text-005.parquet"),
    engine="pyarrow",
)

In [33]:
# Query passage to search for in the embedded corpus. The leading/trailing
# newlines and indentation are part of the string.
sample = """
  will be placed in the Child Pornography Victims Reserve, which was created in accordance with section 1402.

"""

# Keep a reference to the raw query string; `sample` is rebound to a list of
# chunks in a later cell.
testing= sample

In [34]:
def chunk_text(text, max_tokens=1024, overlap=128, tokenizer=None):
    """
    Split text into overlapping chunks of at most ``max_tokens`` tokens.

    Args:
        text (str): Text to split.
        max_tokens (int): Maximum number of tokens per chunk; must be positive.
        overlap (int): Number of tokens shared by consecutive chunks; must
            satisfy ``0 <= overlap < max_tokens``.
        tokenizer: Optional object exposing ``encode(text)`` and
            ``decode(tokens)``. Defaults to tiktoken's ``cl100k_base``.

    Returns:
        list of str: Decoded chunks in document order. Empty when the text
        produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        # overlap >= max_tokens would make the window step <= 0, so the
        # sliding window would never advance past the start of the text.
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    if tokenizer is None:
        # cl100k_base is an OpenAI encoding, not the Vertex AI model's own
        # tokenizer, so token counts here only approximate what the embedding
        # model sees. Pass `tokenizer` to use a different one.
        tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)

    step = max_tokens - overlap  # Sliding-window stride
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(tokenizer.decode(tokens[start:start + max_tokens]))
        if start + max_tokens >= len(tokens):
            # This window already reached the end of the text; a further
            # window would only repeat tokens that are covered here.
            break
    return chunks

# Chunk from `testing` (the untouched raw query string) rather than from
# `sample` itself, so re-running this cell does not feed the previous chunk
# list back into the tokenizer.
sample = chunk_text(testing)

In [35]:
def get_embeddings(texts, batch_size=32):
    """
    Compute embeddings in batches using VertexAIEmbeddings in LangChain.

    Args:
        texts (list of str): Text data to embed. A single string is treated
            as a one-element list instead of being sliced into substrings.
        batch_size (int): Number of texts to process per batch; must be a
            positive integer.

    Returns:
        list: One embedding vector per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would yield an empty range and silently return [].
        raise ValueError("batch_size must be a positive integer")

    # Normalise the input so slicing below always produces lists of texts.
    texts = [texts] if isinstance(texts, str) else list(texts)

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch = texts[i:i + batch_size]  # Get batch of texts
        # embeddings_model is the module-level VertexAIEmbeddings instance
        embeddings.extend(embeddings_model.embed_documents(batch))

    return embeddings

# Embed every chunk of the query text; the result holds one vector per chunk.

query_embedding = get_embeddings(texts=sample)

Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.89it/s]


In [36]:
# Collapse the per-chunk query embeddings into a single (1, dim) row by
# averaging them. A plain reshape(1, -1) would concatenate the vectors of a
# multi-chunk query, and the width would no longer match the document matrix.
# For a single chunk the result is that chunk's vector unchanged.
query_embedding = np.atleast_2d(np.asarray(query_embedding, dtype=float)).mean(axis=0, keepdims=True)


# Stack the per-document embeddings from the Pandas Series into one 2D array
embeddings = np.vstack(doc['Embedding'].values)

print(type(embeddings))  # Should print <class 'numpy.ndarray'>

# Compute cosine similarity; one row for the query, one column per document
similarities = cosine_similarity(query_embedding, embeddings)
print(len(similarities[0]))

# Position of the most similar document. argmax is taken over the query row
# explicitly rather than over the flattened similarity matrix.
most_similar_index = np.argmax(similarities[0])

print("Most similar document index:", most_similar_index)

<class 'numpy.ndarray'>
1663
Most similar document index: 96


In [38]:
similarities = cosine_similarity(query_embedding, embeddings)

# argsort orders positions from least to most similar; keep the last five and
# reverse them so the best match comes first.
ranked_ascending = np.argsort(similarities[0])
top_5_indices = ranked_ascending[-5:][::-1]

print("Top 5 most similar document indices:", top_5_indices)

Top 5 most similar document indices: [96 94 79 75 98]


In [136]:
# Inspect the container type of the top-5 result.
type(top_5_indices)

numpy.ndarray

In [26]:
# Show the text of the best-matching document. The row is taken from
# most_similar_index instead of a hard-coded number so it follows the current
# query, and .iloc is used because np.argmax yields a position, not a label.
doc['Processed_Content'].iloc[most_similar_index]

'(a) Deposits Into the Reserve.-Notwithstanding any other provision of law, there shall be deposited into the Child Pornography Victims Reserve established under section 1402(d)(6) of the Victims of Crime Act of 1984 (34 U.S.C. 20101(d)) all assessments collected under section 2259A and any gifts, bequests, or donations to the Child Pornography Victims Reserve from private entities or individuals.\n\n(b) Availability for Defined Monetary Assistance.-Amounts in the Child Pornography Victims Reserve shall be available for payment of defined monetary assistance pursuant to section 2259(d). If at any time the Child Pornography Victims Reserve has insufficient funds to make all of the payments ordered under section 2259(d), the Child Pornography Victims Reserve shall make such payments as it can satisfy in full from available funds. In determining the order in which such payments shall be made, the Child Pornography Victims Reserve shall make payments based on the date they were ordered, wi

In [125]:
# Display the raw query string kept in `testing`.
testing

'\n  Training Requirement.—\n\n(1) In general.—In order for an officer or employee of the Bureau of Prisons, including a correctional officer, to be eligible to receive and carry oleoresin capsicum spray pursuant to this section, the officer or employee\n  \n'

In [139]:
# Check whether the query text occurs verbatim in each of the top-5 documents.
# Only the first chunk is compared. Its surrounding whitespace is stripped so
# the result does not depend on the newlines/indentation around the query.
query_text = str(sample[0]).strip()
for x in top_5_indices.tolist():
    # top_5_indices holds positions from np.argsort, so rows are read with .iloc
    if query_text in str(doc['Processed_Content'].iloc[x]):
        print("MATCH FOUND")
    else:
        print("No match found")

MATCH FOUND
MATCH FOUND
MATCH FOUND
MATCH FOUND
MATCH FOUND
