In [114]:
import langchain
import pandas as pd
import vertexai
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_core.documents import Document
import os
import numpy as np
import tiktoken
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity


In [115]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"LawChatBot\gcpservicekey.json"
PROJECT_ID = "lawrag"
LOCATION = "us-central1"
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [116]:
embeddings_model = VertexAIEmbeddings(model_name="text-embedding-005")

In [117]:
doc = pd.read_parquet("embeddings_gemini_text-005.parquet", engine="pyarrow")

In [135]:
sample = """
  Moneys received or tendered in evidence in any United States Court, or before any officer thereof, which have been paid to or received by any official as a bribe, shall, after the final disposition
  
"""

testing= sample

In [132]:
def chunk_text(text, max_tokens=1024, overlap=128):
    tokenizer = tiktoken.get_encoding("cl100k_base")  # Same tokenizer as text-embedding-005
    tokens = tokenizer.encode(text)

    chunks = []
    start = 0
    while start < len(tokens):
        chunk = tokens[start:start + max_tokens]
        chunks.append(tokenizer.decode(chunk))
        start += max_tokens - overlap  # Sliding window
    return chunks

sample = chunk_text(sample)

In [133]:
def get_embeddings(texts, batch_size=32):
    """
    Compute embeddings in batches using VertexAIEmbeddings in LangChain.
    Args:
        texts (list of str): List of text data to embed.
        batch_size (int): Number of texts to process per batch.

    Returns:
        list: List of embedding vectors.
    """
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch = texts[i:i + batch_size]  # Get batch of texts
        batch_embeddings = embeddings_model.embed_documents(batch)  # Generate embeddings
        embeddings.extend(batch_embeddings)  # Store results

    return embeddings  # Returning list of lists (each embedding is a list of floats)

# Store embeddings as a list of lists in DataFrame

query_embedding = get_embeddings(sample)

Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.96it/s]


In [134]:
query_embedding = np.array(query_embedding).reshape(1, -1)


# Convert the embeddings from a Pandas Series to a NumPy array
embeddings = np.vstack(doc['Embedding'].values)  # Stack into a 2D array

print(type(embeddings))  # Should print <class 'numpy.ndarray'>

# Compute cosine similarity
similarities = cosine_similarity(query_embedding, embeddings)
print(len(similarities[0]))

# Get the index of the most similar document
most_similar_index = np.argmax(similarities)

print("Most similar document index:", most_similar_index)

<class 'numpy.ndarray'>
1663
Most similar document index: 678


In [137]:
similarities = cosine_similarity(query_embedding, embeddings)

# Get the indices of the top 5 most similar documents (sorted in descending order)
top_5_indices = np.argsort(similarities[0])[-5:][::-1]

print("Top 5 most similar document indices:", top_5_indices)

Top 5 most similar document indices: [678 248 231 868 991]


In [136]:
type(top_5_indices)

numpy.ndarray

In [130]:
doc['Processed_Content'][678]

'Moneys received or tendered in evidence in any United States Court, or before any officer thereof, which have been paid to or received by any official as a bribe, shall, after the final disposition of the case, proceeding or investigation, be deposited in the registry of the court to be disposed of in accordance with the order of the court, to be subject, however, to the provisions of section 2042 of Title 28.\n(June 25, 1948, ch. 645, 62 Stat. 840; May 24, 1949, ch. 139, §55, 63 Stat. 96; renumbered §3666, Pub. L. 98–473, title II, §212(a)(1), Oct. 12, 1984, 98 Stat. 1987.)\n\nHistorical and Revision Notes\n1948 Act\nBased on title 18, U.S.C., 1940 ed., §570 (Jan. 7, 1925, ch. 33, 43 Stat. 726).\nChanges were made in phraseology.\n\n1949 Act\nThis section [section 55] corrects section 3612 of title 18, U.S.C., so that the reference in such section will be to the correct section number in title 28, U.S.C., as revised and enacted in 1948.\n\nEditorial Notes\nAmendments\n1949—Act May 24

In [125]:
testing

'\n  Training Requirement.—\n\n(1) In general.—In order for an officer or employee of the Bureau of Prisons, including a correctional officer, to be eligible to receive and carry oleoresin capsicum spray pursuant to this section, the officer or employee\n  \n'

In [139]:
for x in top_5_indices.tolist():
    if str(sample[0]) in str(doc['Processed_Content'][x]):
        print("MATCH FOUND")
    else:
        print("No match found")
#doc['Processed_Content'][most_similar_index]

MATCH FOUND
MATCH FOUND
MATCH FOUND
MATCH FOUND
MATCH FOUND
