In [2]:
from libraries import *

In [None]:
df = pd.read_csv(r'Chapter1_General_Provisions.csv', encoding='latin-1')

In [2]:
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased") # nlpaueb/legal-bert-base-uncased   |  
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Step 3: Function to get embeddings
def get_embedding(text, tokenizer, model):
    # Tokenize and encode the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    
    # Compute embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the mean of the token embeddings (for sentence embedding)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Step 4: Encode the entire 'Content' column
embeddings = np.array([get_embedding(text, tokenizer, model).cpu().numpy() for text in df['Content']]).astype('float32')

# Step 5: Initialize a FAISS index with L2 (Euclidean) distance
dimension = embeddings.shape[1]  # Get the dimensionality of the embeddings
index = faiss.IndexFlatL2(dimension)  # IndexFlatL2 is a flat index (no compression)

# Step 6: Add the vectors to the index
index.add(embeddings)

# Optional: Verify the number of vectors in the index
print(f"Number of vectors in the index: {index.ntotal}")

# Step 7: Prepare a test query

Number of vectors in the index: 26


In [4]:
embeddings

array([[-0.28555125,  0.22837129,  0.10524645, ...,  0.02770011,
         0.09220038,  0.14357919],
       [-0.06945419,  0.29135713, -0.14676999, ...,  0.02431776,
         0.06939627, -0.00483767],
       [-0.17025226,  0.23414913, -0.02093459, ...,  0.01080881,
         0.07847007,  0.12491217],
       ...,
       [-0.22828811,  0.13438858, -0.02343843, ..., -0.2920958 ,
        -0.01720813,  0.06804806],
       [ 0.15994701,  0.3775736 ,  0.13411745, ..., -0.25126198,
        -0.02746024,  0.04885098],
       [-0.01835807,  0.21772102,  0.05754883, ..., -0.12211549,
         0.08892547,  0.06340703]], dtype=float32)

In [18]:
test = ''' 
 for which the maximum fine is no greater than the amount set forth
for such an offense in section 3571(b)(6) or (7) in the case of an individual or section 3571(c)(6) or
(7) in the case of an organization.
'''

# Step 8: Encode the test query
test_embedding = get_embedding(test, tokenizer, model).reshape(1, -1).cpu().numpy().astype('float32')

# Step 9: Example: Search for the nearest neighbors
D, I = index.search(test_embedding, k=2)  # Search for 2 nearest neighbors

# Step 10: Print the distances and indices of the nearest neighbors
print("Distances:", D)
print("Indices:", I)

# Optionally print the neighbor texts
for idx in I[0]:
    print("Neighbor text:", df['Content'].iloc[idx])

Distances: [[10.9441185 16.679928 ]]
Indices: [[18 23]]
Neighbor text: As used in this title, the term "petty offense" means a Class B misdemeanor, a Class C
misdemeanor, or an infraction, for which the maximum fine is no greater than the amount set forth
for such an offense in section 3571(b)(6) or (7) in the case of an individual or section 3571(c)(6) or
(7) in the case of an organization.
(Added Pub. L. 100185, §4(a), Dec. 11, 1987, 101 Stat. 1279; amended Pub. L. 100690, title VII,
§7089(a), Nov. 18, 1988, 102 Stat. 4409.)
EDITORIAL NOTES
AMENDMENTS
1988Pub. L. 100690 inserted ", for which the maximum fine is no greater than the amount set forth for
such an offense in section 3571(b)(6) or (7) in the case of an individual or section 3571(c)(6) or (7) in the case
of an organization" after "infraction".
Neighbor text: (a) DEFINITIONS.In this section, the following definitions shall apply:
(1) CRIME OF VIOLENCE.The term "crime of violence" has the meaning set forth in
section 16

In [5]:
# Step 1: Load CSV and prepare embedding model
df = pd.read_csv(r'Chapter1_General_Provisions.csv', encoding='latin-1')

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to get embeddings
def get_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Encode the entire 'Content' column
embeddings = np.array([get_embedding(text, tokenizer, model).cpu().numpy() for text in df['Content']]).astype('float32')

# Step 2: Connect to Qdrant Cloud using your API key and cluster URL
client = QdrantClient(
    url="https://cecd63ca-9699-4bc0-a60f-3c7e8c768fd3.europe-west3-0.gcp.cloud.qdrant.io:6333",  # Replace with your cloud cluster URL
    api_key="jwv0ETSb8751Q1yrQ3RVSsChFudsv1M63mEvxXXRnLf_ROoeAfe_Wg"  # Replace with your API key
)

# Step 3: Create a collection in your Qdrant cloud instance
collection_name = "LawEmbedding"  # The name of your collection in the cloud
dimension = embeddings.shape[1]  # Get the dimensionality of embeddings

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=dimension, distance=Distance.COSINE)  # Use Distance.COSINE for cosine similarity
)

# Step 4: Insert embeddings into Qdrant
points = [
    {
        "id": idx,
        "vector": embedding.tolist(),  # Qdrant accepts lists, not numpy arrays
        "payload": {"text": df['Content'][idx]}  # Optionally store the corresponding text with each embedding
    }
    for idx, embedding in enumerate(embeddings)
]

client.upsert(
    collection_name=collection_name,
    points=points
)

# Optional: Verify number of points in the collection
info = client.get_collection(collection_name)
print(f"Number of vectors in the collection: {info.points_count}")


  client.recreate_collection(


Number of vectors in the collection: 26


In [6]:
df

Unnamed: 0,Sections,Content
0,1. Repealed.,ENATE REVISION AMENDMENT\r\nIn the analysis of...
1,2. Principals.,(a) Whoever commits an offense against the Uni...
2,3. Accessory after the fact.,"Whoever, knowing that an offense against the U..."
3,4. Misprision of felony.,"Whoever, having knowledge of the actual commis..."
4,5. United States defined.,"The term ""United States"", as used in this titl..."
5,6. Department and agency defined.,"As used in this title:\r\nThe term ""department..."
6,7. Special maritime and territorial jurisdicti...,"The term ""special maritime and territorial jur..."
7,8. Obligation or other security of the United ...,"The term ""obligation or other security of the ..."
8,9. Vessel of the United States defined.,"The term ""vessel of the United States"", as use..."
9,10. Interstate commerce and foreign commerce d...,"The term ""interstate commerce"", as used in thi..."


In [7]:
if client.get_collection(collection_name):
    client.delete_collection(collection_name)  # This will delete the collection if it exists
    print(f"Collection '{collection_name}' deleted.")

Collection 'LawEmbedding' deleted.


In [8]:
dimension = embeddings.shape[1]  # Get the dimensionality of the embeddings

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=dimension, distance=Distance.COSINE)  # Use COSINE distance for similarity
)

# Step 5: Insert embeddings row by row into Qdrant
points = [
    {
        "id": idx,  # The row index will be used as the unique ID
        "vector": embedding.tolist(),  # Convert numpy array to list for Qdrant
        "payload": {
            "title": df['Sections'][idx],  # Store the title from the CSV row
            "text": df['Content'][idx]  # Store the content (text) from the CSV row
        }
    }
    for idx, embedding in enumerate(embeddings)
]

client.upsert(
    collection_name=collection_name,
    points=points
)

# Optional: Verify the number of vectors in the collection
info = client.get_collection(collection_name)
print(f"Number of vectors in the collection: {info.points_count}")

  client.recreate_collection(


Number of vectors in the collection: 26


In [19]:
test_embedding = get_embedding(test, tokenizer, model).cpu().numpy().astype('float32')
test_embedding_flat = test_embedding.flatten().tolist()  # Flatten to 1D list

search_result = client.search(
    collection_name=collection_name,
    query_vector=test_embedding_flat,  # Use 'query_vector' parameter directly
    limit=2  # Number of nearest neighbors you want to retrieve
)

# Step 5: Print the distances and indices of the nearest neighbors
for result in search_result:
    print(f"Distance: {result.score}")  # Score represents similarity (lower is better)
    print(f"Matched Title: {result.payload['title']}")
    print(f"Matched Text: {result.payload['text']}")

Distance: 0.9424052
Matched Title: 19. Petty offense defined.
Matched Text: As used in this title, the term "petty offense" means a Class B misdemeanor, a Class C
misdemeanor, or an infraction, for which the maximum fine is no greater than the amount set forth
for such an offense in section 3571(b)(6) or (7) in the case of an individual or section 3571(c)(6) or
(7) in the case of an organization.
(Added Pub. L. 100185, §4(a), Dec. 11, 1987, 101 Stat. 1279; amended Pub. L. 100690, title VII,
§7089(a), Nov. 18, 1988, 102 Stat. 4409.)
EDITORIAL NOTES
AMENDMENTS
1988Pub. L. 100690 inserted ", for which the maximum fine is no greater than the amount set forth for
such an offense in section 3571(b)(6) or (7) in the case of an individual or section 3571(c)(6) or (7) in the case
of an organization" after "infraction".
Distance: 0.9103349
Matched Title: 25. Use of minors in crimes of violence.
Matched Text: (a) DEFINITIONS.In this section, the following definitions shall apply:
(1) CRIME O