In [None]:
"""
In Retrieval-Augmented Generation (RAG), the term "agentic" refers to the ability of the retrieval-augmented model to actively control, adapt, and refine its retrieval and generation process rather than passively relying on a fixed retrieval mechanism. An agentic RAG system does more than simply fetch and generate responses; it dynamically decides what to retrieve, how to process retrieved information, and whether to refine queries to improve results.

Key Aspects of Agentic RAG:
Iterative Retrieval – The system can refine its search queries based on the initial retrieved results to get more relevant information.
Self-Correction – The model can assess the quality of retrieved documents and decide to re-query or adjust its approach.
Context-Aware Retrieval – Instead of retrieving a fixed set of documents, it considers the conversation flow or prior responses to optimize search.
Adaptive Chunking – The system intelligently chunks retrieved content into more meaningful sections based on the query.
Multi-Hop Reasoning – It can retrieve and combine multiple pieces of information from different sources to construct a more coherent response.
This approach makes RAG systems more autonomous, efficient, and contextually aware, enhancing their ability to provide accurate, well-informed, and dynamically updated responses.
"""

In [None]:
!pip install -U voyageai

In [1]:
import numpy as np
import faiss
from sklearn.metrics.pairwise import cosine_similarity

In [196]:
import pandas as pd

# Load the two preprocessing passes of the Title 18 data. `df` is the frame the
# later cells embed and query; `df2` is only compared against it below.
df = pd.read_csv('Title18_processed.csv', encoding='utf-8')
df2 = pd.read_csv('Title18_reprocessed.csv', encoding='utf-8')

# Sanity check: size of the main frame, then the schema of each file.
print(df.shape)
for frame in (df, df2):
    print(frame.columns)

(1647, 5)
Index(['Section', 'Url', 'Content', 'Metadata', 'Processed_Content'], dtype='object')
Index(['Section', 'Url', 'Content', 'Metadata'], dtype='object')


In [225]:
import random

# Spot-check that both frames hold the same section at the same row position.
# randrange() excludes its upper bound, so the pick is always a valid position
# (0 .. n_rows - 1). The previous randint(1, 1647) could return 1647, which is
# out of range for a 1647-row frame, and could never return row 0.
n_rows = min(len(df), len(df2))
random_number = random.randrange(n_rows)
print(random_number)
# .iloc makes the lookup explicitly positional, independent of the index labels.
print(df['Section'].iloc[random_number])
print("-------------------------------------------")
print(df2['Section'].iloc[random_number])

1553
3. Protective orders
-------------------------------------------
3. Protective orders



In [19]:
import voyageai
import os
import dotenv

# Pull the settings stored in the local .env file into the process environment.
# NOTE(review): presumably the Voyage client picks up its API key from there - confirm.
dotenv.load_dotenv()

vo = voyageai.Client()

# Smoke test: embed one short string with the voyage-3 model and show the raw response.
probe_texts = ["hello world"]
result = vo.embed(probe_texts, model="voyage-3")
print(result)


RateLimitError: You have not yet added your payment method in the billing page and will have reduced rate limits of 3 RPM and 10K TPM.  Please add your payment method in the billing page (https://dash.voyageai.com/billing/payment-methods) to unlock our standard rate limits (https://docs.voyageai.com/docs/rate-limits).  Even with payment methods entered, the free tokens (200M tokens for Voyage series 3) will still apply. See our pricing docs (https://docs.voyageai.com/docs/pricing) for the free tokens for your model.

In [20]:
from sentence_transformers import SentenceTransformer

# Load the Pile-of-Law LegalBERT checkpoint through sentence-transformers.
# NOTE(review): `model` is rebound to a plain BertModel in the next code cell,
# so this instance does not appear to be used afterwards - confirm before relying on it.
checkpoint_name = 'pile-of-law/legalbert-large-1.7M-2'
model = SentenceTransformer(checkpoint_name)

No sentence-transformers model found with name pile-of-law/legalbert-large-1.7M-2. Creating a new one with MEAN pooling.


config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/238k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

In [25]:
import torch
from transformers import BertTokenizer, BertModel

# Load the LegalBERT tokenizer and encoder; get_embedding() below relies on
# these two module-level names.
tokenizer = BertTokenizer.from_pretrained('pile-of-law/legalbert-large-1.7M-2')
model = BertModel.from_pretrained('pile-of-law/legalbert-large-1.7M-2')
model.eval()  # inference only: make sure dropout is disabled

# Smoke test: encode one sentence and print the per-token hidden states.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
# No gradients are needed for inference; without no_grad() the output carries
# an autograd graph (visible as grad_fn=... in the printed tensor).
with torch.no_grad():
    output = model(**encoded_input)
print(output.last_hidden_state)


tensor([[[-1.7574, -1.1169,  0.2101,  ..., -0.8277,  0.5690, -0.6633],
         [ 0.7162,  0.6218,  0.3411,  ...,  0.5368,  0.3713,  0.7100],
         [ 0.7752, -0.4015, -1.4968,  ..., -0.5067, -0.5794,  0.6381],
         ...,
         [ 0.5287,  0.1446, -1.0229,  ...,  0.3650, -0.4353, -1.8710],
         [-1.4724, -0.4680,  0.1470,  ...,  0.0874,  0.4158, -0.9055],
         [-2.0053, -0.9834,  0.0132,  ..., -0.8461,  0.2896, -0.4789]]],
       grad_fn=<NativeLayerNormBackward0>)


In [55]:
import pandas as pd
import torch
import faiss
import pickle
from transformers import BertTokenizer, BertModel


def get_embedding(text):
    """Embed `text` with the module-level LegalBERT `tokenizer` and `model`.

    The input is truncated to at most 512 tokens. The hidden state at token
    position 0 (the [CLS] token) is used as the embedding, and all size-1
    dimensions are squeezed away before converting to a NumPy array.

    Args:
        text: Text to embed.

    Returns:
        NumPy array holding the [CLS] hidden state; 1-D for a single text.
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        hidden_states = model(**tokens).last_hidden_state
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.squeeze().numpy()

# Convert all content into embeddings.
# np.stack joins the per-row vectors into one (n_rows, dim) matrix directly;
# the previous torch.tensor(list_of_ndarrays).numpy() round-trip is slow and
# triggers a PyTorch warning. FAISS needs a C-contiguous float32 matrix, so
# that layout is made explicit here.
embeddings = np.ascontiguousarray(
    np.stack([get_embedding(content) for content in df["Content"]]),
    dtype=np.float32,
)

# Initialize FAISS index
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)  # L2 (Euclidean) distance index
index.add(embeddings)  # Row i of the index corresponds to row position i of df

# Save FAISS index
faiss.write_index(index, "faiss_index.bin")

# Save the mapping from index row position to section title
section_mapping = dict(enumerate(df["Section"]))
with open("section_mapping.pkl", "wb") as f:
    pickle.dump(section_mapping, f)

print("FAISS index and section mapping saved successfully!")


FAISS index and section mapping saved successfully!


In [56]:
# Create a dictionary mapping each row position (= FAISS row id) to Section,
# Metadata, and Url. The columns are iterated positionally, so the keys line up
# with the order in which embeddings were added to the index even if df does
# not carry a default 0..n-1 index, and no per-row label lookups are needed.
data_mapping = {
    position: {"Section": section, "Metadata": metadata, "Url": url}
    for position, (section, metadata, url) in enumerate(
        zip(df["Section"], df["Metadata"], df["Url"])
    )
}

# Save mapping dictionary
with open("data_mapping.pkl", "wb") as f:
    pickle.dump(data_mapping, f)

print("FAISS index and data mapping saved successfully!")

FAISS index and data mapping saved successfully!


In [70]:
# Reload the persisted FAISS index and the row-position -> record mapping.
# pickle.load can execute arbitrary code: only open mapping files written by
# this notebook.
index = faiss.read_index("faiss_index.bin")
with open("data_mapping.pkl", "rb") as mapping_file:
    data_mapping = pickle.load(mapping_file)

# Function to search FAISS and retrieve Section, Metadata, and URL
def search_faiss(query, top_k=20):
    """Return the records of the `top_k` indexed sections closest to `query`.

    The query is embedded with get_embedding() and searched against the
    module-level FAISS `index`; each hit is resolved through `data_mapping`.

    Args:
        query: Text to search for.
        top_k: Maximum number of neighbours to return.

    Returns:
        List of dicts with keys "Section", "Metadata", "Url" and "Score", in
        the order FAISS returned them. "Score" is the distance reported by
        FAISS (the index built in this notebook is an IndexFlatL2, so smaller
        means closer). The list is shorter than `top_k` when the index holds
        fewer vectors.
    """
    query_embedding = get_embedding(query).reshape(1, -1)  # FAISS expects (n_queries, dim)
    distances, indices = index.search(query_embedding, top_k)  # Perform search

    results = []
    for distance, row_id in zip(distances[0], indices[0]):
        # FAISS pads the result with label -1 when fewer than top_k vectors
        # are available; those slots have no record and previously raised KeyError.
        if row_id < 0:
            continue
        record = data_mapping[row_id]
        results.append(
            {
                "Section": record["Section"],
                "Metadata": record["Metadata"],
                "Url": record["Url"],
                "Score": distance,
            }
        )
    return results

# Example query: the statutory text of the "accessory after the fact" provision,
# used to check which indexed sections come back as nearest neighbours.
query = ''' 
Whoever, knowing that an offense against the United States has been committed, receives,
relieves, comforts or assists the offender in order to hinder or prevent his apprehension, trial or
punishment, is an accessory after the fact.
Except as otherwise expressly provided by any Act of Congress, an accessory after the fact shall
be imprisoned not more than one-half the maximum term of imprisonment or (notwithstanding
section 3571) fined not more than one-half the maximum fine prescribed for the punishment of the
principal, or both; or if the principal is punishable by life imprisonment or death, the accessory shall
be imprisoned not more than 15 years.
(June 25, 1948, ch. 645, 62 Stat. 684; Pub. L. 99–646, §43, Nov. 10, 1986, 100 Stat. 3601; Pub. L.
101–647, title XXXV, §3502, Nov. 29, 1990, 104 Stat. 4921; Pub. L. 103–322, title XXXIII,
§§330011(h), 330016(2)(A), Sept. 13, 1994, 108 Stat. 2145, 2148.)
'''
results = search_faiss(query)

# Show every hit as a four-line record followed by a blank line.
for hit in results:
    print(
        f"Section: {hit['Section']}",
        f"Metadata: {hit['Metadata']}",
        f"URL: {hit['Url']}",
        f"Score: {hit['Score']}\n",
        sep="\n",
    )


Section: 1509. Obstruction of court orders
Metadata: Amendments1994-Pub. L. 103Ã¢ÂÂ322 substituted "fined under this title" for "fined not more than $1,000" in first par.
URL: https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title18-section1509&num=0&edition=prelim
Score: 18.605010986328125

Section: 479. Uttering counterfeit foreign obligations or securities
Metadata: Amendments2001-Pub. L. 107Ã¢ÂÂ56 substituted "20 years" for "three years".1994-Pub. L. 103Ã¢ÂÂ322 substituted "fined under this title" for "fined not more than $3,000".
URL: https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title18-section479&num=0&edition=prelim
Score: 19.616374969482422

Section: 478. Foreign obligations or securities
Metadata: Amendments2001-Pub. L. 107Ã¢ÂÂ56 substituted "20 years" for "five years".1994-Pub. L. 103Ã¢ÂÂ322 substituted "fined under this title" for "fined not more than $5,000".
URL: https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title18-sect

In [60]:
import os

import google.generativeai as genai

# Prefer a GOOGLE_API_KEY environment variable (e.g. supplied through .env) so
# the notebook also runs on machines that do not have the key file below.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Fallback: the original machine-specific key file.
    with open(r"C:/Users/mkolla1/OneDrive - Georgia State University/Desktop/CareerSwipe/fakekey.txt", "r") as f:
        # strip() drops the trailing newline most editors add; a key with
        # stray whitespace is not a valid credential.
        GOOGLE_API_KEY = f.read().strip()
genai.configure(api_key=GOOGLE_API_KEY)


In [74]:
def chatbot(question, top_k=20, verbose=True):
    """Answer a Title 18 question with Gemini, grounded on FAISS-retrieved sections.

    The question is passed to search_faiss(); the retrieved sections, together
    with their full text from `df`, are placed into the system instruction of
    a Gemini model, which is then asked the question.

    Args:
        question: Question text; used both for retrieval and as the user message.
        top_k: Number of sections to retrieve and include in the prompt.
        verbose: When True (the default, matching the previous behaviour),
            print the assembled reference material before calling the model.

    Returns:
        The text of the model's response.
    """
    # The model only sees the retrieved sections and has no web access, so the
    # prompt asks it to say when the answer is missing rather than to look it
    # up "online" (which contradicted the "only" rule and invites fabrication).
    system_prompt = """
    You are LegalBot, an AI assistant specializing in Title 18 of U.S. law. Your role is to provide accurate, well-structured, and concise legal explanations based **only** on the given reference materials.

    **Guidelines for Your Response:**
    - Answer the question strictly based on the provided sections.
    - If the relevant information is not found in the reference material, say so explicitly instead of guessing.
    - Keep explanations clear and professional, avoiding unnecessary opinions.
    - Provide citations by mentioning the **Section number** where applicable.

    **Reference Material:**
    {reference_material}

    Now, answer the following question based on the provided legal content.
    """

    # Search FAISS index for relevant sections
    results = search_faiss(question, top_k=top_k)

    # Retrieve the corresponding Content from df. Section titles alone are not
    # guaranteed to be unique, so match on (Section, Url) first and fall back
    # to the title-only lookup used previously.
    content_by_section_url = {
        (section, url): content
        for section, url, content in zip(df["Section"], df["Url"], df["Content"])
    }
    content_by_section = df.set_index("Section")["Content"].to_dict()

    # Format reference material as a readable string, one entry per retrieved section
    entries = []
    for res in results:
        content = content_by_section_url.get(
            (res["Section"], res["Url"]),
            content_by_section.get(res["Section"], ""),
        )
        entries.append(
            f"**{res['Section']}**\nMetadata: {res['Metadata']}\nURL: {res['Url']}\nContent: {content}"
        )
    reference_material = "\n\n".join(entries)

    if verbose:
        print(reference_material)

    # Generate the response with Gemini; named `llm` so it is not confused with
    # the module-level BERT `model` used for embeddings.
    llm = genai.GenerativeModel(
        'models/gemini-1.5-flash',
        system_instruction=system_prompt.format(reference_material=reference_material)
    )

    response = llm.generate_content(f"**Question:** {question}")

    return response.text


In [75]:
# End-to-end check: ask about crime victims' rights (18 U.S.C. § 3771) and show the answer.
test = "Under 18 U.S.C. § 3771, what rights do crime victims have in federal cases?\n"
testing = chatbot(test)
print(testing)

**3115. Inventory upon execution and return of search warrant-(Rule)**
Metadata: nan
URL: https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title18-section3115&num=0&edition=prelim
Content: See Federal Rules of Criminal ProcedureInventory of property seized under search warrant and copies to persons affected, Rule 41(d).(June 25, 1948, ch. 645, 62 Stat. 820.)

**3010. Exceptions unnecessary-(Rule)**
Metadata: nan
URL: https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title18-section3010&num=0&edition=prelim
Content: See Federal Rules of Criminal ProcedureObjections substituted for exceptions, rule 51.(June 25, 1948, ch. 645, 62 Stat. 815.)

**3103. Grounds for issuing search warrant-(Rule)**
Metadata: nan
URL: https://uscode.house.gov/view.xhtml?req=granuleid:USC-prelim-title18-section3103&num=0&edition=prelim
Content: See Federal Rules of Criminal ProcedureGrounds prescribed for issuance of search warrant, Rule 41(b).(June 25, 1948, ch. 645, 62 Stat. 819.)

**300