In [None]:
!pip install requests transformers sentence-transformers beautifulsoup4

Web Scraping the Sections

In [None]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://devgan.in"
LAW_TYPE = "ipc"
MAIN_URL = f"{BASE_URL}/{LAW_TYPE}/"
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block indefinitely

# Step 1: Get all chapter links from the index page.
response = requests.get(MAIN_URL, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(response.text, "html.parser")

chapters = []  # (chapter_number, chapter_title, chapter_link) tuples
for row in soup.select("table.menu tr"):
    columns = row.find_all("td")
    if len(columns) != 2:
        continue

    anchor = columns[1].find("a", href=True)
    if anchor is None:
        # A two-cell row without a link cannot be followed; skip it rather
        # than crash on subscripting None.
        continue

    chapter_number = columns[0].text.strip()
    chapter_title = columns[1].text.strip()
    # urljoin resolves both root-relative ("/ipc/x") and page-relative ("x")
    # hrefs against the index page; plain concatenation only handled the former.
    chapter_link = urljoin(MAIN_URL, anchor["href"])
    chapters.append((chapter_number, chapter_title, chapter_link))

# Step 2: Scrape each chapter's content
for chapter_number, chapter_title, chapter_link in chapters:
    print(f"Chapter {chapter_number}: {chapter_title}")
    print(f"URL: {chapter_link}")

    try:
        chapter_response = requests.get(chapter_link, timeout=REQUEST_TIMEOUT)
        chapter_response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable chapter should not abort the whole crawl; report it
        # and move on to the next one.
        print(f"Request failed: {exc}")
        print("\n" + "-"*100 + "\n")
        continue

    chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")

    # Extracting the main content - Modify selector if needed
    content_div = chapter_soup.find("div", id="content")

    if content_div:
        chapter_content = content_div.get_text(separator="\n", strip=True)
    else:
        chapter_content = "Content not found."

    print(chapter_content)
    print("\n" + "-"*100 + "\n")


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Placeholder passage; swap in the text scraped from a real section.
section_text = "Whoever does any act with the intention of causing death..."

# Sentence-embedding model used to vectorise the section text.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the passage into a single dense vector.
embedding = model.encode(section_text)

# Sanity check: for MiniLM the printed shape should be (384,).
embedding_shape = np.array(embedding).shape
print(embedding_shape)


In [None]:
import faiss
import numpy as np

# Stack the embeddings into a 2-D float32 matrix, one row per vector.
vectors = np.array([embedding], dtype=np.float32)

# Take the index width from the data rather than hard-coding 384, so that
# switching to a different embedding model cannot leave the two out of sync.
dimension = vectors.shape[1]

# Exact (brute-force) L2 index over the stored vectors.
index = faiss.IndexFlatL2(dimension)
index.add(vectors)


In [None]:
import pinecone

pinecone.init(api_key="your_api_key", environment="us-west1-gcp")

index = pinecone.Index("law-sections")

# Store vector with metadata
index.upsert(vectors=[("section_302", embedding.tolist(), {"text": section_text})])


Query

In [None]:
query = "What is the punishment for theft?"
query_embedding = model.encode(query)

# FAISS search
D, I = index.search(np.array([query_embedding], dtype=np.float32), k=3)
print(f"Top results: {I}")

# Pinecone search
results = index.query(query_embedding.tolist(), top_k=3, include_metadata=True)
for match in results["matches"]:
    print(match["metadata"]["text"])
