Installing packages

In [1]:
!pip install requests faiss-cpu sentence-transformers transformers beautifulsoup4

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [2]:
# Clone the project repository into ./Legal_AEye-Opener. The scraping cell below
# writes its outputs to /content/Legal_AEye-Opener/, so this presumably runs with
# /content as the working directory (Colab default) -- TODO confirm.
!git clone https://github.com/Rithvik50/Legal_AEye-Opener.git

Cloning into 'Legal_AEye-Opener'...
remote: Enumerating objects: 113, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 113 (delta 22), reused 8 (delta 5), pack-reused 72 (from 1)[K
Receiving objects: 100% (113/113), 618.77 KiB | 10.31 MiB/s, done.
Resolving deltas: 100% (39/39), done.


In [3]:
import requests
from bs4 import BeautifulSoup
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from transformers import pipeline
import re

# Initialize embedding model (Sentence-BERT for efficiency)
model = SentenceTransformer("all-MiniLM-L6-v2")
# Summarization pipeline used to produce a short summary for each scraped page.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

BASE_URL = "https://devgan.in"
LAW_TYPES = ["ipc", "bns"]  # Both IPC and BNS will be stored together

# FAISS setup
# Ask the encoder for its embedding size instead of hard-coding it (384 for
# all-MiniLM-L6-v2), so the index dimension cannot drift from the model.
d = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(d)
# Metadata records; entry i describes the i-th vector added to `index`.
documents = []

def extract_section_number(content):
    """Return the first section number mentioned in ``content``.

    Looks for the literal word ``Section`` followed by a number. An optional
    one- or two-letter uppercase suffix (e.g. ``120A``, ``376AB``) is kept when
    it ends at a word boundary, so lettered sections are not truncated to their
    numeric prefix.

    Args:
        content: Text to search.

    Returns:
        The section number as a string (e.g. ``"302"`` or ``"120A"``), or
        ``"Unknown"`` when no ``Section <number>`` occurrence is found.
    """
    # The suffix group is optional and requires a trailing word boundary so
    # that text glued to the number ("Section 5Punishment") still yields "5".
    match = re.search(r"Section\s*(\d+(?:[A-Z]{1,2}\b)?)", content)
    return match.group(1) if match else "Unknown"

def summarize_content(content):
    """Summarize the leading part of ``content`` with the module-level summarizer.

    Only the first 1024 characters are passed to the pipeline. Generation is
    deterministic (``do_sample=False``) and bounded by ``min_length=50`` and
    ``max_length=150``.

    Args:
        content: Text to summarize.

    Returns:
        The ``summary_text`` field of the pipeline's first result.
    """
    head = content[:1024]
    results = summarizer(head, max_length=150, min_length=50, do_sample=False)
    first_result = results[0]
    return first_result['summary_text']

# Seconds to wait for the site; without a timeout requests.get can block forever.
REQUEST_TIMEOUT = 30

# For every law type: read the chapter list from its index page, then fetch,
# embed, summarize and record each chapter page.
for LAW_TYPE in LAW_TYPES:
    MAIN_URL = f"{BASE_URL}/{LAW_TYPE}/"
    response = requests.get(MAIN_URL, timeout=REQUEST_TIMEOUT)
    # If the index page itself fails there is nothing to scrape; raise instead
    # of silently producing an empty index for this law type.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    chapters = []
    for row in soup.select("table.menu tr"):
        columns = row.find_all("td")
        if len(columns) != 2:
            continue
        # A two-cell row without a link would otherwise raise TypeError on
        # the ["href"] lookup; skip such rows.
        link_tag = columns[1].find("a")
        if link_tag is None or not link_tag.get("href"):
            continue
        chapter_number = columns[0].text.strip()
        chapter_title = columns[1].text.strip()
        chapter_link = BASE_URL + link_tag["href"]
        chapters.append((chapter_number, chapter_title, chapter_link))

    # Scrape each chapter's content
    for chapter_number, chapter_title, chapter_link in chapters:
        chapter_response = requests.get(chapter_link, timeout=REQUEST_TIMEOUT)
        chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
        content_div = chapter_soup.find("div", id="content")

        if content_div:
            chapter_content = content_div.get_text(separator="\n", strip=True)
        else:
            # Best effort: keep a placeholder record when the page has no content div.
            chapter_content = "Content not found."

        # Do all fallible work (embedding, summarization) before touching the
        # index, so `index` and `documents` cannot get out of step if one fails.
        embedding = model.encode(chapter_content).astype(np.float32)

        section_number = extract_section_number(chapter_content)
        section_summary = summarize_content(chapter_content)
        formatted_summary = f"Section {section_number}: {section_summary}" if section_number != "Unknown" else section_summary

        # Store the vector in FAISS (expects a 2-D array: one row per vector)
        index.add(np.array([embedding]))

        # Store metadata at the same position as the vector just added
        documents.append({
            "law_type": LAW_TYPE.upper(),
            "chapter_number": chapter_number,
            "chapter_title": chapter_title,
            "content": chapter_content,
            "summary": formatted_summary,
            "source_url": chapter_link
        })

# Persist the FAISS index. write_index returns None, so its result must not be
# assigned back to `index` -- doing so would discard the in-memory index and
# break any later cell that searches or extends it.
faiss.write_index(index, "/content/Legal_AEye-Opener/law_faiss.index")

# Save metadata (UTF-8, non-ASCII characters written as-is)
with open("/content/Legal_AEye-Opener/law_metadata.json", "w", encoding="utf-8") as f:
    json.dump(documents, f, ensure_ascii=False, indent=4)

print("Scraping, summarization, and FAISS indexing complete!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Scraping, summarization, and FAISS indexing complete!
