In [16]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config

s3 = boto3.client(
    "s3",
    config=Config(signature_version=UNSIGNED),
    region_name="ap-south-1"
)

bucket = "indian-supreme-court-judgments"

response = s3.list_objects_v2(
    Bucket=bucket,
    Delimiter="/"
)

for prefix in response.get("CommonPrefixes", []):
    print(prefix["Prefix"])


data-old/
data/
metadata/


In [17]:
response = s3.list_objects_v2(
    Bucket=bucket,
    Prefix="data/zip/",
    Delimiter="/"
)

years = []
for p in response.get("CommonPrefixes", []):
    years.append(p["Prefix"])

print(years[-10:])  # show last few years


['data/zip/year=2016/', 'data/zip/year=2017/', 'data/zip/year=2018/', 'data/zip/year=2019/', 'data/zip/year=2020/', 'data/zip/year=2021/', 'data/zip/year=2022/', 'data/zip/year=2023/', 'data/zip/year=2024/', 'data/zip/year=2025/']


In [None]:
import os

save_dir = "data/raw/zips/"
os.makedirs(save_dir, exist_ok=True)

years = ["2023", "2024"]  # adjust if latest differs

for year in years:
    key = f"data/zip/year={year}/english.zip"
    local_path = os.path.join(save_dir, f"{year}_english.zip")

    s3.download_file(bucket, key, local_path)
    print(f"Downloaded {year} English judgments")


In [7]:
import zipfile

extract_dir = "data/raw/sc_last_2_years/"
os.makedirs(extract_dir, exist_ok=True)

for year in years:
    zip_path = f"data/raw/zips/{year}_english.zip"
    out_dir = f"{extract_dir}/{year}"
    os.makedirs(out_dir, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(out_dir)

    print(f"Extracted {year} judgments")


Extracted 2023 judgments
Extracted 2024 judgments


In [8]:
import random
import shutil

final_dir = "data/processed/sc_sampled/"
os.makedirs(final_dir, exist_ok=True)

SAMPLES_PER_YEAR = 300

for year in years:
    year_dir = f"{extract_dir}/{year}"
    files = [f for f in os.listdir(year_dir) if f.endswith(".json")]

    sampled = random.sample(files, min(SAMPLES_PER_YEAR, len(files)))

    for file in sampled:
        shutil.copy(
            os.path.join(year_dir, file),
            os.path.join(final_dir, f"{year}_{file}")
        )

    print(f"Sampled {len(sampled)} cases for {year}")


Sampled 0 cases for 2023
Sampled 0 cases for 2024


In [9]:
import pdfplumber
import os
from tqdm import tqdm

PDF_DIR = "data/raw/sc_last_2_years"
TEXT_DIR = "data/processed/texts"

os.makedirs(TEXT_DIR, exist_ok=True)

def extract_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

for year in ["2023", "2024"]:
    year_dir = os.path.join(PDF_DIR, year)

    for root, dirs, files in os.walk(year_dir):
        for file in tqdm(files, desc=f"Processing {year}"):
            if file.lower().endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                text = extract_text(pdf_path)

                out_file = f"{year}_{file.replace('.pdf', '.txt')}"
                with open(os.path.join(TEXT_DIR, out_file), "w", encoding="utf-8") as f:
                    f.write(text)


Processing 2023:  91%|█████████▏| 780/854 [14:25<02:54,  2.35s/it]  Cannot set gray non-stroke color because /'R5823' is an invalid float value
Cannot set gray non-stroke color because /'R5823' is an invalid float value
Cannot set gray non-stroke color because /'R5826' is an invalid float value
Processing 2023: 100%|██████████| 854/854 [18:43<00:00,  1.32s/it]
Processing 2024: 100%|██████████| 782/782 [14:59<00:00,  1.15s/it]  


In [10]:
import re
import os

TEXT_DIR = "data/processed/texts"
CLEAN_DIR = "data/processed/clean_texts"
os.makedirs(CLEAN_DIR, exist_ok=True)

def clean_text(text):
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'Page \d+ of \d+', '', text)
    text = re.sub(r'JUDGMENT|ORDER', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

for file in os.listdir(TEXT_DIR):
    with open(os.path.join(TEXT_DIR, file), "r", encoding="utf-8") as f:
        cleaned = clean_text(f.read())

    with open(os.path.join(CLEAN_DIR, file), "w", encoding="utf-8") as f:
        f.write(cleaned)


In [12]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bhojw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bhojw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [13]:
import os
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

CHUNK_DIR = "data/processed/chunks"
CLEAN_DIR = "data/processed/clean_texts"
os.makedirs(CHUNK_DIR, exist_ok=True)

CHUNK_SIZE = 500  # words

def chunk_text(text):
    sentences = sent_tokenize(text)
    chunks, current = [], []

    count = 0
    for sent in sentences:
        words = sent.split()
        count += len(words)
        current.append(sent)

        if count >= CHUNK_SIZE:
            chunks.append(" ".join(current))
            current, count = [], 0

    if current:
        chunks.append(" ".join(current))

    return chunks

for file in os.listdir(CLEAN_DIR):
    with open(os.path.join(CLEAN_DIR, file), "r", encoding="utf-8") as f:
        text = f.read()

    chunks = chunk_text(text)

    for i, chunk in enumerate(chunks):
        with open(os.path.join(CHUNK_DIR, f"{file}_chunk_{i}.txt"), "w", encoding="utf-8") as f:
            f.write(chunk)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bhojw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
from sentence_transformers import SentenceTransformer
import os
import numpy as np

CHUNK_DIR = "data/processed/chunks"
EMB_DIR = "data/processed/embeddings"
os.makedirs(EMB_DIR, exist_ok=True)

model = SentenceTransformer("all-MiniLM-L6-v2")

texts = []
metadata = []

for file in os.listdir(CHUNK_DIR):
    with open(os.path.join(CHUNK_DIR, file), "r", encoding="utf-8") as f:
        text = f.read()
        texts.append(text)
        metadata.append(file)

print(f"Total chunks: {len(texts)}")

embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True
)

np.save(os.path.join(EMB_DIR, "embeddings.npy"), embeddings)
np.save(os.path.join(EMB_DIR, "metadata.npy"), metadata)


  from .autonotebook import tqdm as notebook_tqdm


Total chunks: 27551


Batches: 100%|██████████| 861/861 [07:31<00:00,  1.91it/s]


In [15]:
import faiss
import numpy as np
import os

EMB_DIR = "data/processed/embeddings"
INDEX_DIR = "data/processed/faiss"
os.makedirs(INDEX_DIR, exist_ok=True)

embeddings = np.load(os.path.join(EMB_DIR, "embeddings.npy"))

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

faiss.write_index(index, os.path.join(INDEX_DIR, "sc_judgments.index"))

print("FAISS index created.")


FAISS index created.


In [16]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

model = SentenceTransformer("all-MiniLM-L6-v2")

query = "Employee terminated without notice in private company"

query_embedding = model.encode([query])

index = faiss.read_index("data/processed/faiss/sc_judgments.index")
metadata = np.load("data/processed/embeddings/metadata.npy", allow_pickle=True)

D, I = index.search(query_embedding, k=5)

print("Top matching chunks:")
for idx in I[0]:
    print(metadata[idx])


Top matching chunks:
2024_2024_12_492_499_EN.txt_chunk_2.txt
2024_2024_11_2369_2374_EN.txt_chunk_0.txt
2023_2023_15_893_902_EN.txt_chunk_4.txt
2024_2024_8_901_915_EN.txt_chunk_8.txt
2024_2024_10_2303_2313_EN.txt_chunk_3.txt


In [17]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

MODEL_NAME = "all-MiniLM-L6-v2"
CHUNK_DIR = "data/processed/chunks"

model = SentenceTransformer(MODEL_NAME)

index = faiss.read_index("data/processed/faiss/sc_judgments.index")
metadata = np.load(
    "data/processed/embeddings/metadata.npy",
    allow_pickle=True
)

def retrieve_chunks(query, top_k=5):
    query_embedding = model.encode([query])
    D, I = index.search(query_embedding, top_k)

    retrieved_texts = []
    for idx in I[0]:
        chunk_file = metadata[idx]
        with open(os.path.join(CHUNK_DIR, chunk_file), "r", encoding="utf-8") as f:
            retrieved_texts.append(f.read())

    return retrieved_texts


In [1]:
def build_legal_prompt(user_case, retrieved_chunks):
    context = "\n\n".join(retrieved_chunks)

    prompt = f"""
You are a legal research assistant trained on Indian law.

User Case Description:
{user_case}

Relevant Supreme Court Judgment Excerpts:
{context}

Tasks:
1. Identify applicable legal principles and statutes.
2. Explain how courts have ruled in similar cases.
3. Provide a general legal strategy based on past judgments.
4. Highlight common mistakes and important precautions.

Important:
- Do NOT provide legal advice.
- Provide informational and educational guidance only.
- Mention uncertainty where applicable.

Answer clearly and concisely.
"""
    return prompt


In [9]:
from groq import Groq

client = Groq(api_key="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")

chat_completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "user", "content": "Summarize contract law"}
    ]
)

print(chat_completion.choices[0].message.content)



Contract law is a set of rules and regulations that governs the creation, performance, and enforcement of contracts between two or more parties. Here's a summary of the key aspects of contract law:

**Formation of a Contract**

1. **Offer**: One party makes an offer to another party, which must be clear and unambiguous.
2. **Acceptance**: The offeree accepts the offer, which must be communicated in a timely manner.
3. **Consideration**: Both parties must provide something of value (consideration) to the other party.
4. **Capacity**: Parties involved in a contract must have the capacity to enter into a contract (i.e., be of sound mind, be of legal age, etc.).
5. **Legality**: The contract must not be for an illegal purpose or outcome.

**Types of Contracts**

1. **Express Contracts**: Written agreements between parties.
2. **Implied Contracts**: Unwritten agreements that can be inferred from the parties' actions or circumstances.
3. **Quasi Contracts**: Unintended obligations between pa

In [11]:
def generate_response(prompt, model="llama-3.1-8b-instant", max_tokens=500, temperature=0.2):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a legal research assistant. You DO NOT give legal advice. You only summarize patterns from judgments."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )
    return completion.choices[0].message.content


In [12]:
print(generate_response("Summarize breach of contract in Indian law."))


In Indian law, breach of contract is governed by the Indian Contract Act, 1872. Here's a summary of the key aspects:

**Definition of Breach of Contract:**
A breach of contract occurs when one party fails to perform their obligations under the contract, or performs them in a manner that is not in accordance with the terms of the contract.

**Types of Breach of Contract:**
There are two types of breach of contract:

1. **Actual Breach:** This occurs when a party fails to perform their obligations under the contract.
2. **Anticipatory Breach:** This occurs when a party indicates, before the time for performance, that they will not perform their obligations under the contract.

**Consequences of Breach of Contract:**
The consequences of breach of contract depend on the nature of the breach and the terms of the contract. Some common consequences include:

1. **Damages:** The injured party may claim damages from the breaching party.
2. **Specific Performance:** The court may order the breac

In [11]:
# STEP 6: Structured Legal Output Prompt Builder

def build_legal_prompt(user_case, retrieved_chunks):
    """
    retrieved_chunks: list of tuples (chunk_text, confidence)
    """
    context = ""
    for txt, conf in retrieved_chunks:
        context += f"\n[Confidence={round(conf,2)}]\n{txt}\n"

    prompt = f"""
You are a legal research assistant trained on Indian Supreme Court judgments.
You DO NOT provide legal advice. You ONLY summarize patterns.

User Case Description:
{user_case}

Relevant Judgment Extracts:
{context}

Now respond in EXACTLY this structure:

1. Key Legal Issues Raised
2. Possible Applicable Acts / IPC / CrPC / Labour Codes
3. How Supreme Court Has Handled Similar Cases
4. General Strategic Considerations (Non-advisory)
5. Potential Risks / Limitations
6. Documentation Commonly Required in Such Matters
7. Strong Disclaimer (Not Legal Advice)

Start now:
"""
    return prompt


In [12]:
dummy_chunks = [("Employer terminated without notice...", 0.82)]
prompt = build_legal_prompt("I was fired from my IT job.", dummy_chunks)
print(prompt)



You are a legal research assistant trained on Indian Supreme Court judgments.
You DO NOT provide legal advice. You ONLY summarize patterns.

User Case Description:
I was fired from my IT job.

Relevant Judgment Extracts:

[Confidence=0.82]
Employer terminated without notice...


Now respond in EXACTLY this structure:

1. Key Legal Issues Raised
2. Possible Applicable Acts / IPC / CrPC / Labour Codes
3. How Supreme Court Has Handled Similar Cases
4. General Strategic Considerations (Non-advisory)
5. Potential Risks / Limitations
6. Documentation Commonly Required in Such Matters
7. Strong Disclaimer (Not Legal Advice)

Start now:



In [15]:
import numpy as np
import faiss
import os
from sentence_transformers import SentenceTransformer

# Paths already used earlier in your pipeline
EMB_DIR = "data/processed/embeddings"
INDEX_PATH = "data/processed/faiss/sc_judgments.index"
CHUNK_DIR = "data/processed/chunks"

# Load metadata and index
metadata = np.load(os.path.join(EMB_DIR, "metadata.npy"), allow_pickle=True)
index = faiss.read_index(INDEX_PATH)

# Load embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_chunks(query, top_k=3):
    """
    Takes a query string, retrieves top_k legal chunks using FAISS similarity.
    Returns a list of tuples: (text_chunk, confidence_score)
    """
    query_emb = embed_model.encode([query])
    D, I = index.search(query_emb, top_k)

    results = []
    for dist, idx in zip(D[0], I[0]):
        conf = float(1 / (1 + dist))   # heuristic confidence score
        fname = metadata[idx]          # filename of chunk
        with open(os.path.join(CHUNK_DIR, fname), "r", encoding="utf-8") as f:
            chunk_text = f.read()
        results.append((chunk_text, conf))
    
    return results


FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/embeddings\\metadata.npy'

In [15]:
print(retrieve_chunks("employment termination", top_k=2))


[('You have continued to remain absent at work premises without authorisation and also you did not present yourself for our enquiry meetings called for as per our disciplinary Policy. Considering all the above, as per your agreed employment terms Clause 11, 12(V), 17, 24 & 25, your employment has been terminated with effective from the closing hours of 06 Jan 2021. […]” 5. It is evident from the above that there is no allegation whatsoever that the appellant has violated clause 19 of the appointment leading to the of termination. 6. During the pendency of disciplinary action, as the appellant was not paid his salary, he issued a legal notice for payment of wages on 29.05.2021 and filed a petition under Section 15(2) of the PW Act before the authority under the PW Act. As a counterblast, the respondent issued a notice alleging that the disputes must be settled through arbitration and proceeded to unilaterally appoint an arbitrator. We may mention here itself that even in the said reply 

In [14]:
# Full RAG test flow inside notebook

user_case = "I was terminated from a private job without notice."

retrieved = retrieve_chunks(user_case, top_k=3)
prompt = build_legal_prompt(user_case, retrieved)

response = generate_response(prompt, max_tokens=350)
print(response)


NameError: name 'retrieve_chunks' is not defined