<a href="https://colab.research.google.com/github/Ste-0507/LegalMind/blob/main/legalMind_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U langchain langchain-community faiss-cpu sentence-transformers pypdf transformers gradio


In [None]:
!jupyter nbconvert --ClearMetadataPreprocessor.enabled=True \
    --to notebook --inplace your_notebook.ipynb


In [None]:
# -------------------------
# 0. Install Required Libraries
# -------------------------
# Run in Colab or terminal:
# !pip install -U langchain langchain-community faiss-cpu sentence-transformers pypdf transformers gradio

# -------------------------
# 1. Imports
# -------------------------
import os
import re
import pickle
import numpy as np
import gradio as gr
import faiss
import torch

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# -------------------------
# 2. Download IPC PDF if not present
# -------------------------
# Local filename of the Indian Penal Code PDF and the URL it is fetched from.
PDF_PATH = "Indian_Penal_Code.pdf"
PDF_URL = "https://www.iitk.ac.in/wc/data/IPC_186045.pdf"

if not os.path.exists(PDF_PATH):
    # Build the command from the constants so the download target can never
    # drift away from PDF_PATH.
    exit_status = os.system(f'wget "{PDF_URL}" -O "{PDF_PATH}"')
    if exit_status != 0:
        # `wget -O` creates/truncates the output file before the transfer, so a
        # failed download leaves a broken file behind. Remove it; otherwise the
        # existence check above would skip the retry on every later run.
        if os.path.exists(PDF_PATH):
            os.remove(PDF_PATH)
        print(f"Could not download {PDF_URL} (wget exit status {exit_status}).")

# -------------------------
# 3. Chunking and Section Tagging
# -------------------------
def load_and_chunk_pdf(pdf_path, chunk_size=500, chunk_overlap=50):
    """Load a PDF and split its pages into overlapping text chunks.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter`` (defaults to the previously
            hard-coded 500).
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter (defaults
            to the previously hard-coded 50).

    Returns:
        A list with the ``page_content`` string of every chunk, in the order
        the splitter produced them.
    """
    pages = PyPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [chunk.page_content for chunk in splitter.split_documents(pages)]

# "Section <number>" followed by an optional "." or ":" and the rest of the line.
# The number may carry a short letter suffix (e.g. 120B, 304A, 376AB); without
# it, "Section 304B. Dowry death" was tagged as "Section 304: B. Dowry death".
_SECTION_HEADING_RE = re.compile(
    r"(Section\s+\d+[A-Za-z]{0,2}\b)[\.:]?\s*(.*)",
    flags=re.IGNORECASE,
)


def tag_sections(texts):
    """Prefix every chunk with the most recently seen section heading.

    Each chunk is searched for the first "Section <number>" occurrence. When
    one is found it becomes the current heading, formatted as
    ``"<Section N>: <rest of that line>"``. Chunks without a match inherit the
    heading of the closest preceding chunk that had one.

    Args:
        texts: Iterable of chunk strings, in document order.

    Returns:
        A list of strings ``"<heading>\\n<chunk>"``. Chunks that appear before
        any heading has been seen are returned unchanged (no empty heading
        line is prepended).
    """
    tagged = []
    current_section = ""
    for text in texts:
        match = _SECTION_HEADING_RE.search(text)
        if match:
            current_section = f"{match.group(1)}: {match.group(2).strip()}"
        # Only prepend a heading line once there is a heading to show.
        tagged.append(f"{current_section}\n{text}" if current_section else text)
    return tagged

# -------------------------
# 4. Embeddings + Vector Store
# -------------------------
def generate_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    """Encode the given texts with a SentenceTransformer model.

    Args:
        texts: The strings to embed.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A ``(embeddings, encoder)`` tuple: the encoded texts converted to a
        float32 NumPy array, and the model instance that produced them.
    """
    encoder = SentenceTransformer(model_name)
    raw_vectors = encoder.encode(texts, show_progress_bar=True)
    float32_vectors = np.array(raw_vectors).astype("float32")
    return float32_vectors, encoder

def build_faiss_index(embedding_array):
    """Create a flat L2 FAISS index populated with the given embeddings.

    Args:
        embedding_array: Array whose second dimension (``shape[1]``) is used
            as the index dimensionality; every row is added to the index.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    vector_dim = embedding_array.shape[1]
    flat_index = faiss.IndexFlatL2(vector_dim)
    flat_index.add(embedding_array)
    return flat_index

def save_vector_store(index, embedding_array, texts):
    """Persist the vector store to three files in the working directory.

    Writes the FAISS index to ``faiss_index.bin``, the embeddings to
    ``embeddings.npy`` and the pickled chunk texts to ``texts.pkl``.

    Args:
        index: FAISS index to serialise.
        embedding_array: NumPy array of embeddings.
        texts: Chunk texts to pickle.
    """
    faiss.write_index(index, "faiss_index.bin")
    np.save("embeddings.npy", embedding_array)
    with open("texts.pkl", "wb") as texts_file:
        pickle.dump(texts, texts_file)

def load_vector_store(model_name="all-MiniLM-L6-v2"):
    """Load the persisted vector store and the matching embedding model.

    Reads ``faiss_index.bin``, ``embeddings.npy`` and ``texts.pkl`` from the
    working directory.

    Args:
        model_name: SentenceTransformer model to instantiate for encoding
            queries. It should be the same model the index was built with;
            the default matches the default of ``generate_embeddings``.

    Returns:
        A ``(index, embedding_array, texts, embedder)`` tuple.
    """
    index = faiss.read_index("faiss_index.bin")
    embedding_array = np.load("embeddings.npy")
    # NOTE(security): unpickling can execute arbitrary code. Only load a
    # texts.pkl that this application wrote itself; never one obtained from an
    # untrusted source.
    with open("texts.pkl", "rb") as f:
        texts = pickle.load(f)
    embedder = SentenceTransformer(model_name)
    return index, embedding_array, texts, embedder

# -------------------------
# 5. Load FLAN-T5 Model
# -------------------------
# Checkpoint shared by the tokenizer and the seq2seq model.
_FLAN_CHECKPOINT = "google/flan-t5-base"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(_FLAN_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_FLAN_CHECKPOINT).to(device)

# -------------------------
# 6. QA Generation Function
# -------------------------
def generate_answer_with_flan(query, context, max_input_tokens=1024, max_new_tokens=300):
    """Generate an answer to ``query`` from ``context`` with the FLAN-T5 model.

    The prompt is ``<instruction>\\n<context>\\n\\nQuestion: <query>``. Because
    the question sits at the end, plain right-truncation of an over-long
    prompt would drop the question first. When the prompt exceeds
    ``max_input_tokens`` only the context is shortened, so the question is
    preserved.

    Args:
        query: The user's question.
        context: Retrieved text the answer should be based on.
        max_input_tokens: Upper bound on the tokenised prompt length
            (defaults to the previously hard-coded 1024).
        max_new_tokens: ``max_new_tokens`` passed to ``model.generate``
            (defaults to the previously hard-coded 300).

    Returns:
        The decoded model output with special tokens removed.
    """
    instruction = "Answer this legal question using the Indian Penal Code:\n"
    question = f"\n\nQuestion: {query}"
    input_text = f"{instruction}{context}{question}"

    if len(tokenizer(input_text).input_ids) > max_input_tokens:
        # Tokens consumed by everything except the context.
        overhead = len(tokenizer(instruction + question).input_ids)
        # Keep a little slack: tokenising the pieces separately may differ by
        # a few tokens from tokenising the joined prompt.
        budget = max(max_input_tokens - overhead - 4, 0)
        context_ids = tokenizer(context, add_special_tokens=False).input_ids[:budget]
        trimmed_context = tokenizer.decode(context_ids, skip_special_tokens=True)
        input_text = f"{instruction}{trimmed_context}{question}"

    # truncation=True stays as a safety net (e.g. for an extremely long query).
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def get_answer(query, index, texts, embedder, top_k=5):
    """Retrieve the chunks closest to ``query`` and generate an answer.

    Args:
        query: The user's question.
        index: Search index exposing ``search(vectors, k)`` and returning
            ``(distances, ids)``.
        texts: Chunk texts, positioned so that ``texts[i]`` belongs to index
            id ``i``.
        embedder: Model exposing ``encode(list_of_str)``.
        top_k: Number of nearest chunks to request from the index.

    Returns:
        A formatted string containing the generated answer followed by the
        retrieved chunks that were used as context.
    """
    query_embedding = np.array(embedder.encode([query])).astype("float32")
    _, neighbour_ids = index.search(query_embedding, top_k)

    # FAISS fills missing results with id -1 when the index holds fewer than
    # top_k vectors. texts[-1] would silently pull in the LAST chunk, so drop
    # every id that is not a valid position in `texts`.
    selected_chunks = [texts[i] for i in neighbour_ids[0] if 0 <= i < len(texts)]

    context = "\n\n".join(selected_chunks)
    answer = generate_answer_with_flan(query, context)
    return f"📜 *Answer*:\n{answer}\n\n🔍 *Referenced Sections (from IPC)*:\n{context}"

# -------------------------
# 7. Gradio Interface
# -------------------------
# Vector store loaded on first use; None until the first successful load.
_VECTOR_STORE_CACHE = None


def _get_vector_store():
    """Return the vector store, loading it from disk only on the first call."""
    global _VECTOR_STORE_CACHE
    if _VECTOR_STORE_CACHE is None:
        _VECTOR_STORE_CACHE = load_vector_store()
    return _VECTOR_STORE_CACHE


def qa_interface(query):
    """Gradio callback: answer ``query`` or return a readable error message.

    The vector store (FAISS index, texts and embedding model) is loaded once
    and reused, instead of being re-read from disk and re-instantiated for
    every question. Loading happens inside the ``try`` so a missing or corrupt
    store is reported in the UI rather than raised.

    Args:
        query: The text entered by the user.

    Returns:
        The formatted answer, a prompt to enter a question when ``query`` is
        blank, or ``"Error: <message>"`` if anything fails.
    """
    if not query or not query.strip():
        return "Please enter a question."
    try:
        index, _, texts, embedder = _get_vector_store()
        return get_answer(query, index, texts, embedder)
    except Exception as e:  # UI boundary: show the failure instead of crashing.
        return f"Error: {e}"

# Input and output widgets of the chatbot UI.
_question_box = gr.Textbox(label="Ask a legal question (IPC-related):")
_answer_box = gr.Textbox(label="Answer")

# Gradio app that routes each submitted question through qa_interface.
iface = gr.Interface(
    fn=qa_interface,
    inputs=_question_box,
    outputs=_answer_box,
    title="LegalMind IPC Chatbot",
    description="Ask legal questions based on the Indian Penal Code (IPC).",
)

# -------------------------
# 8. Run Main
# -------------------------
if __name__ == "__main__":
    # load_vector_store() reads all three files, so rebuild the store when ANY
    # of them is missing, not only the FAISS index.
    required_artifacts = ("faiss_index.bin", "embeddings.npy", "texts.pkl")
    if not all(os.path.exists(path) for path in required_artifacts):
        print("🔧 Building FAISS index...")
        texts = load_and_chunk_pdf(PDF_PATH)
        texts = tag_sections(texts)
        # The model instance returned alongside the embeddings is not needed
        # here.
        embedding_array, _ = generate_embeddings(texts)
        index = build_faiss_index(embedding_array)
        save_vector_store(index, embedding_array, texts)
        print("✅ Vector store built.")

    iface.launch(debug=True, share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

🔧 Building FAISS index...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

✅ Vector store built.
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://bace248a1f39521655.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
