In [1]:
!pip install gradio openai sentence-transformers faiss-cpu datasets pandas nltk pymongo pymupdf
!pip install langchain-community



In [2]:
import os
import gradio as gr
import pandas as pd
import numpy as np
import faiss
import nltk
import re
import tempfile
import fitz  # PyMuPDF

from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from pymongo import MongoClient

# Fetch the NLTK tokenizer data packages. Presumably these back the
# sent_tokenize calls used when the legal texts are split into sentences —
# TODO confirm which of the two the installed NLTK version actually needs.
nltk.download("punkt")

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Prompt for the IPC sections CSV through Colab's file-upload helper.
# NOTE(review): the recorded output of this cell shows the upload being saved
# as "ipc_sections (1).csv" (presumably because "ipc_sections.csv" already
# existed), while the loading cell reads "ipc_sections.csv" — confirm that the
# file being read is the one that was just uploaded.
from google.colab import files
uploaded = files.upload()


Saving ipc_sections.csv to ipc_sections (1).csv


In [4]:
def connect_mongo():
    """Connect to MongoDB and return the prompt-history collection.

    The connection string is taken from the ``MONGODB_URI`` environment
    variable so real credentials never need to be written into the notebook;
    the original placeholder URI is kept as the fallback.

    Returns:
        The ``prompt_history`` collection of the ``argulex`` database, or
        ``None`` when the connection cannot be established (the failure is
        printed rather than raised so the rest of the notebook still runs).
    """
    uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://<username>:<password>@<cluster>.mongodb.net/?retryWrites=true&w=majority&appName=<cluster-name>",
    )
    try:
        # Fail fast instead of waiting for the default server-selection timeout.
        client = MongoClient(uri, serverSelectionTimeoutMS=5000)
        # MongoClient connects lazily; ping so that an unreachable server is
        # reported here instead of on the first insert.
        client.admin.command("ping")
        db = client["argulex"]
        collection = db["prompt_history"]
        print("✅ MongoDB connected.")
        return collection
    except Exception as e:
        print("❌ MongoDB Connection Failed:", e)
        return None


collection = connect_mongo()


❌ MongoDB Connection Failed: The DNS query name does not exist: _mongodb._tcp.<cluster>.mongodb.net.


In [5]:
# Load IPC CSV
# Colab never overwrites an existing file: a re-upload is stored as
# "ipc_sections (1).csv", "ipc_sections (2).csv", ... while the stale
# "ipc_sections.csv" stays on disk. Read the most recently written copy so the
# latest upload is the one that gets indexed.
IPC_CSV_PATH = "ipc_sections.csv"
MIN_SENTENCE_CHARS = 20  # shorter fragments are dropped from the corpus

ipc_csv_candidates = [
    name
    for name in os.listdir(".")
    if re.fullmatch(r"ipc_sections( \(\d+\))?\.csv", name)
]
# Fall back to the canonical name so a missing file still raises the usual
# FileNotFoundError from pandas.
ipc_csv_path = (
    max(ipc_csv_candidates, key=os.path.getmtime)
    if ipc_csv_candidates
    else IPC_CSV_PATH
)

# Rows without a description cannot contribute any sentence.
ipc_df = pd.read_csv(ipc_csv_path).dropna(subset=["Description"])

# Load Constitution dataset
constitution_ds = load_dataset("Sharathhebbar24/Indian-Constitution", split="train")

# Preprocess: one record per sentence, tagged with the article / section it
# came from so answers can cite their source.
constitution_data = [
    {"source": row["article_id"], "text": sent.strip()}
    for row in constitution_ds
    if "article_desc" in row and row["article_desc"]
    for sent in sent_tokenize(row["article_desc"])
    if len(sent.strip()) > MIN_SENTENCE_CHARS
]

ipc_data = [
    {"source": f"Section {row['Section']}", "text": sent.strip()}
    for _, row in ipc_df.iterrows()
    for sent in sent_tokenize(str(row["Description"]))
    if len(sent.strip()) > MIN_SENTENCE_CHARS
]

all_data = constitution_data + ipc_data
# Parallel lists: texts[i] is the sentence and sources[i] is its origin label.
texts = [item["text"] for item in all_data]
sources = [item["source"] for item in all_data]

assert texts, "No sentences were extracted from the Constitution or IPC data."


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Encode every corpus sentence into a float32 matrix with one row per sentence.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = np.asarray(
    model.encode(texts, show_progress_bar=True, convert_to_tensor=False),
    dtype="float32",
)

# Flat L2 index whose dimensionality matches the embedding width.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

In [7]:
# Do not clobber a real key that is already configured in the environment;
# only fall back to the placeholder when nothing is set. Never commit a real
# key into this cell.
os.environ.setdefault("OPENAI_API_KEY", "your-openai-api-key")

# temperature=0 is passed so the model is asked for its least random answer.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# The prompt has two slots: the retrieved legal text and the user's question.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a legal expert. Given the legal text below and the user's question, provide a precise and accurate legal answer:

Legal Text:
{context}

Question:
{question}

Answer:"""
)

qa_chain = LLMChain(llm=llm, prompt=prompt_template)


  llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
  qa_chain = LLMChain(llm=llm, prompt=prompt_template)


In [8]:
def get_best_match(question):
    """Look up the single indexed sentence nearest to ``question``.

    The question is embedded with the module-level ``model`` and searched
    against the module-level FAISS ``index`` with ``k=1``.

    Returns:
        A ``(text, source)`` tuple taken from the parallel ``texts`` and
        ``sources`` lists at the position of the top hit.
    """
    query_vector = np.array(
        model.encode([question], convert_to_tensor=False)
    ).astype("float32")
    _distances, neighbor_ids = index.search(query_vector, k=1)
    top_hit = neighbor_ids[0][0]
    return texts[top_hit], sources[top_hit]

def legal_chatbot(question):
    """Answer a legal question from the best-matching indexed sentence.

    Args:
        question: Free-text question entered by the user.

    Returns:
        A Markdown string containing the generated answer and the source
        label of the sentence used as context. If generation fails, the error
        text is returned in place of the answer. A blank question returns a
        short prompt to enter one, without calling the retriever or the LLM.
    """
    if not question or not question.strip():
        return "⚠️ Please enter a legal question."

    context, source = get_best_match(question)
    try:
        response = qa_chain.run({"context": context, "question": question})
    except Exception as e:
        response = f"Error during generation: {e}"

    # pymongo Collection objects raise on truth-value testing, so the
    # connection has to be checked with an explicit comparison against None.
    if collection is not None:
        try:
            collection.insert_one({
                "question": question,
                "context": context,
                "source": source,
                "response": response
            })
        except Exception as e:
            # History logging is best-effort: report the problem but still
            # return the answer to the user.
            print("⚠️ Could not save prompt history:", e)

    return f"✅ **Answer:** {response}\n\n📚 **Source:** {source}"


In [9]:
def split_into_sentences(text):
    """Split ``text`` into sentences.

    Leading and trailing whitespace is removed first; the text is then cut at
    every run of whitespace that directly follows ``.``, ``!`` or ``?``. The
    punctuation stays attached to the sentence it ends.

    Returns:
        A list of sentence strings (``[""]`` for blank input).
    """
    boundary = re.compile(r'(?<=[.!?])\s+')
    trimmed = text.strip()
    return boundary.split(trimmed)

def generate_report(file_path):
    """Build a plain-text case report from a court-judgment PDF.

    The report contains the case name (derived from the file name), the
    judgment date and bench (pulled from the PDF text with regular
    expressions), a summary made of the first three sentences, and the first
    sentence that looks like a conclusion together with the one after it.
    The report is also written to a ``.txt`` file in the system temp directory.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        ``(report_text, report_path)`` on success, or
        ``("❌ Error: <reason>", None)`` when anything goes wrong; errors are
        reported through the return value instead of being raised.
    """
    try:
        if not file_path:
            # Gives the user a readable message instead of a TypeError text
            # when no file was uploaded.
            raise ValueError("no PDF file was provided.")

        filename = os.path.basename(file_path)
        # Strip only a trailing ".pdf", in any letter case.
        stem = filename[:-4] if filename.lower().endswith(".pdf") else filename
        case_name = stem.replace("_", " ").title()
        clean_filename = stem.replace(" ", "_").replace("-", "_").title()

        # The context manager closes the document even if extraction fails.
        with fitz.open(file_path) as doc:
            full_text = "".join(page.get_text() for page in doc)

        date_match = re.search(r"DATE\s+OF\s+JUDGMENT:\s*([\d/]+)", full_text, re.I)
        # re.M lets "$" end the capture at the end of the bench line; without
        # it "$" only matches at the very end of the document, so the bench
        # was almost never found.
        judge_match = re.search(
            r"BENCH:\s*(.+?)(?:BENCH|CITATION|$)", full_text, re.I | re.M
        )
        date = date_match.group(1) if date_match else "Unknown"
        judges = judge_match.group(1).strip() if judge_match else "Unknown"

        sentences = split_into_sentences(full_text)
        summary = " ".join(sentences[:3]) if len(sentences) >= 3 else full_text

        # Take the first sentence that contains a conclusion keyword plus the
        # sentence that follows it.
        conclusion = "Conclusion not found."
        for i, sentence in enumerate(sentences):
            if re.search(r"(conclusion|held|ordered|final decision|therefore)", sentence, re.I):
                conclusion = " ".join(sentences[i:i+2])
                break

        report = f"""🧾 Generated Case Report
🏛️ Case Name: {case_name}
📅 Date: {date}
👨‍⚖️ Judges: {judges}

🧠 Summary:
{summary}

✅ Conclusion:
{conclusion}
"""
        temp_dir = tempfile.gettempdir()
        report_path = os.path.join(temp_dir, f"{clean_filename}.txt")
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(report)

        return report, report_path

    except Exception as e:
        return f"❌ Error: {e}", None


In [10]:
# Two-tab Gradio UI: a question-answering chatbot and a PDF case-report generator.
with gr.Blocks() as app:
    gr.Markdown("## ⚖️ ArguLex - Legal AI Assistant")

    with gr.Tabs():
        with gr.TabItem("👥 Legal Chatbot"):
            gr.Markdown("Ask legal questions related to the Indian Constitution or IPC.")
            public_input = gr.Textbox(label="❓ Ask Your Legal Question")
            public_output = gr.Markdown()
            public_button = gr.Button("🔍 Get Answer")
            public_button.click(legal_chatbot, inputs=public_input, outputs=public_output)

        with gr.TabItem("👨‍⚖️ Professional PDF Analyzer"):
            gr.Markdown("Upload court PDFs to generate structured reports.")
            file_input = gr.File(label="📎 Upload Court PDF", type="filepath")
            report_output = gr.Textbox(label="📋 Case Report", lines=25)
            # Hidden until a report file has actually been produced.
            report_file = gr.File(label="📥 Download Report", visible=False)
            analyze_button = gr.Button("🧠 Generate Report")

            def handle_upload(file):
                """Generate the report for the uploaded PDF and update both outputs.

                Returns the report text (or the error message) for the textbox
                and an update for the download component. The download
                component is only shown when a report file was written, so a
                failed analysis does not reveal an empty download widget.
                """
                summary_text, generated_file_path = generate_report(file)
                return summary_text, gr.update(
                    value=generated_file_path,
                    visible=generated_file_path is not None,
                )

            analyze_button.click(handle_upload, inputs=[file_input], outputs=[report_output, report_file])

app.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cfe2326353f32a3a6b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


