In [None]:
!pip install -U bitsandbytes
!pip install pymupdf pytesseract pillow
!pip install torch transformers
!pip install accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
!pip install huggingface_hub
from huggingface_hub import login
login(token="Huggingface_Token")



In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:
import os
from getpass import getpass

# app.py reads HUGGINGFACEHUB_API_TOKEN at start-up. Keep a token that is
# already configured instead of overwriting it with an empty string, and
# prompt (without echo) only when none is set, so no secret lives in the
# notebook source.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face access token: ")

# Source of the Streamlit app that is written to app.py.
# It is a *raw* string delimited by triple single quotes so that regexes,
# "\n" escapes and triple-double-quoted strings appear here exactly as they
# will appear in app.py (no double escaping). The only restriction is that the
# app source must not contain three consecutive single quotes.
app_code = r'''
import html
import io
import os
import re

import fitz  # PyMuPDF
import pytesseract
import streamlit as st
import torch
from huggingface_hub import login
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    LEDForConditionalGeneration,
    LEDTokenizer,
)

# Authenticate only when a token is actually configured; calling login() with
# an empty or missing token would fail before the UI is even rendered.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(token=hf_token)

st.set_page_config(page_title="Legal Case Summarizer", layout="centered")

st.markdown("""<style>
    .main { background-color: #f4f4f9; }
    .block-container { padding: 2rem; }
    h1, h2 { color: #2f3e46; }
    .stButton>button {
        background-color: #1e3d59;
        color: white;
        border-radius: 8px;
    }
</style>""", unsafe_allow_html=True)

st.title("⚖️ Legal Case Summarizer 🏛️ [India]")
st.subheader("Enhancing Accessibility to Judicial Documents")

uploaded_file = st.file_uploader("📂 Upload a legal PDF or text file", type=["pdf", "txt"])


def extract_text_from_pdf(pdf_file, ocr_dpi=300):
    """Return the text of every page of a PDF, using OCR for image-only pages.

    Args:
        pdf_file: Binary file-like object whose read() returns the PDF bytes.
        ocr_dpi: Resolution used to rasterise pages that have no text layer
            before they are handed to Tesseract.

    Returns:
        The page texts concatenated in page order, each followed by a newline.
    """
    page_texts = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        for page in doc:
            page_text = page.get_text()
            if not page_text.strip():
                # No text layer: rasterise the page and OCR it. The default
                # 72 dpi render is too coarse for reliable recognition.
                pix = page.get_pixmap(dpi=ocr_dpi)
                with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                    page_text = pytesseract.image_to_string(img)
            page_texts.append(page_text + "\n")
    return "".join(page_texts)


def preprocess_text(text):
    """Strip reporter artefacts and repeated lines, returning one line of text.

    Removes "<number> S.C.R." citation runs, "Page N" / "[N]" markers and
    rules made of three or more underscores or hyphens. Every distinct
    non-empty line is kept at most twice (repeated headers and footers), the
    surviving lines are joined with spaces and whitespace runs are collapsed.
    """
    text = re.sub(r"(\b\d{1,4}\s*S\.C\.R\.\s*[_\s]*)+", "", text)
    text = re.sub(r"(Page\s*\d+|\[\d+\])", "", text)
    text = re.sub(r"[_\-]{3,}", "", text)

    occurrences = {}
    kept_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        occurrences[line] = occurrences.get(line, 0) + 1
        if occurrences[line] <= 2:
            kept_lines.append(line)
    return re.sub(r"\s{2,}", " ", " ".join(kept_lines))


@st.cache_resource
def load_led_model(model_name="allenai/led-base-16384"):
    """Load the LED tokenizer and model once per server process.

    Streamlit re-executes this script on every interaction; caching keeps the
    weights in memory instead of reloading them for each upload.
    """
    tokenizer = LEDTokenizer.from_pretrained(model_name)
    model = LEDForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model


def summarize_text(text, tokenizer, model):
    """Summarise text with LED and return the decoded summary string.

    The input is truncated to 16384 tokens. The summary is at most 1024
    tokens long; the minimum length is 1000 tokens but never more than half
    of the input length, so short documents are not padded with filler.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=16384, truncation=True)
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # LED needs global attention on at least the first token; the model
    # documentation recommends exactly this setup for summarisation.
    global_attention_mask = torch.zeros_like(input_ids)
    global_attention_mask[:, 0] = 1

    min_length = min(1000, max(1, input_ids.shape[1] // 2))

    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
        max_length=1024,
        min_length=min_length,
        num_beams=5,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


@st.cache_resource
def load_quantized_llama(model_name="meta-llama/Llama-3.1-8B-Instruct"):
    """Load the 8-bit quantised Llama tokenizer and model once per process."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
        # Passing load_in_8bit directly is deprecated; the quantisation
        # settings belong in a BitsAndBytesConfig.
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )
    return tokenizer, model


def interpret_summary(summary_text, tokenizer, model):
    """Ask the LLM to rewrite a case summary as six plain-language sections.

    Args:
        summary_text: Summary of the judgment to explain.
        tokenizer: Tokenizer matching model.
        model: Causal language model used for generation.

    Returns:
        The generated text from the "1. BACKGROUND OF THE CASE" heading to the
        end of the completion, or a warning message when the completion does
        not contain both the first and the sixth heading.
    """
    prompt = f"""You are an AI trained in Indian legal document summarization. Your task is to analyze the following case summary and provide a clear, structured explanation for a layperson with little legal knowledge.
Ensure that the summary is simple, jargon-free, and directly useful for someone unfamiliar with legal terms.
Include any relevant Indian laws, sections, or precedents where necessary.

Provide answers to each of the following sections clearly:

1. BACKGROUND OF THE CASE
2. KEY LEGAL ISSUES
3. ARGUMENTS FROM BOTH SIDES
4. COURT’S DECISION & REASONING
5. IMPACT AND LESSONS FOR THE PUBLIC
6. IMPORTANT TAKEAWAYS IN SIMPLE TERMS

Respond ONLY with the six sections and their content. Do NOT repeat the case summary or any instructions.

Here is the summary:
{summary_text}
"""
    # Follow the device the model was placed on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    prompt_len = inputs.input_ids.shape[1]
    output_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1600,
        do_sample=True,  # temperature / top_p only apply when sampling
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        no_repeat_ngram_size=7,
    )
    # generate() returns prompt + completion. Decode the completion only:
    # the prompt itself lists the six headings, so matching against the full
    # sequence would return the instructions and the summary as well.
    generated_ids = output_ids[0][prompt_len:]
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Extract cleanly only the structured part
    structured = re.search(
        r"(1\.?\s*BACKGROUND OF THE CASE.*6\.?\s*IMPORTANT TAKEAWAYS IN SIMPLE TERMS.*?)$",
        output,
        re.DOTALL | re.IGNORECASE,
    )
    if structured is None:
        return "⚠️ Could not extract all 6 structured sections. Please try again with a clearer document."
    return structured.group(1).strip()


if uploaded_file:
    with st.spinner("🔍 Processing your document... Please wait."):
        # Compare case-insensitively so "CASE.PDF" is not decoded as text.
        if uploaded_file.name.lower().endswith(".pdf"):
            raw_text = extract_text_from_pdf(uploaded_file)
        else:
            # Tolerate stray non-UTF-8 bytes instead of crashing the app.
            raw_text = uploaded_file.read().decode("utf-8", errors="replace")

        processed_text = preprocess_text(raw_text)
        if not processed_text.strip():
            st.error("No readable text could be extracted from this document.")
            st.stop()

        tokenizer_led, model_led = load_led_model()
        summary = summarize_text(processed_text, tokenizer_led, model_led)

        tokenizer_llama, model_llama = load_quantized_llama()
        structured_summary = interpret_summary(summary, tokenizer_llama, model_llama)

    st.success("✅ Document processed successfully!")
    st.markdown("### 🧾 Structured Summary")
    # The text is model output derived from an uploaded file; escape it before
    # embedding it in markup that is rendered with unsafe_allow_html.
    safe_summary = html.escape(structured_summary)
    st.markdown(f"<div style='background-color: #fff; padding: 20px; border-radius: 10px; color:#111; white-space: pre-wrap'>{safe_summary}</div>", unsafe_allow_html=True)
'''

# Save to app.py
# The app source contains emoji and other non-ASCII characters, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)


In [None]:
from pyngrok import ngrok, conf
import time

# Set your ngrok auth token
ngrok.set_auth_token("")

# Disconnect all existing ngrok tunnels
ngrok.kill() #This line kills all current tunnels of the ngrok process before you start the streamlit session.
# Kill previous Streamlit sessions if any
!fuser -k 8501/tcp || echo "No existing streamlit process"

# Open ngrok tunnel
# Removed the 'config' parameter and directly specified 'proto' and 'addr'
tunnel = ngrok.connect(
    addr="8501",  # Specify the local port
    proto="http"  # or "tcp" if needed
)
print("Streamlit UI available at:", tunnel.public_url)

# Launch Streamlit app in background
!streamlit run app.py &>/dev/null &
time.sleep(3)

8501/tcp:            84437
Streamlit UI available at: https://823a-34-125-167-206.ngrok-free.app
