In [None]:
# Install required packages
!pip install gradio langchain pypdf transformers torch
!pip install langchain-community

Collecting gradio
  Downloading gradio-5.16.0-py3-none-any.whl.metadata (16 kB)
Collecting pypdf
  Downloading pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.0 (from gradio)
  Downloading gradio_client-1.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downl

In [None]:
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline  # Fixed import
import torch
import tempfile
import os
import re

# Hugging Face model identifier passed to `from_pretrained` in load_model().
# Described by the author as a legal-specific model (a fine-tuned T5
# summarizer, going by its name) -- NOTE(review): confirm the training domain.
MODEL_NAME = "manjunathainti/fine_tuned_t5_summarizer"
# Upper bound on the size of one summarization chunk; same value as the limit
# summarize_legal_text applies before splitting a long input.
MAX_CHUNK_LENGTH = 4096
# process_pdf skips any section whose stripped content is not longer than this
# many characters.
MIN_CHUNK_LENGTH = 128

# (tokenizer, model) pairs that have already been loaded, keyed by model name.
# process_pdf calls load_model() for every uploaded document; without this
# cache the weights would be re-read on each request.
_LOADED_MODELS = {}


def load_model():
    """Return the tokenizer and seq2seq model named by ``MODEL_NAME``.

    The pair is loaded on the first call and reused on later calls within the
    same kernel session. The model is moved to CUDA when available, otherwise
    it stays on the CPU.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    cached = _LOADED_MODELS.get(MODEL_NAME)
    if cached is not None:
        return cached

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        # Half precision only on GPU; the CPU path keeps full float32.
        torch_dtype=torch.float32 if device == "cpu" else torch.float16,
        low_cpu_mem_usage=True
    ).to(device)

    _LOADED_MODELS[MODEL_NAME] = (tokenizer, model)
    return tokenizer, model

def preprocess_legal_text(text):
    """Normalize extracted document text before it is split into sections.

    Three line-oriented clean-ups are applied in order:

    1. Lines that contain only digits (optionally padded with spaces or tabs)
       are dropped, including such a line at the very end of the text with no
       trailing newline. These are treated as page numbers.
    2. Runs of three or more newlines collapse to a single blank line.
    3. Indentation of two or more spaces/tabs in front of a numbered item such
       as ``12.`` is normalized to exactly two spaces.

    Args:
        text: Raw document text as a ``str``.

    Returns:
        str: The cleaned text.
    """
    # Remove bare page-number lines. `(?:\n|\Z)` also catches a number on the
    # last line, which has no newline after it.
    text = re.sub(r'^[ \t]*\d+[ \t]*(?:\n|\Z)', '', text, flags=re.MULTILINE)

    # Clean up whitespace while preserving paragraph structure
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Normalize indentation before legal numbering. `[ \t]` is used instead of
    # `\s` because `\s` also matches newlines, which would consume the blank
    # line separating the numbered item from the previous paragraph.
    text = re.sub(r'^[ \t]{2,}(?=\d+\.)', '  ', text, flags=re.MULTILINE)

    return text

def split_into_legal_sections(text):
    """Split document text into sections at lines containing known markers.

    The text is scanned line by line. A line containing a primary or secondary
    marker (as a whole word or phrase, case-sensitive) starts a new section
    whose title is that line, stripped. Primary markers take precedence when a
    line contains both kinds. Lines before the first marker form a section
    titled ``'General'``.

    Each section's ``'level'`` is the level of the marker that *opened* it;
    the leading ``'General'`` section is reported as ``'secondary'``.

    Args:
        text: Document text as a ``str``.

    Returns:
        list[dict]: One dict per section with keys ``'title'``, ``'content'``
        (the section's lines joined with newlines, marker line included) and
        ``'level'`` (``'primary'`` or ``'secondary'``).
    """
    section_markers = {
        'primary': [
            'BILL',
            'BE it enacted',
            'STATEMENT OF OBJECTS AND REASONS',
            'ANNEXURE'
        ],
        'secondary': [
            'CHAPTER',
            'PART',
            'Article',
            'Section',
            'Amendment of',
            'Insertion of'
        ]
    }

    # One pattern per level. Word boundaries stop a marker from firing inside a
    # longer word (e.g. 'PART' inside 'DEPARTMENT').
    marker_patterns = {
        level: re.compile('|'.join(r'\b' + re.escape(marker) + r'\b' for marker in markers))
        for level, markers in section_markers.items()
    }

    sections = []
    current_section = []
    current_title = None
    current_level = 'secondary'

    for line in text.split('\n'):
        # Primary is tested first so it wins when both levels match.
        line_level = next(
            (level for level in ('primary', 'secondary')
             if marker_patterns[level].search(line)),
            None
        )

        if line_level is None:
            current_section.append(line)
            continue

        # A marker line closes the section collected so far, labelled with the
        # level it was opened with, and then opens a new one.
        if current_section:
            sections.append({
                'title': current_title or 'General',
                'content': '\n'.join(current_section),
                'level': current_level
            })
        current_section = [line]
        current_title = line.strip()
        current_level = line_level

    if current_section:
        sections.append({
            'title': current_title or 'General',
            'content': '\n'.join(current_section),
            'level': current_level
        })

    return sections

def summarize_legal_text(text, tokenizer, model, max_chunk_tokens=None):
    """Summarize one section of text with a Hugging Face summarization pipeline.

    The text is measured in tokenizer tokens. If it fits within the chunk
    limit, it is summarized in a single call. Otherwise it is split into
    consecutive token windows, each window is summarized separately, and the
    partial summaries are joined with single spaces. The instruction prefix is
    prepended to every chunk, and the window size leaves room for it.

    Args:
        text: Section text to summarize.
        tokenizer: Tokenizer matching ``model``; must provide ``encode`` and
            ``decode``.
        model: The seq2seq model; the pipeline runs on the device holding its
            parameters.
        max_chunk_tokens: Maximum number of tokens (instruction included) sent
            to the model in one call. Defaults to ``MAX_CHUNK_LENGTH``.

    Returns:
        str: The summary, or a fixed error message if summarization raised.
    """
    if max_chunk_tokens is None:
        max_chunk_tokens = MAX_CHUNK_LENGTH

    device = next(model.parameters()).device

    summarizer = pipeline(
        "summarization",
        model=model,
        tokenizer=tokenizer,
        device=device,
        max_length=200,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    instruction = "Summarize the following legal text while preserving key details and references: "

    try:
        # Budget the window in tokens -- the same unit the limit is expressed
        # in -- and reserve space for the instruction added to every chunk.
        instruction_len = len(tokenizer.encode(instruction, add_special_tokens=False))
        window = max(1, max_chunk_tokens - instruction_len)

        token_ids = tokenizer.encode(text, add_special_tokens=False)
        if len(token_ids) > window:
            pieces = [
                tokenizer.decode(token_ids[start:start + window], skip_special_tokens=True)
                for start in range(0, len(token_ids), window)
            ]
        else:
            # Short input: pass the original string through untouched rather
            # than a decode(encode(...)) round trip.
            pieces = [text]

        summaries = [
            summarizer(instruction + piece, max_length=200, min_length=50)[0]['summary_text']
            for piece in pieces
        ]
        return " ".join(summaries)
    except Exception as e:
        print(f"Error in summarization: {str(e)}")
        return "Error generating summary for this section."



def process_pdf(file_obj, progress=gr.Progress()):
    """Summarize an uploaded PDF section by section.

    The upload is copied to a temporary ``.pdf`` file, its text is extracted
    page by page, cleaned with ``preprocess_legal_text`` and split with
    ``split_into_legal_sections``. Every section whose stripped content is
    longer than ``MIN_CHUNK_LENGTH`` characters is summarized and emitted under
    a heading (``== title ==`` for primary sections, ``-- title --`` for
    secondary ones). The temporary file is always removed before returning.

    Args:
        file_obj: The uploaded file: either an object with a ``name``
            attribute holding the path, or the path itself. ``None`` means
            nothing was uploaded.
        progress: Gradio progress tracker, called with a fraction and a
            description as the work advances.

    Returns:
        str: The combined summary, or a human-readable message when no file
        was given, no summary could be produced, or an error occurred.
    """
    if file_obj is None:
        return "Please upload a PDF file."

    file_path = None
    try:
        progress(0.1, desc="Loading document...")

        # Copy the upload into a temporary file. The path is recorded before
        # any reading so the `finally` below can clean up even if the copy fails.
        source_path = file_obj.name if hasattr(file_obj, 'name') else file_obj
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            file_path = tmp_file.name
            with open(source_path, 'rb') as f:
                tmp_file.write(f.read())

        progress(0.2, desc="Loading legal AI model...")
        tokenizer, model = load_model()

        progress(0.3, desc="Extracting text...")
        # load() yields one document per page. load_and_split() would run an
        # extra text splitter first, and pieces produced with overlap would
        # duplicate text once re-joined.
        pages = PyPDFLoader(file_path).load()
        # Join pages with a newline so the line-anchored clean-up and marker
        # detection see page boundaries as line boundaries.
        text = "\n".join(page.page_content for page in pages)

        progress(0.4, desc="Processing legal text...")
        text = preprocess_legal_text(text)
        sections = split_into_legal_sections(text)

        progress(0.5, desc="Generating detailed summary...")
        section_summaries = []

        for i, section in enumerate(sections):
            if len(section['content'].strip()) > MIN_CHUNK_LENGTH:
                # Format based on section level
                if section['level'] == 'primary':
                    section_summaries.append(f"\n\n== {section['title']} ==\n")
                else:
                    section_summaries.append(f"\n-- {section['title']} --\n")

                summary = summarize_legal_text(section['content'], tokenizer, model)
                section_summaries.append(summary)

            # Summarization covers the second half of the progress bar.
            progress((0.5 + (0.5 * (i + 1) / len(sections))),
                    desc=f"Processing section {i+1}/{len(sections)}")

        final_summary = "\n".join(section_summaries)
        if not final_summary.strip():
            return "Could not generate summary. Please check if the PDF contains readable text."

        return final_summary

    except Exception as e:
        return f"An error occurred: {str(e)}"
    finally:
        # Best-effort removal of the temporary copy on every exit path.
        if file_path is not None:
            try:
                os.unlink(file_path)
            except OSError:
                pass

def main():
    """Assemble the Gradio interface around ``process_pdf`` and launch it.

    The UI has one PDF upload input and one multi-line text output. It is
    launched with ``debug=True`` and ``share=True``.
    """
    pdf_upload = gr.File(label="Upload Legal Document (PDF)", file_types=[".pdf"])
    summary_box = gr.Textbox(label="Legal Document Summary", lines=25)

    # The continuation-line indentation is part of the displayed string.
    blurb = """Upload a legal document (PDF) to generate a comprehensive summary.
                      This tool is specifically optimized for legal texts including bills,
                      acts, and constitutional amendments."""

    app = gr.Interface(
        fn=process_pdf,
        inputs=pdf_upload,
        outputs=summary_box,
        title="Legal Document Summarizer",
        description=blurb,
        examples=[],
        cache_examples=False,
        theme=gr.themes.Soft(),
    )

    app.launch(debug=True, share=True)

# Start the app only when this code runs as the main program (which includes
# executing the notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ead2fb7a497543d0d3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 200, but your input_length is only 184. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=92)
Device set to use cpu
Device set to use cpu
Device set to use cpu
Your max_length is set to 200, but your input_length is only 157. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=78)
Device set to use cpu
Your max_length is set to 200, but your input_length is only 65. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
