In [1]:
# Install the notebook's dependencies into the kernel's environment.
# lxml_html_clean is listed explicitly because recent lxml releases moved
# `lxml.html.clean` (imported by newspaper3k) into that separate package.
# NOTE(review): versions are unpinned — pin them for reproducible runs.
%pip install -q transformers gradio newspaper3k PyMuPDF lxml_html_clean

Note: you may need to restart the kernel to use updated packages.


In [1]:
import gradio as gr
from transformers import pipeline
from newspaper import Article
import fitz  # PyMuPDF

In [2]:
# Load the summarization pipeline once; every request handled by the UI
# reuses this single `summarizer` object.
SUMMARIZATION_MODEL = "facebook/bart-large-cnn"

try:
    summarizer = pipeline("summarization", model=SUMMARIZATION_MODEL)
except Exception as e:
    # Chain explicitly so the original download/initialisation error stays
    # attached as __cause__ of the friendlier RuntimeError.
    raise RuntimeError(f"❌ Failed to load summarization model: {e}") from e

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [3]:
# Helper Functions
def extract_text_from_url(url):
    """Download the article at ``url`` and return its body text.

    Args:
        url: Address of the article. Surrounding whitespace is ignored and
            the scheme must be ``http://`` or ``https://`` (case-insensitive).

    Returns:
        The extracted article text on success. On any failure a user-facing
        message starting with "❌" is returned instead; this function never
        raises.
    """
    try:
        # Non-string input (e.g. None) is treated like an empty URL so the
        # user gets the format hint rather than an attribute error.
        url = url.strip() if isinstance(url, str) else ""

        # Require a real scheme prefix: a bare "http" check would also accept
        # strings such as "httpfoo".
        if not url.lower().startswith(("http://", "https://")):
            return "❌ Invalid URL format. Please include http or https."

        article = Article(url)
        article.download()
        article.parse()

        body = article.text
        if not body.strip():
            return "❌ No text found in the article."
        return body
    except Exception as e:
        # Network and parsing problems are reported as text for the UI.
        return f"❌ Error extracting from URL: {e}"

def extract_text_from_pdf(file):
    """Extract the text of every page of an uploaded PDF.

    Args:
        file: The upload, given either as a filesystem path string or as a
            file-like object exposing the path as ``.name`` (and, optionally,
            the content through ``.read()``).

    Returns:
        The concatenated page text on success. On any failure a user-facing
        message starting with "❌" is returned instead; this function never
        raises.
    """
    try:
        # Uploads may arrive as a plain path string or as a temp-file wrapper
        # carrying the path in `.name`, depending on the Gradio version.
        path = file if isinstance(file, str) else getattr(file, "name", None)

        # Extension check is case-insensitive so "REPORT.PDF" is accepted.
        if not isinstance(path, str) or not path.lower().endswith(".pdf"):
            return "❌ Invalid file format. Only PDFs are supported."

        if hasattr(file, "read"):
            # File-like upload: parse the in-memory bytes.
            doc = fitz.open(stream=file.read(), filetype="pdf")
        else:
            # Path upload: let PyMuPDF open the file itself.
            doc = fitz.open(path)

        # The context manager closes the document even if a page fails.
        with doc:
            text = "".join(page.get_text() for page in doc)

        if not text.strip():
            return "❌ No text found in PDF."
        return text
    except Exception as e:
        return f"❌ Error reading PDF: {e}"

In [5]:
# Main summarization logic
def summarize_input(text, url, file, word_length):
    """Summarize content taken from a PDF upload, a URL, or pasted text.

    Source priority is PDF first, then URL, then the text box.

    Args:
        text: Text pasted by the user; used only when no file or URL is given.
        url: Optional article URL.
        file: Optional uploaded PDF, passed through to ``extract_text_from_pdf``.
        word_length: Desired summary length in words.

    Returns:
        A Markdown string holding either the summary with some metadata, a
        "⚠️" hint when there is too little content, or a "❌" error message.
    """
    source = "Textbox"

    try:
        if file is not None:
            text = extract_text_from_pdf(file)
            source = "PDF"
        elif url and url.strip():
            text = extract_text_from_url(url.strip())
            source = "URL"
    except Exception as e:
        return f"❌ Failed to extract input: {str(e)}"

    # The extractors report failure by returning a message that starts with
    # "❌". Only their output is checked: pasted text that merely contains
    # that character is legitimate content, not an error.
    if source != "Textbox" and isinstance(text, str) and text.startswith("❌"):
        return text

    # Empty or missing input falls through to the length hint instead of
    # producing a blank output.
    content = (text or "").strip()
    if len(content) < 30:
        return (
            f"⚠️ Please provide at least 30 characters of valid content.\n\n"
            f"📝 Current Input Length: {len(content)} characters."
        )

    # Estimate token length from word count (approx. 1 word ≈ 1.3 tokens).
    # The maximum is capped at 1024 tokens, and the minimum is kept strictly
    # below the maximum so small word counts cannot produce an invalid range.
    estimated_max = max(2, min(int(word_length * 1.3), 1024))
    estimated_min = min(max(int(word_length * 0.5), 20), estimated_max - 1)

    try:
        summary_result = summarizer(
            text,
            max_length=estimated_max,
            min_length=estimated_min,
            do_sample=False,
            # Inputs longer than the model's context window are cut off
            # rather than making the call fail.
            truncation=True,
        )
        summary = summary_result[0]['summary_text']
    except Exception as e:
        return f"❌ Summarization failed: {str(e)}"

    return (
        f"✅ **Source**: {source}\n"
        f"🧾 **Input Length**: {len(content)} characters\n"
        f"📏 **Target Summary Length**: {word_length} words "
        f"(≈ {estimated_min}-{estimated_max} tokens)\n\n"
        f"### 🧠 Summary:\n{summary}"
    )

In [6]:

# Build UI
# Statement order inside the `gr.Blocks` context determines the page layout,
# so components are declared in the order they should appear.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 English Text Summarizer (Text / URL / PDF)")

    # The three alternative inputs share one row. `summarize_input` uses the
    # PDF if present, otherwise the URL, otherwise the pasted text.
    with gr.Row():
        text_input = gr.Textbox(label="📄 Paste Text (Optional)", lines=10, placeholder="Type or paste up to 5000 characters...")
        url_input = gr.Textbox(label="🔗 URL (Optional)", placeholder="https://example.com/article")
        file_input = gr.File(label="📎 Upload PDF (Optional)", file_types=[".pdf"])

    # Desired summary length in words; `summarize_input` converts it to a
    # token budget capped at 1024 tokens.
    word_slider = gr.Slider(
        minimum=30,
        maximum=790,
        value=80,
        step=10,
        label="📝 Desired Summary Length (Words)",
        info="Max supported summary length ≈ 790 words (1024 tokens)."
    )

    summarize_button = gr.Button("🚀 Summarize")
    output = gr.Markdown(elem_id="summary-output")

    # Clicking the button passes the four input values, in this order, to
    # `summarize_input` and renders the returned Markdown string.
    summarize_button.click(
        fn=summarize_input,
        inputs=[text_input, url_input, file_input, word_slider],
        outputs=output
    )

# Start the local web server for the app.
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


