In [3]:
!pip install newspaper3k transformers gradio
!pip install langdetect

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting gradio
  Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegme

In [5]:
from newspaper import Article
from transformers import pipeline
from langdetect import detect
import gradio as gr

# Load the translation and summarization pipelines
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")  # Supports multiple languages to English
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_tokens=300):
    words = text.split()
    chunks = []
    current_chunk = []

    # Split words into chunks of approximately max_tokens
    for word in words:
        if len(current_chunk) + len(word.split()) <= max_tokens:
            current_chunk.append(word)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def fetch_detect_translate_summarize(url):
    try:
        # Fetch the article
        article = Article(url)
        article.download()
        article.parse()
        text = article.text

        # Check if article text is empty
        if not text:
            return "Error: No text found in the article.", "The article may be empty or couldn't be parsed."

        # Detect language
        language = detect(text)
        print(f"Detected language: {language}")

        # Translate if the language is not English
        if language != 'en':
            print("Translating text to English...")
            chunks = split_text(text, max_tokens=300)
            translated_chunks = []

            for chunk in chunks:
                translation = translator(chunk, max_length=450, truncation=True)
                translated_chunks.append(translation[0]['translation_text'])

            english_text = " ".join(translated_chunks)
        else:
            english_text = text

        # Summarize the (translated) English text
        summary = summarizer(english_text, max_length=150, min_length=30, do_sample=False, truncation=True)

        if not summary or len(summary) == 0:
            return "Error: Summarization failed.", "No summary returned."

        summarized_text = summary[0]['summary_text']

        return english_text, summarized_text
    except Exception as e:
        print("Error details:", str(e))  # Print the error details
        return "Error fetching or processing the article.", str(e)

# Gradio Interface with customized labels
app = gr.Interface(
    fn=fetch_detect_translate_summarize,
    inputs=gr.Textbox(label="News URL"),
    outputs=[
        gr.Textbox(label="Full English Translation"),
        gr.Textbox(label="Summary")
    ],
    title="Multilingual News Translator and Summarizer",
    description="Enter a news URL in any language. This tool will detect the language, translate it to English if needed, and provide a summary."
)

# Launch the app
app.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f553e0b6b1e178a091.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


