In [1]:
!pip install transformers rouge-score datasets nltk --quiet

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import kagglehub
import shutil
import os

# Download to default location
original_path = kagglehub.dataset_download("gowrishankarp/newspaper-text-summarization-cnn-dailymail")

# Copy to /content/
target_path = "/content/data"
if not os.path.exists(target_path):
    shutil.copytree(original_path, target_path)

print("✅ Dataset is now available at:", target_path)


✅ Dataset is now available at: /content/data


In [10]:
import pandas as pd
import re
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge_score import rouge_scorer

# Text cleaner
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[^a-zA-Z0-9.,!?\'\s]', '', text)
    return text.strip()

# BART Summarizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

def bart_summarize(text, max_length=60, min_length=20):
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=max_length, min_length=min_length, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# ROUGE
def compute_rouge(ref, pred):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(ref, pred)
    return {k: round(v.fmeasure, 4) for k, v in scores.items()}


In [11]:
# Load and sample data
train_df = pd.read_csv(f"{target_path}/cnn_dailymail/train.csv")[['article', 'highlights']].dropna().sample(2, random_state=1)
test_df = pd.read_csv(f"{target_path}/cnn_dailymail/test.csv")[['article', 'highlights']].dropna().sample(2, random_state=2)

# Clean articles
train_df['clean_article'] = train_df['article'].apply(clean_text)
test_df['clean_article'] = test_df['article'].apply(clean_text)


In [12]:
def evaluate_and_print(df, label):
    print(f"\n\n========== {label.upper()} SET ==========")
    for i, row in df.iterrows():
        cleaned = row['clean_article']
        reference = row['highlights']
        prediction = bart_summarize(cleaned)

        print(f"\n--- Sample {i+1} ---")
        print("📰 Article (truncated):\n", cleaned[:400], "...\n")
        print("✅ Human Summary:\n", reference)
        print("🤖 BART Summary:\n", prediction)
        print("📊 ROUGE:", compute_rouge(reference, prediction))

# Evaluate both sets
evaluate_and_print(train_df, "train")
evaluate_and_print(test_df, "test")





--- Sample 121640 ---
📰 Article (truncated):
 a russian father who fed parrots, guinea pigs, cats and puppies to his pet snake and filmed the gruesome footage is being hunted by police. fatheroftwo andrei generalov, 32, from st. petersburg in northwestern russia, started uploading videos of his boa constrictor 'king' devouring rats and hamsters. but as the videos rose in popularity, he began taking suggestions from viewers who demanded more ' ...

✅ Human Summary:
 Andrei Generalov, 32, uploads videos of his Boa Constrictor eating animals .
Feeds his snake called 'King' parrots, guinea pigs, cats and even puppies .
Boas have jaws lined with hooked teeth for grabbing and holding prey .
They wrap their bodies around their victim, squeezing it until it suffocates .
Petition started to take him to court and he is now being hunted by police .
🤖 BART Summary:
  andrei generalov, 32, from st. petersburg, russia, started uploading videos of his boa constrictor 'king' devouring rats and hams

In [13]:
!pip install gradio transformers --quiet


In [14]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Load pretrained BART
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Summarization function
def bart_summarize(text, max_length=60, min_length=20):
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=max_length, min_length=min_length, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [15]:
import gradio as gr

# Gradio Interface
interface = gr.Interface(
    fn=bart_summarize,
    inputs=gr.Textbox(lines=15, label="Enter Article"),
    outputs=gr.Textbox(lines=6, label="BART Summary"),
    title="📰 Text Summarizer with BART",
    description="Paste a news article or paragraph. The model will return a concise summary using Facebook's BART-Large-CNN."
)

# Launch
interface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://10ef7f6dd04e10994d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


