In [None]:
import pandas as pd
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from tqdm import tqdm  # Progress bar

# Decide where inference runs: CUDA when PyTorch reports a usable GPU, else CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint identifier handed to from_pretrained for both tokenizer and model
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Load the seq2seq model and place its weights on the chosen device in one step
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# Summarization Function
def summarize_with_pegasus(text, max_input_tokens=512, max_length=80, min_length=30,
                           length_penalty=2.0, num_beams=4):
    """Summarize one piece of text with the module-level Pegasus model.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to summarize. Anything that is not a non-blank ``str``
            (e.g. ``None`` or a NaN float from a pandas column) yields ``""``.
        max_input_tokens: Token cap passed to the tokenizer; longer inputs are
            truncated to this length.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        length_penalty: ``length_penalty`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        The decoded summary string, or ``""`` when the input is unusable or
        summarization raised an exception (the error is printed, not raised,
        so a single bad row does not abort a long batch run).
    """
    # Guard: skip non-strings and blank/whitespace-only strings without touching the model
    if not isinstance(text, str) or len(text.strip()) == 0:
        return ""
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_tokens)
        # The tokenizer returns tensors that must be moved to the model's device
        inputs = {key: value.to(device) for key, value in inputs.items()}
        summary_ids = model.generate(
            **inputs,
            max_length=max_length,
            min_length=min_length,
            length_penalty=length_penalty,
            num_beams=num_beams,
        )
        # generate() returns a batch; only one text was submitted, so take row 0
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        # Deliberate best-effort: report the failure and leave this row's summary empty
        print(f"Error summarizing text: {e}")
        return ""

# Read the input articles (edit the filename here if your data lives elsewhere)
df = pd.read_csv("combineddatasummary1000P.csv")

# Register progress_apply on pandas objects so the run below shows a progress bar
tqdm.pandas()

# Summarize each entry of the 'content' column, then attach the results as a new column
pegasus_summaries = df['content'].progress_apply(summarize_with_pegasus)
df['pegasus_summary'] = pegasus_summaries

# Persist the frame, now including the summaries, without writing the index
df.to_csv("news_summaries_pegasus.csv", index=False)
print("✅ Pegasus summarization done! Saved to 'news_summaries_pegasus.csv'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]


  0%|          | 0/999 [00:00<?, ?it/s][A
  0%|          | 2/999 [00:38<5:17:18, 19.10s/it][A
  0%|          | 3/999 [00:59<5:31:20, 19.96s/it][A
  0%|          | 4/999 [01:17<5:21:24, 19.38s/it][A
  1%|          | 5/999 [01:37<5:22:10, 19.45s/it][A
  1%|          | 6/999 [01:54<5:10:45, 18.78s/it][A
  1%|          | 7/999 [02:27<6:24:21, 23.25s/it][A
  1%|          | 8/999 [02:44<5:51:49, 21.30s/it][A
  1%|          | 9/999 [03:02<5:32:22, 20.14s/it][A
  1%|          | 10/999 [03:20<5:21:55, 19.53s/it][A
  1%|          | 11/999 [03:52<6:24:06, 23.33s/it][A
  1%|          | 12/999 [04:23<7:01:45, 25.64s/it][A
  1%|▏         | 13/999 [04:42<6:29:55, 23.73s/it][A
  1%|▏         | 14/999 [05:03<6:18:04, 23.03s/it][A
  2%|▏         | 15/999 [05:17<5:29:21, 20.08s/it][A
  2%|▏         | 16/999 [05:33<5:12:17, 19.06s/it][A
  2%|▏         | 17/999 [05:47<4:44:04, 17.36s/it][A
  2%|▏         | 18/999 [06:12<5:22:37, 19.73s/it][A
  2%|▏         | 19/999 [06:25<4:49:16, 17.71s

✅ Pegasus summarization done! Saved to 'news_summaries_pegasus.csv'





In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=0792de486a2182d3951a6b94566daaf09eab8ceae6f43f42592d85f18c7745f2
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import pandas as pd
import evaluate


# Check columns
print(df.columns)

# Score only rows that actually have a reference summary. A missing
# 'description' is NaN, and .astype(str) would turn it into the literal text
# "nan", which would then be scored as if it were a real reference.
eval_df = df[df['description'].notna()]
skipped_rows = len(df) - len(eval_df)
if skipped_rows:
    print(f"Skipping {skipped_rows} rows with no reference description")
if eval_df.empty:
    raise ValueError("No rows with a reference 'description' to evaluate against")

# Columns are 'description' (reference) and 'pegasus_summary' (prediction)
references = eval_df['description'].astype(str).tolist()
# A missing prediction is kept and scored as an empty summary rather than as
# the text "nan", so failed rows still count against the model.
predictions = eval_df['pegasus_summary'].fillna("").astype(str).tolist()

# Load ROUGE
rouge = evaluate.load('rouge')

# Compute ROUGE
results = rouge.compute(predictions=predictions, references=references)

# Print results
print("ROUGE-1: {:.4f}".format(results['rouge1']))
print("ROUGE-2: {:.4f}".format(results['rouge2']))
print("ROUGE-L: {:.4f}".format(results['rougeL']))


Index(['headlines', 'description', 'content', 'url', 'category',
       'pegasus_summary'],
      dtype='object')
ROUGE-1: 0.2390
ROUGE-2: 0.0645
ROUGE-L: 0.1730
