In [1]:
# Mount Google Drive at /content/drive. This cell only works inside the Colab
# runtime, which provides the `google.colab` package.
# NOTE(review): the checkpoints loaded later in this notebook use root-level
# paths such as "/best_bert_model.pt" rather than paths under this mount --
# confirm whether the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets bert-score rouge-score nltk

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━

In [3]:
# Download the "punkt_tab" tokenizer data used for sentence splitting with
# nltk. The packages themselves are installed by the dedicated install cell
# earlier in the notebook, so the install command is not repeated here.
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, BartTokenizer, BartModel
from transformers import T5Tokenizer, T5EncoderModel, GPT2Tokenizer, GPT2Model
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from collections import OrderedDict

# === Sentence Splitter ===
def split_into_sentences(text):
    """Split ``text`` into sentences.

    Thin wrapper that delegates to NLTK's ``sent_tokenize`` and returns its
    result unchanged.
    """
    sentence_list = sent_tokenize(text)
    return sentence_list

# === Tokenizer Helper ===
def tokenize_sentences(sentences, tokenizer, max_length=128):
    """Tokenize a batch of sentences into fixed-length PyTorch tensors.

    Args:
        sentences: Batch of sentence strings handed to ``tokenizer``.
        tokenizer: Callable tokenizer; it is invoked with the keyword
            arguments ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors``.
        max_length: Length every sentence is padded or truncated to.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        sentences,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# === BERT Model ===
class BertForExtractiveSummarization(nn.Module):
    """BERT backbone with a one-unit linear head that scores each input sequence."""

    def __init__(self, pretrained_model="bert-base-uncased"):
        """Load the pretrained BERT backbone and create the scoring head."""
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per sequence, computed from the hidden state at position 0.

        Both arguments are forwarded unchanged to the backbone
        (presumably shaped (num_sequences, seq_len) -- TODO confirm).
        """
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        first_token_state = hidden_states[:, 0]
        # The head emits a trailing dimension of size 1; drop it.
        return self.classifier(first_token_state).squeeze(-1)

# === RoBERTa Model ===
class RobertaForExtractiveSummarization(nn.Module):
    """RoBERTa backbone with a one-unit linear head that scores each input sequence."""

    def __init__(self, pretrained_model="roberta-base"):
        """Load the pretrained RoBERTa backbone and create the scoring head."""
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_model)
        hidden_dim = self.roberta.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per sequence, computed from the hidden state at position 0.

        Both arguments are forwarded unchanged to the backbone
        (presumably shaped (num_sequences, seq_len) -- TODO confirm).
        """
        hidden_states = self.roberta(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        first_token_state = hidden_states[:, 0]
        # The head emits a trailing dimension of size 1; drop it.
        return self.classifier(first_token_state).squeeze(-1)

# === BART Model ===
class BartForExtractiveSummarization(nn.Module):
    """BART backbone with a one-unit linear head that scores each input sequence.

    NOTE(review): ``BartModel`` is called without explicit decoder inputs, so
    ``last_hidden_state`` is presumably the decoder output -- confirm that
    position 0 is the intended pooling position.
    """

    def __init__(self, pretrained_model="facebook/bart-base"):
        """Load the pretrained BART backbone and create the scoring head."""
        super().__init__()
        self.bart = BartModel.from_pretrained(pretrained_model)
        hidden_dim = self.bart.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per sequence, computed from the hidden state at position 0.

        Both arguments are forwarded unchanged to the backbone
        (presumably shaped (num_sequences, seq_len) -- TODO confirm).
        """
        hidden_states = self.bart(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        first_token_state = hidden_states[:, 0]
        # The head emits a trailing dimension of size 1; drop it.
        return self.classifier(first_token_state).squeeze(-1)

# === FLAN-T5-Small Model ===
class FlanT5ForExtractiveSummarization(nn.Module):
    """FLAN-T5 encoder with a one-unit linear head that scores each input sequence."""

    def __init__(self, pretrained_model="google/flan-t5-small"):
        """Load the pretrained T5 encoder and create the scoring head."""
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(pretrained_model)
        # T5 configs expose the hidden width as ``d_model``.
        model_dim = self.encoder.config.d_model
        self.classifier = nn.Linear(model_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per sequence, computed from the hidden state at position 0.

        Both arguments are forwarded unchanged to the encoder
        (presumably shaped (num_sequences, seq_len) -- TODO confirm).
        """
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        first_token_state = hidden_states[:, 0]
        # The head emits a trailing dimension of size 1; drop it.
        return self.classifier(first_token_state).squeeze(-1)

# === GPT-2 Model ===
class GPT2ForExtractiveSummarization(nn.Module):
    """GPT-2 backbone with a one-unit linear head that scores each input sequence.

    NOTE(review): GPT-2 attention is presumably causal, in which case the
    hidden state at position 0 cannot see later tokens -- confirm that
    first-token pooling is what the fine-tuned checkpoint was trained with
    before changing it.
    """

    def __init__(self, pretrained_model="gpt2"):
        """Load the pretrained GPT-2 backbone and create the scoring head."""
        super().__init__()
        self.encoder = GPT2Model.from_pretrained(pretrained_model)
        hidden_dim = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per sequence, computed from the hidden state at position 0.

        Both arguments are forwarded unchanged to the backbone
        (presumably shaped (num_sequences, seq_len) -- TODO confirm).
        """
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Pool on the first token of every sequence.
        first_token_state = hidden_states[:, 0]
        # The head emits a trailing dimension of size 1; drop it.
        return self.classifier(first_token_state).squeeze(-1)


# === Model Inference Helper ===
def generate_summary(model, tokenizer, sentences, device, top_k=3):
    """Build an extractive summary from the ``top_k`` highest-scoring sentences.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            one score per sentence.
        tokenizer: Tokenizer forwarded to ``tokenize_sentences``.
        sentences: Sequence of sentence strings. It is never modified.
        device: Device the tokenized tensors are moved to before scoring.
        top_k: Maximum number of sentences to keep. Defaults to 3, the value
            that was previously hard-coded.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score. Returns ``""`` when there are no sentences, when
        ``top_k`` is not positive, or when the model does not return exactly
        one score per sentence.
    """
    # Work on a private copy. The previous implementation padded the caller's
    # list in place with "" entries; the blanks leaked into every later call
    # that shared the list and could themselves be chosen as summary text.
    candidates = list(sentences)
    if not candidates or top_k <= 0:
        return ""

    tokenized = tokenize_sentences(candidates, tokenizer)
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)

    with torch.no_grad():
        # Flatten so that a single sentence still yields a 1-element vector
        # instead of a 0-dim scalar, then move the scores to plain floats.
        scores = model(input_ids, attention_mask).reshape(-1).tolist()

    if len(scores) != len(candidates):
        return ""

    # A stable descending sort on floats keeps the earlier sentence first on
    # ties and avoids comparing tensors element by element.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return " ".join(candidates[i] for i in ranked[:top_k])

# === Main Execution ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Index of the CNN/DailyMail test example that every model summarizes.
SAMPLE_INDEX = 69


def _load_finetuned(model_cls, tokenizer_cls, pretrained_name, checkpoint_path, device):
    """Build ``model_cls()``, load its fine-tuned weights, and return ``(model, tokenizer)``.

    The model is moved to ``device`` and put in eval mode. The tokenizer is
    created with ``tokenizer_cls.from_pretrained(pretrained_name)``.
    """
    model = model_cls().to(device)
    # NOTE(security): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    # The checkpoint is read onto the CPU and released when this function
    # returns; load_state_dict copies the values into the model's own
    # parameters, so no second copy of the weights stays on the accelerator.
    state = torch.load(checkpoint_path, map_location="cpu")
    # Drop any "module." prefix from the keys (presumably left by a
    # DataParallel wrapper at training time -- TODO confirm).
    model.load_state_dict({k.replace("module.", ""): v for k, v in state.items()})
    model.eval()
    tokenizer = tokenizer_cls.from_pretrained(pretrained_name)
    return model, tokenizer


# Load the sample article
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
sample = dataset[SAMPLE_INDEX]
article = sample["article"]
reference_summary = sample["highlights"]
sentences = split_into_sentences(article)

# === Load BERT Model ===
bert_model, bert_tokenizer = _load_finetuned(
    BertForExtractiveSummarization, BertTokenizer,
    "bert-base-uncased", "/best_bert_model.pt", device,
)
bert_summary = generate_summary(bert_model, bert_tokenizer, sentences, device)

# === Load RoBERTa Model ===
roberta_model, roberta_tokenizer = _load_finetuned(
    RobertaForExtractiveSummarization, RobertaTokenizer,
    "roberta-base", "/best_roberta_cnn_dailymail_model.pt", device,
)
roberta_summary = generate_summary(roberta_model, roberta_tokenizer, sentences, device)

# === Load BART Model ===
bart_model, bart_tokenizer = _load_finetuned(
    BartForExtractiveSummarization, BartTokenizer,
    "facebook/bart-base", "/best_bart_cnn_dailymail_model.pt", device,
)
bart_summary = generate_summary(bart_model, bart_tokenizer, sentences, device)

# === Load FLAN-T5-Small Model ===
flan_model, flan_tokenizer = _load_finetuned(
    FlanT5ForExtractiveSummarization, T5Tokenizer,
    "google/flan-t5-small", "/best_flan_t5_cnn_dailymail_model.pt", device,
)
flan_summary = generate_summary(flan_model, flan_tokenizer, sentences, device)

# === Load GPT-2 Model ===
gpt2_model, gpt2_tokenizer = _load_finetuned(
    GPT2ForExtractiveSummarization, GPT2Tokenizer,
    "gpt2", "/best_gpt2_extractive.pt", device,
)
# Reuse the EOS token as the pad token so that padding="max_length" has a
# token to pad with.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
gpt2_summary = generate_summary(gpt2_model, gpt2_tokenizer, sentences, device)


# === Print Results ===
# Emit every section as "<header> <text>", in the same order as before.
report_sections = [
    ("\n   Original Article:\n", article),
    ("\n   Reference Summary:\n", reference_summary),
    ("\n   BERT Predicted Summary:\n", bert_summary),
    ("\n   RoBERTa Predicted Summary:\n", roberta_summary),
    ("\n   BART Predicted Summary:\n", bart_summary),
    ("\n   FLAN-T5-Small Predicted Summary:\n", flan_summary),
    ("\n   GPT-2 Predicted Summary:\n", gpt2_summary),
]
for section_header, section_text in report_sections:
    print(section_header, section_text)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


   Original Article:
 (CNN)Nine British citizens were arrested in Turkey on Wednesday, suspected of trying to cross illegally into Syria, the Turkish military said on its website. The group included four children -- the oldest being 10 or 11, with the youngest born in 2013, a Turkish official told CNN on condition of anonymity. The nine were arrested at the Turkey-Syria border, the Turkish military said. It didn't say why the group allegedly was trying to get into Syria, which has been torn by a roughly four-year war between Syrian government forces and Islamist extremist groups and other rebels. Among the war's combatants is ISIS, which has taken over parts of Syria and Iraq for what it claims is its Islamic caliphate, and which is known to have been recruiting Westerners. Accompanying the children were three men and two women; all nine had British passports, the Turkish official said. UK police charge man with terror offenses after Turkey trip . The British Foreign Office said Wedne