<a href="https://colab.research.google.com/github/Pranav-JJ/Transformers-Abstractive-Summarisation/blob/main/AbstractiveSummarisationBBCNews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==4.20.0
!pip install keras_nlp==0.3.0
!pip install datasets
!pip install huggingface-hub
!pip install nltk
!pip install rouge-score

Collecting transformers==4.20.0
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.1.0 (from transformers==4.20.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1 (from transformers==4.20.0)
  Downloading tokenizers-0.12.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 tokenizers-0.12.1 transformers-4.20.0
Collecting keras_nlp==0.3.0
  Downloading keras_nlp-0.3.0-py3-none-any.whl (142 kB)
[2K

In [None]:
import os
import logging

import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Only log error messages
tf.get_logger().setLevel(logging.ERROR)

# Set before any tokenizer is created further down in the notebook.
# NOTE(review): presumably this disables the tokenizers library's internal
# parallelism (and its fork warning) - confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Fraction (not a percentage) of the dataset given to EACH of the train and
# test splits: the same value is passed as both `train_size` and `test_size`
# when the data is split, so 0.5 yields two equal halves.
TRAIN_TEST_SPLIT = 0.5
MAX_INPUT_LENGTH = 1024  # Maximum length of the input to the model (articles are truncated to this)
MIN_TARGET_LENGTH = 5  # Minimum length of the output by the model
MAX_TARGET_LENGTH = 128  # Maximum length of the output by the model (summaries are truncated to this)
BATCH_SIZE = 2  # Batch-size for training our model
LEARNING_RATE =  1e-5  # Learning-rate for training our model
MAX_EPOCHS = 1  # Maximum number of epochs we will train the model for

# This notebook is built on the t5-small checkpoint from the Hugging Face Model Hub
MODEL_CHECKPOINT = "t5-small"

In [None]:
from datasets import load_dataset

# Fetch the BBC news summary dataset from the Hugging Face Hub, requesting only
# its "train" split.
raw_datasets = load_dataset("gopalkalpande/bbc-news-summary", split="train")


Downloading readme:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading and preparing dataset csv/gopalkalpande--bbc-news-summary to /root/.cache/huggingface/datasets/gopalkalpande___csv/gopalkalpande--bbc-news-summary-f610c9f6377bc0fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/gopalkalpande___csv/gopalkalpande--bbc-news-summary-f610c9f6377bc0fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


In [None]:
# Show the dataset's column names and row count.
print(raw_datasets)

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 2224
})


In [None]:
# Print the first record to see what a raw example looks like.
print(raw_datasets[0])



In [None]:
# Split the data into train and test parts of equal size (both sizes come from
# TRAIN_TEST_SPLIT). The fixed seed keeps the shuffle, and therefore the split,
# identical across kernel restarts.
# NOTE: this rebinds `raw_datasets` from the loaded dataset to the split
# result, so re-running this cell requires re-running the loading cell first.
raw_datasets = raw_datasets.train_test_split(
    train_size=TRAIN_TEST_SPLIT, test_size=TRAIN_TEST_SPLIT, seed=42
)

In [None]:
# Show the resulting train/test splits and their sizes.
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['File_path', 'Articles', 'Summaries'],
        num_rows: 1112
    })
    test: Dataset({
        features: ['File_path', 'Articles', 'Summaries'],
        num_rows: 1112
    })
})


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# The listed T5 checkpoints get the "summarize: " task prefix prepended to
# every article; any other checkpoint gets no prefix at all.
T5_CHECKPOINTS = ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
prefix = "summarize: " if MODEL_CHECKPOINT in T5_CHECKPOINTS else ""

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: a batch exposing an "Articles" and a "Summaries" column.

    Returns:
        The tokenizer output for the prefixed articles, extended with a
        "labels" entry holding the token ids of the summaries.
    """
    # Articles: prepend the task prefix, then truncate to MAX_INPUT_LENGTH.
    prefixed_articles = [prefix + article for article in examples["Articles"]]
    model_inputs = tokenizer(
        prefixed_articles, max_length=MAX_INPUT_LENGTH, truncation=True
    )

    # Summaries: tokenized in target mode and truncated to MAX_TARGET_LENGTH.
    with tokenizer.as_target_tokenizer():
        summary_ids = tokenizer(
            examples["Summaries"], max_length=MAX_TARGET_LENGTH, truncation=True
        )["input_ids"]

    model_inputs["labels"] = summary_ids
    return model_inputs

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of
# examples per call rather than a single example.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/1112 [00:00<?, ? examples/s]

Map:   0%|          | 0/1112 [00:00<?, ? examples/s]

In [None]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the pretrained TensorFlow seq2seq model for the chosen checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used to assemble batches; return_tensors="tf" requests TensorFlow tensors.
# NOTE(review): presumably it pads inputs and labels per batch - confirm in the
# DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
def _to_tf_dataset(dataset, shuffle):
    """Convert a tokenized dataset into batched tf.data input for the model.

    Args:
        dataset: a tokenized split providing input_ids, attention_mask and labels.
        shuffle: whether the resulting tf.data pipeline shuffles its examples.

    Returns:
        The tf.data dataset produced by `to_tf_dataset`, batched with
        BATCH_SIZE and collated by `data_collator`.
    """
    return dataset.to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
    )


# Training batches are shuffled; evaluation batches keep their order.
train_dataset = _to_tf_dataset(tokenized_datasets["train"], shuffle=True)
test_dataset = _to_tf_dataset(tokenized_datasets["test"], shuffle=False)

# 200 test examples picked by shuffling and taking the first 200. The seed
# fixes which examples are picked, so repeated runs evaluate on the same subset.
generation_dataset = _to_tf_dataset(
    tokenized_datasets["test"].shuffle(seed=42).select(range(200)),
    shuffle=False,
)

In [None]:
# Adam optimizer with the configured learning rate.
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# No `loss` argument is given, so (as the recorded output states) the model's
# internal loss computation is used for training.
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
import keras_nlp

# One RougeL metric object is shared by every call to `metric_fn`.
rouge_l = keras_nlp.metrics.RougeL()


def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score for one round of generated summaries.

    Args:
        eval_predictions: a ``(predictions, labels)`` pair of token-id batches.
            Negative label ids are overwritten in place with the pad token id.

    Returns:
        A dict with the single key "RougeL" holding the metric's F1 score.
    """
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id  # Replace masked label tokens
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Keras metrics accumulate state across calls. Clear it first so the score
    # reflects only this evaluation and not earlier epochs as well.
    rouge_l.reset_state()
    result = rouge_l(decoded_labels, decoded_predictions)

    # We will report only the F1 score, you can use other aggregation metrics as well
    return {"RougeL": result["f1_score"]}

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Score generated summaries with metric_fn on generation_dataset;
# predict_with_generate=True makes the callback produce predictions via
# text generation.
# NOTE(review): presumably the callback runs at the end of each epoch - confirm
# in the KerasMetricCallback documentation.
metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

callbacks = [metric_callback]

# For now we will use our test set as our validation_data
model.fit(
    train_dataset, validation_data=test_dataset, epochs=MAX_EPOCHS, callbacks=callbacks
)





<keras.callbacks.History at 0x7f9f10722680>

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and tokenizer in a TensorFlow summarization pipeline.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

# Summarize the first test article, bounding the summary length with the
# configured minimum and maximum target lengths.
# NOTE(review): the recorded output warns that this article is longer than the
# tokenizer's stated maximum (612 > 512 tokens) - decide whether the input
# should be truncated before it reaches the model.
summarizer(
    raw_datasets["test"][0]["Articles"],
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (612 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'Labour has withdrawn two controversial posters and launched four new designs...A row was sparked after the party published posters appearing to depict Michael Howard, who is Jewish, as Fagin, and as a flying pig, amid claims they were anti-Semitic...Mr Milburn said he appreciated people\'s concerns, but insisted that "what they were was anti-Tory" and "not in any way, shape or form anti-semitism"The posters were labelled a "big misjudgement" by the Conservatives who said Labour\'s "'}]

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook (renders a login
# widget); later cells push the model to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
output_dir = "model"  # Replace with your desired output directory

# Save the model and tokenizer into `output_dir` on the local filesystem.
# The tokenizer call returns the paths of the files it wrote.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/tokenizer.json')

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Create a callback targeting the Hub repository "T5-Finetuned-BBCNewsSummary".
# Per the recorded output, constructing it clones that repository into a local
# directory.
# NOTE(review): the callback is not passed to any `model.fit` call in the
# visible notebook, so it never uploads anything itself - confirm whether it is
# still needed.
push_to_hub_callback = PushToHubCallback(
    output_dir="T5-Finetuned-BBCNewsSummary",
    tokenizer=tokenizer,
)


Cloning https://huggingface.co/phoen1x/T5-Finetuned-BBCNewsSummary into local empty directory.


In [None]:
# Upload the fine-tuned model and its tokenizer to the Hub repository
# "T5-Finetuned-BBCNewsSummary". No `organization` is passed, so the upload is
# never directed at the unrelated `keras-io` namespace; the notebook later
# loads the result from `phoen1x/T5-Finetuned-BBCNewsSummary`.
model.push_to_hub("T5-Finetuned-BBCNewsSummary")
tokenizer.push_to_hub("T5-Finetuned-BBCNewsSummary")

Upload file tf_model.h5:   0%|          | 1.00/231M [00:00<?, ?B/s]

To https://huggingface.co/phoen1x/T5-Finetuned-BBCNewsSummary
   333e60b..e57e1d0  main -> main

   333e60b..e57e1d0  main -> main

To https://huggingface.co/phoen1x/T5-Finetuned-BBCNewsSummary
   e57e1d0..85d3f67  main -> main

   e57e1d0..85d3f67  main -> main



'https://huggingface.co/phoen1x/T5-Finetuned-BBCNewsSummary/commit/85d3f671e679e4bacefedf96f7f1fe94c79aba81'

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the published tokenizer and model from the Hub.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names, replacing
# the objects used for training above.
tokenizer = AutoTokenizer.from_pretrained("phoen1x/T5-Finetuned-BBCNewsSummary")

# The repository was uploaded with TensorFlow weights (tf_model.h5); from_tf=True
# initialises this model class from those weights.
model = AutoModelForSeq2SeqLM.from_pretrained("phoen1x/T5-Finetuned-BBCNewsSummary", from_tf=True)

Downloading:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


In [None]:
# Sample passage to summarize. The T5 "summarize: " task prefix is deliberately
# left out: the summarization pipeline this text is passed to prepends the
# prefix stored in the model config, so adding it by hand would duplicate it.
# Add the prefix yourself only when calling `model.generate` directly.
text = "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."


In [None]:
from transformers import pipeline

# Build a summarization pipeline straight from the published Hub repository and
# run it on the sample text.
# NOTE(review): the recorded run of this cell ended in a NameError; presumably
# `text` had not been defined in that session - run the cell that defines it
# first.
summarizer = pipeline("summarization", model="phoen1x/T5-Finetuned-BBCNewsSummary")
summarizer(text)

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at phoen1x/T5-Finetuned-BBCNewsSummary.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

NameError: ignored

In [None]:
from rouge_score import rouge_scorer

# Reference summary and the candidate summary that is scored against it.
reference_summary = "During filming near York, a collision left Richard Hammond with critical injuries. Discussing on The Diary Of A CEO podcast, he noted his well-functioning working memory contrasted with his somewhat deficient long-term memory. Expressing apprehension, the 53-year-old host revealed his hesitancy to undergo testing for potential conditions contributing to this memory difference."
generated_summary = "Richard Hammond suffered serious head injuries in the crash, which happened while he was filming at a site near York. He told The Diary Of A CEO podcast his working memory was good but he worries about his long-term memory, which was not brilliant. The TV presenter, 53, added he is too scared to get checked for conditions that might have such an effect."

# Score the candidate against the reference (the reference is passed first).
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
scores = scorer.score(reference_summary, generated_summary)

# Print the F1 score of each variant, in the order ROUGE-1, ROUGE-2, ROUGE-L.
for rouge_variant in ('rouge1', 'rouge2', 'rougeL'):
    print(scores[rouge_variant].fmeasure)


0.4297520661157025
0.16806722689075632
0.347107438016529


In [None]:

# 0.6176470588235294
# 0.3582089552238806
# 0.5735294117647058

# 0.6412213740458016
# 0.3875968992248063
# 0.45801526717557256

# 0.6527777777777777
# 0.4225352112676056
# 0.5138888888888888

# 0.4297520661157025
# 0.16806722689075632
# 0.347107438016529


In [None]:
# ROUGE F1 scores entered by hand, one line per scored sample in the order
# (ROUGE-1, ROUGE-2, ROUGE-L). Presumably each line was copied from a separate
# run of the scoring cell; the last line matches its recorded output.
sc1, sc2, sc3 = 0.6176470588235294, 0.3582089552238806, 0.5735294117647058
sc4, sc5, sc6 = 0.6412213740458016, 0.3875968992248063, 0.45801526717557256
sc7, sc8, sc9 = 0.6527777777777777, 0.4225352112676056, 0.5138888888888888
sc10, sc11, sc12 = 0.4297520661157025, 0.16806722689075632, 0.347107438016529

score_rows = [
    (sc1, sc2, sc3),
    (sc4, sc5, sc6),
    (sc7, sc8, sc9),
    (sc10, sc11, sc12),
]


def _mean(values):
    """Average `values` by adding them left to right, then dividing by their count."""
    total = 0.0
    for value in values:
        total += value
    return total / len(values)


# zip(*score_rows) regroups the per-sample rows into one column per ROUGE variant.
avgR1, avgR2, avgRL = (_mean(column) for column in zip(*score_rows))

for average in (avgR1, avgR2, avgRL):
    print(average)

0.5853495691907028
0.3341020731517622
0.4731352514614241


In [None]:
original_article = ""