## Setup
### Installing the requirements

In [None]:
!pip install transformers datasets evaluate rouge_score

### Define variables

In [None]:
import os

# --- Data and sequence lengths ---
TRAIN_TEST_SPLIT = 0.2  # Fraction of the dataset held out as the test split
MAX_INPUT_LENGTH = 1024  # Maximum length of the input to the model
MIN_TARGET_LENGTH = 5  # Minimum length of the output by the model
MAX_TARGET_LENGTH = 128  # Maximum length of the output by the model

# --- Training hyper-parameters ---
# NOTE(review): the training cell visible in this notebook passes its own literal
# batch size, learning rate and epoch count instead of reading these constants —
# confirm which values are intended.
BATCH_SIZE = 8  # Batch-size for training our model
LEARNING_RATE = 2e-5  # Learning-rate for training our model
MAX_EPOCHS = 1  # Maximum number of epochs we will train the model for

MODEL_CHECKPOINT = "t5-small"

# Local directory where to save the finetuned model
MODEL_PATH = "/content"

# Repository name for saving model to the Hugging Face Hub
REPO_NAME = "sree10304/Abstractive_Summarization"

# File for inference example
INPUT_FILE = "Input/Airlines_Are_Just_Banks_Now.txt"

# For summarization tasks, T5 requires the following prefix
PREFIX = "summarize: "

# Enable parallelized tokenization.
# The setting is read from the process environment, so binding a Python name
# alone has no effect. The name is kept so existing references keep working.
TOKENIZERS_PARALLELISM = True
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the "train" split of the XSum dataset via the `datasets` library.
from datasets import load_dataset

raw_datasets = load_dataset("xsum", split="train")

The dataset has the following fields:

- __document__: the original BBC article to be summarized
- __summary__: the single sentence summary of the BBC article
- __id__: ID of the document-summary pair

In [None]:
# Keep only the first 500 examples (presumably to keep this demo fast — confirm).
raw_datasets = raw_datasets.select(range(500))

In [None]:
raw_datasets

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 500
})

We can see what the data looks like by retrieving the first item in ``raw_datasets``:

In [None]:
print(raw_datasets[0])



In [None]:
# Split into train/test using the held-out fraction from the config cell
# (TRAIN_TEST_SPLIT = 0.2 gives an 80% / 20% split).
# A fixed seed makes the shuffle, and therefore every downstream number,
# reproducible across kernel restarts.
SPLIT_SEED = 42

raw_split_datasets = raw_datasets.train_test_split(test_size=TRAIN_TEST_SPLIT, seed=SPLIT_SEED)

In [None]:
raw_split_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 400
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 100
    })
})

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the checkpoint named in MODEL_CHECKPOINT.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
def preprocess_fn(examples, tokenizer):
    """Tokenize a batch of examples for sequence-to-sequence fine-tuning.

    Args:
        examples: Batch mapping with a "document" list (model inputs) and a
            "summary" list (targets).
        tokenizer: Tokenizer callable applied to both inputs and targets.

    Returns:
        The tokenized inputs, with the target token ids stored under "labels".
    """
    # Only the listed T5 checkpoints get the task prefix; anything else gets none.
    t5_checkpoints = ("t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b")
    prefix = PREFIX if MODEL_CHECKPOINT in t5_checkpoints else ""

    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
raw_split_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 400
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 100
    })
})

In [None]:
# Tokenize every split in batches; the tokenizer is handed to preprocess_fn
# as a keyword argument.
tokenized_datasets = raw_split_datasets.map(
    preprocess_fn,
    batched=True,
    fn_kwargs=dict(tokenizer=tokenizer),
)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained sequence-to-sequence model (T5) for MODEL_CHECKPOINT.
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Build the batch collator from the tokenizer and the model; it is passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import evaluate

# ROUGE metric loaded through the `evaluate` library; metric_fn calls rouge.compute on decoded text.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import numpy as np

def metric_fn(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        Dict with the ROUGE results plus "gen_len", every value rounded to
        4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker rather than a vocabulary id, so it cannot
    # be decoded. Map it to the pad token in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
#!pip uninstall transformers

Found existing installation: transformers 4.30.1
Uninstalling transformers-4.30.1:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.30.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.30.1


In [None]:
!pip install transformers==4.30.1

Collecting transformers==4.30.1
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.14.1
    Uninstalling tokenizers-0.14.1:
      Successfully uninstalled tokenizers-0.14.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.34.0
    Uninstalling transformers-4.34.0:
      Successfully uninstalled transformers-4.34.0
Successfully installed tokenizers-0.13.3 transformers-4.30.1


In [None]:
"""! pip install -U accelerate
! pip install -U transformers"""

In [None]:
"""from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_fn,
)"""

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the active cells visible in this notebook write to ./t5-finetuned,
# ./logs and /content/t5-finetuned rather than to the mounted drive — confirm
# this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [None]:
pip install transformers[torch]



In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

# The `model` and `tokenizer` objects loaded in earlier cells are reused here.

# Training arguments.
# Evaluation and logging run once per epoch. The recorded run of this notebook
# took 300 optimizer steps in total, so a step-based schedule with
# eval_steps=1000 / logging_steps=500 never evaluated on the test split and
# never called metric_fn.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    # Checkpointing is unchanged: save_steps exceeds the length of the recorded
    # run, so no intermediate checkpoints are written.
    save_steps=1000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_strategy="epoch",
    logging_first_step=True,
    evaluation_strategy="epoch",
    predict_with_generate=True,  # evaluate on generated summaries so ROUGE can be computed
    push_to_hub=False,
    report_to="tensorboard",
)

# Create the Seq2SeqTrainer from the tokenized splits, collator and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=metric_fn,
)
print(trainer)




<transformers.trainer_seq2seq.Seq2SeqTrainer object at 0x7d1f90f38dc0>


In [None]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=300, training_loss=3.2132230297724407, metrics={'train_runtime': 4750.6638, 'train_samples_per_second': 0.253, 'train_steps_per_second': 0.063, 'total_flos': 249268934934528.0, 'train_loss': 3.2132230297724407, 'epoch': 3.0})

## Save the model

First we save the model, locally, for future use:

In [None]:
# Save under the directory configured in MODEL_PATH instead of a hard-coded
# absolute path (with MODEL_PATH = "/content" this is "/content/t5-finetuned").
trainer.save_model(os.path.join(MODEL_PATH, "t5-finetuned"))

In [None]:
pip install transformers==4.24.0

In [None]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not a repository id, so the message is passed explicitly by keyword.
# NOTE(review): the target repository is presumably derived from the trainer's
# output_dir / hub settings — confirm it resolves to the intended repo.
trainer.push_to_hub(commit_message="Fine-tune t5-small on an XSum subset")

In [None]:
"""input_file = "/content/Airlines_Are_Just_Banks_Now.txt"
with open(input_file, 'r') as file:
    input = file.read().replace('\n', '')"""

In [None]:
#Example
#input="swashbucklingopener chris gayle has been included in the 13man west indies team which will play a t20 international match against india in jamaica on sunday july 9the lefthander is the highest runmaker for the windies in the t20 format with 1519 runs average 3532 strike rate 14549 including two centuries this will be his first t20 international on his home ground sabina park he last played a t20 international for west indies in april 2016 in the world t20 final vs england at kolkatas eden gardenswe welcome chris back to the t20 squad he is the most prolific batsman in this format and will add value to our team at the top of the order he will get the chance to play on his home ground and against a topquality indian team said courtney browne cricket west indies chairman of selectorsthe lefthanded batsman has been included in the side at the expense of lendl simmons who scored 6 17 and 15 in the series against afghanistan west indies last t20 assignmenttest and odi skipper jason holder who did not feature in that series has been rested againwi teamcarlos brathwaite captainsamuel badreeronsford beatonchris gayleevin lewisjason mohammedsunil narinekieron pollardrovman powellmarlon samuels."

input="Data science is the study of data to extract meaningful insights for business. It is a multidisciplinary approach that combines principles and practices from the fields of mathematics, statistics, artificial intelligence, and computer engineering to analyze large amounts of data. This analysis helps data scientists to ask and answer questions like what happened, why it happened, what will happen, and what can be done with the results"

In [None]:
pref_input = PREFIX + input

In [None]:
pref_input

'summarize: Data science is the study of data to extract meaningful insights for business. It is a multidisciplinary approach that combines principles and practices from the fields of mathematics, statistics, artificial intelligence, and computer engineering to analyze large amounts of data. This analysis helps data scientists to ask and answer questions like what happened, why it happened, what will happen, and what can be done with the results'

In [None]:
# Load the fine-tuned checkpoint from the Hub and summarize the prefixed example text.
# NOTE(review): the summarization pipeline may already prepend the task prefix
# stored in the model config, in which case pref_input would carry
# "summarize: " twice — confirm.
from transformers import pipeline

summarizer = pipeline(task="summarization", model="sree10304/t5-finetuned")
summarizer(pref_input)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Your max_length is set to 200, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)


[{'summary_text': 'data science is a multidisciplinary approach that combines principles and practices from the fields of mathematics, statistics, artificial intelligence, and computer engineering to analyze large amounts of data.'}]

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

def evaluation(finetune, normal):
    """Score a candidate summary against a reference with BLEU and ROUGE.

    Args:
        finetune: Candidate (hypothesis) summary text.
        normal: Reference summary text to compare against.

    Returns:
        Tuple ``(bleu_score, rouge_scores)``: the sentence-level BLEU score and
        the result of ``Rouge.get_scores``.
    """
    # sentence_bleu takes (list of reference token lists, hypothesis tokens),
    # while Rouge.get_scores takes (hypothesis, reference). Both metrics treat
    # `finetune` as the hypothesis and `normal` as the reference.
    bleu_score = sentence_bleu([normal.split()], finetune.split())

    # Named so it does not reuse the name of the module-level `rouge` metric.
    rouge_scorer = Rouge()
    rouge_scores = rouge_scorer.get_scores(finetune, normal)
    return bleu_score, rouge_scores


In [None]:
hug_model_finetune="chris gayleevin lewisjason mohammedsunil narinekieron pollardrovman powellmarlon samuels has been included in the west indies squad."
finetune="swashbucklingopener chris gayle has been included in the west indies team which will play a t20 international against india on sunday july 9 . he is the highest runmaker for the windies with 1519 runs average 3532 strike rate 14549 including two centuries."

reference="swashbuckling opener chris gayle has been included in the 13man west indies squad which will play a t20 match against india in jamaica on july 9 this will be his first t20 international at his home ground and will add value to our team said courtney browne."

normal="chris gayle has been included in the 13man west indies team. the lefthander is the highest runmaker for the windies in the t20 format. this will be his first t20 international on his home ground sabina park"

In [None]:
evaluation(finetune,reference)    # My model's summary scored against the original summary in the dataset

(0.23961661128439754,
 [{'rouge-1': {'r': 0.5, 'p': 0.525, 'f': 0.5121951169541941},
   'rouge-2': {'r': 0.30434782608695654,
    'p': 0.34146341463414637,
    'f': 0.3218390754762849},
   'rouge-l': {'r': 0.4523809523809524,
    'p': 0.475,
    'f': 0.46341462914931586}}])

In [None]:
evaluation(finetune,normal)       # My model's summary scored against the summary from the normal (not fine-tuned) T5

(0.29317239563019903,
 [{'rouge-1': {'r': 0.6333333333333333, 'p': 0.475, 'f': 0.5428571379591837},
   'rouge-2': {'r': 0.42857142857142855,
    'p': 0.36585365853658536,
    'f': 0.39473683713642665},
   'rouge-l': {'r': 0.6333333333333333, 'p': 0.475, 'f': 0.5428571379591837}}])

In [None]:
evaluation(hug_model_finetune,normal)

(0.11092770141728163,
 [{'rouge-1': {'r': 0.26666666666666666, 'p': 0.5, 'f': 0.3478260824196598},
   'rouge-2': {'r': 0.14285714285714285,
    'p': 0.3333333333333333,
    'f': 0.19999999580000008},
   'rouge-l': {'r': 0.26666666666666666, 'p': 0.5, 'f': 0.3478260824196598}}])