<a href="https://colab.research.google.com/github/Vasan-th/Vasan-th/blob/main/hugging_face_summarization_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, http

In [None]:
# loading and processing the dataset

from datasets import load_dataset

xsum_data = load_dataset('xsum', streaming = True)
xsum_data

{'train': <datasets.iterable_dataset.IterableDataset at 0x7fe745340a60>,
 'validation': <datasets.iterable_dataset.IterableDataset at 0x7fe745340ee0>,
 'test': <datasets.iterable_dataset.IterableDataset at 0x7fe7452ce100>}

In [None]:
# sample data
next(iter(xsum_data['train'].take(1)))

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

In [None]:
# filtering very short summaries

xsum_data = xsum_data.filter(lambda sample: len(sample['summary']) > 2)
xsum_data

{'train': <datasets.iterable_dataset.IterableDataset at 0x7fe74532af40>,
 'validation': <datasets.iterable_dataset.IterableDataset at 0x7fe7452ce370>,
 'test': <datasets.iterable_dataset.IterableDataset at 0x7fe7444e6250>}

In [None]:
# building the model instead of using pipeline

from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# preprocessing

max_input_length = 512   # max accepted length of pegasus model
max_target_length = 30   # summary length

def preprocess_function(examples):
  model_inputs = tokenizer(examples['document'], max_length = max_input_length, truncation = True )
  labels = tokenizer(examples['summary'], max_length = max_target_length, truncation = True)

  model_inputs['labels'] = labels['input_ids']
  return model_inputs

In [None]:
tokenized_dataset = xsum_data.map(preprocess_function, batched = True, remove_columns = ['document', 'summary', 'id'])

In [None]:
next(iter(tokenized_dataset['train'].take(1))).keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [None]:
tokenized_dataset

{'train': <datasets.iterable_dataset.IterableDataset at 0x7fe74725b700>,
 'validation': <datasets.iterable_dataset.IterableDataset at 0x7fe7444e6e50>,
 'test': <datasets.iterable_dataset.IterableDataset at 0x7fe74453ab80>}

In [None]:
iterable_tokeized_data = iter(tokenized_dataset)
print(next(iterable_tokeized_data))
print(next(iterable_tokeized_data))

train
validation


In [None]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) 

TypeError: ignored

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model = model, return_tensors = 'tf')

In [None]:
tf_train_dataset = model.prepare_tf_dataset(
    dataset = tokenized_dataset['train'],
    collate_fn = data_collator,
    shuffle = True,
    batch_size = 8,
    drop_remainder = True,
    prefetch = True
)

tf_valid_dataset = model.prepare_tf_dataset(
    dataset = tokenized_dataset['validation'],
    collate_fn = data_collator,
    shuffle = True,
    batch_size =8,
    drop_remainder = True,
    prefetch = True
)

TypeError: ignored

In [None]:
from transformers import create_optimizer
import tensorflow as tf

num_train_epochs = 5
num_train_steps = len(tf_train_dataset)*num_train_epochs
model_name = 'pegasus'

optimizer, schedule = create_optimizer(
    init_lr = 5.6e-5,
    num_warmup_steps = 0,
    num_train_steps = num_train_steps,
    weight_decay_rate = 0.01
)

model.compile(optimizer = optimizer)


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(
    output_dir = f'{model_name}-finetuned-xsum', tokenizer = tokenizer
)

model.fit(tf_train_dataset, validation_data = tf_valid_dataset, callbacks = [callback], epochs = num_train_epochs)

/content/pegasus-finetuned-xsum is already a clone of https://huggingface.co/Vasanth18/pegasus-finetuned-xsum. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/5


In [None]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer


checkpoint = "google/pegasus-xsum"
pipe = pipeline(model = checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

xsum_data = load_dataset('xsum')

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading and preparing dataset xsum/default to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
%%time 

xsum_data['train']['document'][:3]

CPU times: user 823 ms, sys: 5.99 ms, total: 829 ms
Wall time: 854 ms


 'A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice have appealed for information about the attack.\nInsp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.\n"While the exact cause is still under investigation, it is thought that the fire was started deliberately."',
 'Ferrari appeared in a position to challenge until the final laps, when the Mercedes stretched their legs

In [None]:
%%time

xsum_data['train'][:3]['document']        # faster method

CPU times: user 671 µs, sys: 956 µs, total: 1.63 ms
Wall time: 12.5 ms


 'A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice have appealed for information about the attack.\nInsp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.\n"While the exact cause is still under investigation, it is thought that the fire was started deliberately."',
 'Ferrari appeared in a position to challenge until the final laps, when the Mercedes stretched their legs

In [None]:
predictions = pipe(xsum_data['train']['document'][:5])       # max sequence length in the dataset is higher than the one model expected

Token indices sequence length is longer than the specified maximum sequence length for this model (1061 > 512). Running this sequence through the model will result in indexing errors


IndexError: ignored

In [None]:
model_input = tokenizer(xsum_data['train'][:3]['document'], max_length = 512, truncation = True)

In [None]:
model_input.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
tokenizer.convert_ids_to_tokens(model_input['input_ids'][0])

['▁The',
 '▁full',
 '▁cost',
 '▁of',
 '▁damage',
 '▁in',
 '▁Newton',
 '▁Stewart',
 ',',
 '▁one',
 '▁of',
 '▁the',
 '▁areas',
 '▁worst',
 '▁affected',
 ',',
 '▁is',
 '▁still',
 '▁being',
 '▁assessed',
 '.',
 '▁Repair',
 '▁work',
 '▁is',
 '▁ongoing',
 '▁in',
 '▁Ha',
 'wick',
 '▁and',
 '▁many',
 '▁roads',
 '▁in',
 '▁Peebles',
 'shire',
 '▁remain',
 '▁badly',
 '▁affected',
 '▁by',
 '▁standing',
 '▁water',
 '.',
 '▁Trains',
 '▁on',
 '▁the',
 '▁west',
 '▁coast',
 '▁mainline',
 '▁face',
 '▁disruption',
 '▁due',
 '▁to',
 '▁damage',
 '▁at',
 '▁the',
 '▁La',
 'mington',
 '▁Via',
 'duct',
 '.',
 '▁Many',
 '▁businesses',
 '▁and',
 '▁household',
 'ers',
 '▁were',
 '▁affected',
 '▁by',
 '▁flooding',
 '▁in',
 '▁Newton',
 '▁Stewart',
 '▁after',
 '▁the',
 '▁River',
 '▁Cree',
 '▁overflow',
 'ed',
 '▁into',
 '▁the',
 '▁town',
 '.',
 '▁First',
 '▁Minister',
 '▁Nicola',
 '▁Sturgeon',
 '▁visited',
 '▁the',
 '▁area',
 '▁to',
 '▁inspect',
 '▁the',
 '▁damage',
 '.',
 '▁The',
 '▁waters',
 '▁breached',
 '▁a',
 '

In [None]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)"tf_model.h5";:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

Some layers of TFPegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
