In [1]:
! pip install transformers datasets
! pip install rouge-score nltk
! pip install huggingface_hub

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 52.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 40.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 65.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.6 MB/s

In [2]:
import transformers

print(transformers.__version__)

4.18.0


In [3]:
!pip install SentencePiece

Collecting SentencePiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 36.5 MB/s eta 0:00:01[K     |▌                               | 20 kB 18.5 MB/s eta 0:00:01[K     |▉                               | 30 kB 15.4 MB/s eta 0:00:01[K     |█                               | 40 kB 13.9 MB/s eta 0:00:01[K     |█▍                              | 51 kB 6.5 MB/s eta 0:00:01[K     |█▋                              | 61 kB 7.7 MB/s eta 0:00:01[K     |██                              | 71 kB 8.2 MB/s eta 0:00:01[K     |██▏                             | 81 kB 8.1 MB/s eta 0:00:01[K     |██▍                             | 92 kB 9.0 MB/s eta 0:00:01[K     |██▊                             | 102 kB 7.3 MB/s eta 0:00:01[K     |███                             | 112 kB 7.3 MB/s eta 0:00:01[K     |███▎                            | 122 kB 7.3 MB/s eta 0:00:01[K     |███▌      

In [1]:
!pip install sentencepiece



In [2]:
import torch

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments

In [4]:
from datasets import load_dataset
dataset = load_dataset("xsum")
train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]

Using custom data configuration default
Reusing dataset xsum (/root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)

In [6]:
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
encodings = tokenizer(train_texts, truncation=True, padding=True,return_tensors="pt")   
decodings = tokenizer(train_labels, truncation=True, padding=True,return_tensors="pt")
dataset_tokenized = PegasusDataset(encodings, decodings)
train_dataset =  dataset_tokenized   

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

In [7]:
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  if val_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2000,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=100,                  # number of update steps before evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=10,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=val_dataset,            # evaluation dataset
      tokenizer=tokenizer
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=5,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=10,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      tokenizer=tokenizer
    )

  return trainer


In [8]:
trainer = prepare_fine_tuning('google/pegasus-xsum', tokenizer, train_dataset)
trainer.train()

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  
  import sys


Step,Training Loss
10,7.0346
20,7.41
30,7.6655
40,7.6693
50,7.1291
60,6.7095
70,6.8734
80,8.2346
90,6.7231
100,6.5864


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
  
  import sys
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
  
  import sys
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./resu

TrainOutput(global_step=5000, training_loss=0.9709793740868569, metrics={'train_runtime': 2014.7905, 'train_samples_per_second': 2.482, 'train_steps_per_second': 2.482, 'total_flos': 7223661035520000.0, 'train_loss': 0.9709793740868569, 'epoch': 5.0})

In [9]:
test_dataset=dataset['test'][1000]

In [10]:
test_dataset

{'document': "Media playback is not supported on this device\nOur mission is to inspire, empower and motivate our online community in the pursuit of feeling BodyPositive. And we'll be with you every step of the way as you enjoy and celebrate being YOU.\nWe're proud of all the things that make us unique and we reckon that by embracing our differences we can make our own lives happier - and inspire someone else along the way.\nWe'll be bringing you the very best in the world of health & fitness, wellbeing & lifestyle and style & beauty to inspire you to enjoy this next year exactly the way you want. And we want you to share your lives with us too by tweeting, posting, sharing, snapping and chatting your way through the next six months.\nStay tuned and join our body brigade!\nOn January 27, we'll be getting up close and personal with R1's Gemma Cairney with a BodyPositive takeover of The Surgery with Gemma and Dr Radha, so what is it you want to talk about?\nPerhaps you've hidden somethin

In [19]:
test_texts, test_labels = dataset['test']['document'][:10], dataset['test']['summary'][:10]

In [25]:
encodings = tokenizer(test_texts, truncation=True, padding=True,return_tensors="pt")   
#decodings = tokenizer(test_labels, truncation=True, padding=True,return_tensors="pt")
dataset_tokenized = PegasusDataset(encodings, decodings)
test_dataset =  dataset_tokenized

In [26]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 10
  Batch size = 8
  
  import sys


{'epoch': 5.0,
 'eval_loss': 1.591928243637085,
 'eval_runtime': 0.5189,
 'eval_samples_per_second': 19.273,
 'eval_steps_per_second': 3.855}

In [22]:
test_dataset['document']


IndexError: ignored

In [27]:
pred=trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 10
  Batch size = 8
  
  import sys


In [28]:
pred

PredictionOutput(predictions=(array([[[ 1.0777233e+00,  3.5396080e+00,  6.0335851e-01, ...,
         -6.5574527e+00, -4.1149359e+00, -4.2889533e+00],
        [ 1.6801616e+00,  4.2202625e+00, -3.2914612e-02, ...,
         -6.8357353e+00, -2.4449897e+00, -7.5968056e+00],
        [ 8.7451422e-01,  2.8876371e+00,  1.4147942e-01, ...,
         -2.8626754e+00, -3.2365003e-01, -3.6243207e+00],
        ...,
        [ 1.7379698e+01, -3.1499100e-01, -9.6050138e+00, ...,
         -7.7156234e+00, -1.5618315e+01, -3.4255981e+01],
        [ 1.7363586e+01, -2.3650901e-01, -9.6015863e+00, ...,
         -7.8342586e+00, -1.5658760e+01, -3.4343296e+01],
        [ 1.7351589e+01, -1.3693582e+00, -9.5553427e+00, ...,
         -7.3701463e+00, -1.5763766e+01, -3.4246861e+01]],

       [[ 1.2536848e+00,  3.5596759e+00,  8.7885505e-01, ...,
         -1.9019294e+00, -1.0384676e-01, -7.3281045e+00],
        [ 7.6200312e-01, -1.2703360e+00,  1.1004099e+00, ...,
         -3.0406826e+00, -3.0193472e-01, -8.1379042e+