In [2]:
!pip install transformers
!pip install sentencepiece
!pip install --upgrade accelerate
!pip install rouge.score nltk py7zr
!pip install datasets


Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.21.0
[0mCollecting rouge.score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting py7zr
  Downloading py7zr-0.20.6-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodomex>=3.6.6 (from py7zr)
  Downloading pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1

In [3]:
import transformers
from datasets import load_dataset, load_metric, load_from_disk ,DatasetDict
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
data = load_dataset('cnn_dailymail',name="3.0.0",split=["train[:10000]","validation[:1000]","test[:1000]"])
data = DatasetDict({'train':data[0],'validation':data[1],'test':data[2]})

Downloading builder script:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 (download: 558.32 MiB, generated: 1.28 GiB, post-processed: Unknown size, total: 1.82 GiB) to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
metric = load_metric('rouge')
model_checkpoints = 'google/pegasus-cnn_dailymail'

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [6]:
max_input = 1024
max_target = 128
tokenizer = transformers.PegasusTokenizer.from_pretrained(model_checkpoints,use_fast=False)


def preprocess_data(data_to_process):
  #get the dialogue text
  inputs = [dialogue for dialogue in data_to_process['article']]
  #tokenize text
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

  #tokenize labels
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['highlights'], max_length=max_target, padding='max_length', truncation=True)

  model_inputs['labels'] = targets['input_ids']
  #reuturns input_ids, attention_masks, labels
  return model_inputs


tokenize_data = data.map(preprocess_data, batched = True, remove_columns=['id', 'article', 'highlights'])


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

  0%|          | 0/10 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
#load model
model = transformers.PegasusForConditionalGeneration.from_pretrained(model_checkpoints)


#collator to create batches. It preprocess data with the given tokenizer
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)


#####################
# metrics
# compute rouge for evaluation
#####################

def compute_rouge(pred):
  predictions, labels = pred
  #decode the predictions
  decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  #decode labels
  decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  #compute results
  res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
  #get %
  res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

  pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  res['gen_len'] = np.mean(pred_lens)

  return {k: round(v, 4) for k, v in res.items()}



caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [8]:
args = transformers.Seq2SeqTrainingArguments(
    'pega_cnndm-summ',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size= 6,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=10,
    predict_with_generate=True,
    eval_accumulation_steps=1,
    fp16=True
    )
#only CUDA available -> fp16=True
trainer = transformers.Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

In [9]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,5.5835,0.993888,35.8106,15.5533,26.1048,26.1502,56.222
1,0.9567,0.617722,36.0619,15.9786,26.3774,26.4355,55.201
3,0.5659,0.620876,35.99,15.8494,26.3944,26.4552,54.402
3,0.5294,0.627989,35.7462,15.7906,26.2611,26.3058,53.478
4,0.5069,0.631587,35.9188,15.8357,26.2571,26.3045,51.705
6,0.4901,0.632553,35.9843,15.9366,26.5305,26.6063,51.451
6,0.4884,0.63695,35.9921,15.9093,26.5326,26.5955,51.853
7,0.4711,0.638658,36.3967,16.1212,26.8779,26.9481,51.344
9,0.4615,0.641885,36.023,15.857,26.535,26.5938,51.183
9,0.4586,0.642331,36.024,15.7802,26.4435,26.5043,51.513


TrainOutput(global_step=5550, training_loss=0.9932712671778223, metrics={'train_runtime': 22045.1509, 'train_samples_per_second': 2.268, 'train_steps_per_second': 0.252, 'total_flos': 7.215137115498086e+16, 'train_loss': 0.9932712671778223, 'epoch': 9.99})

In [11]:
predict_results = trainer.predict(
            tokenize_data['test'], metric_key_prefix="predict", max_length=128, num_beams=5
        )


In [12]:
predict_results.metrics

{'predict_loss': 0.6495304107666016,
 'predict_rouge1': 35.5129,
 'predict_rouge2': 14.9139,
 'predict_rougeL': 25.8484,
 'predict_rougeLsum': 25.8214,
 'predict_gen_len': 48.78,
 'predict_runtime': 958.2392,
 'predict_samples_per_second': 1.044,
 'predict_steps_per_second': 0.349}

In [13]:
predictions = tokenizer.batch_decode(
                    predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
import numpy as np
labels = np.where(predict_results.label_ids != -100, predict_results.label_ids, tokenizer.pad_token_id)
actual = tokenizer.batch_decode(
                    labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
from nltk.tokenize import sent_tokenize
preds = [pred.strip() for pred in predictions]
labels = [label.strip() for label in actual]

# rougeLSum expects newline after each sentence
preds = ["\n".join(sent_tokenize(pred)) for pred in preds] #predicted summary
labels = ["\n".join(sent_tokenize(label)) for label in labels] #actual summary



In [14]:
import pandas as pd
df = pd.DataFrame({"Article":data["test"]["article"],"Actual Summary":labels,'Model Summary':preds})
df.to_csv('outputPegasus_cnndm1.csv',index=False)

In [15]:
!pip install evaluate rouge_score


Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [16]:
import evaluate
rouge = evaluate.load('rouge')
results = rouge.compute(predictions=preds,references=labels)


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [17]:
results

{'rouge1': 0.3406032634353271,
 'rouge2': 0.1450991014687318,
 'rougeL': 0.2510475100473287,
 'rougeLsum': 0.3086716415941861}

In [18]:
sum([len(x.split()) for x in preds])/1000

36.239

In [29]:
preds[5]

"There is no obvious reason to be concerned about the proposed merger of two of the world's biggest eyewear firms, Essilor and Luxottica."

In [28]:
labels[5]

'Since their impending merger was announced in January, there has been remarkably little comment about the huge proposed deal to combine Essilor and Luxottica.'