In [1]:
# from IPython.display import HTML, display

# def set_css():
#   display(HTML('''
#   <style>
#     pre {
#         white-space: pre-wrap;
#     }
#   </style>
#   '''))
# get_ipython().events.register('pre_run_cell', set_css)

In [2]:
!pip install datasets transformers accelerate huggingface-hub torch -q

In [3]:
from datasets import load_dataset
from huggingface_hub import notebook_login
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline, AutoModelForSeq2SeqLM

In [4]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
model = 'facebook/bart-large-cnn'
dataset_name = 'abisee/cnn_dailymail'
version = '3.0.0'

In [6]:
dataset = load_dataset(dataset_name, version)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSeq2SeqLM.from_pretrained(model)

In [8]:
def format_data(data_point):
  input_encodings = tokenizer(data_point['article'], truncation=True, padding='max_length', max_length=1024)
  with tokenizer.as_target_tokenizer():
    target_encodings = tokenizer(data_point['highlights'], truncation=True, padding='max_length', max_length=128)

  return {
      'input_ids': input_encodings['input_ids'],
      'attention_mask': input_encodings['attention_mask'],
      'labels': target_encodings['input_ids']
  }

cnndaily_modified = dataset.map(format_data, batched=True)
cnndaily_modified

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]



DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11490
    })
})

In [9]:
# def format_data(data_point):
#   prompt = f"""
#   ### instruction : {"generate the summary for the given piece of corpus"}
#   ### input : {data_point['article']}
#   ### output :{data_point['highlights']}
#   """
#   tokens = tokenizer(
#     prompt,
#     truncation = True,
#     max_length = 256,
#     padding = "max_length"
#   )
#   tokens['labels'] = tokens['input_ids'].copy()
#   return tokens

training_data = cnndaily_modified['train'].take(500)
eval_data = cnndaily_modified['validation'].take(100)
testing_data = cnndaily_modified['test'].take(100)

In [10]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs = 2,
    warmup_steps=5,
    learning_rate=5e-05,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_steps=10,
    #lr_scheduler_type='linear',
    eval_strategy='steps',
    eval_steps=3,

)


trainer = Trainer(
    model=model,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=training_data,
    eval_dataset=eval_data
)

trainer.train()

Step,Training Loss,Validation Loss
3,No log,7.532949
6,No log,5.59772
9,No log,3.022466
12,5.770400,1.767227
15,5.770400,1.293585
18,5.770400,1.061002
21,1.754700,0.929516
24,1.754700,0.832715
27,1.754700,0.777765
30,0.926000,0.743842


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=250, training_loss=0.7594853677749633, metrics={'train_runtime': 1730.8643, 'train_samples_per_second': 0.578, 'train_steps_per_second': 0.144, 'total_flos': 2167104602112000.0, 'train_loss': 0.7594853677749633, 'epoch': 2.0})

In [14]:
trainer.save_model('bart-finetuned')

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [15]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

sample_text = cnndaily_modified["test"][2]["article"]

highlight  = cnndaily_modified["test"][2]["highlights"]

pipe = pipeline("summarization", model="bart-finetuned")

# model_x = AutoModelForCausalLM.from_pretrained('bart-finetuned')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
# input = testing_data[0]
trainer.evaluate(testing_data)


{'eval_loss': 0.6881666779518127,
 'eval_runtime': 13.9194,
 'eval_samples_per_second': 7.184,
 'eval_steps_per_second': 1.796,
 'epoch': 2.0}

In [17]:
 ##
print("article:")
print(sample_text)


print("\nReference Summary:")
print(highlight)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

article:
(CNN)If you've been following the news lately, there are certain things you doubtless know about Mohammad Javad Zarif. He is, of course, the Iranian foreign minister. He has been U.S. Secretary of State John Kerry's opposite number in securing a breakthrough in nuclear discussions that could lead to an end to sanctions against Iran -- if the details can be worked out in the coming weeks. And he received a hero's welcome as he arrived in Iran on a sunny Friday morning. "Long live Zarif," crowds chanted as his car rolled slowly down the packed street. You may well have read that he is "polished" and, unusually for one burdened with such weighty issues, "jovial." An Internet search for "Mohammad Javad Zarif" and "jovial" yields thousands of results. He certainly has gone a long way to bring Iran in from the cold and allow it to rejoin the international community. But there are some facts about Zarif that are less well-known. Here are six: . In September 2013, Zarif tweeted "Happy