In [1]:
from google.colab import drive #97
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [2]:
# crash colab to get more RAM
# !kill -9 -1

In [3]:
! pip install datasets transformers rouge-score nltk

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 5.1 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 41.0 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 59.4 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 58.8 MB/s 
[?25hCollecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.15-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████

In [4]:
# VERSION = "1.8.1"
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version $VERSION

In [5]:
import transformers
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

print(transformers.__version__)

4.9.2


In [6]:
! nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# Data Preprocessing

In [7]:
from datasets import load_dataset, load_metric

data = load_dataset("multi_news")
rouge = load_metric("rouge")

Downloading:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/918 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset multi_news/default (download: 245.06 MiB, generated: 667.74 MiB, post-processed: Unknown size, total: 912.80 MiB) to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2e145a8e21361ba4ee46fef70640ab946a3e8d425002f104d2cda99a9efca376...


Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset multi_news downloaded and prepared to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2e145a8e21361ba4ee46fef70640ab946a3e8d425002f104d2cda99a9efca376. Subsequent calls will reuse this data.


Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [8]:
data['train'] = data['train'].select(range(1000))
data['validation'] = data['validation'].select(range(200))
data['test'] = data['test'].select(range(200))

data

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 200
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 200
    })
})

In [9]:
from transformers import AutoTokenizer

model_name = "allenai/led-base-16384" # google/bigbird-pegasus-large-arxiv, microsoft/prophetnet-large-uncased-cnndm
prefix = ""

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Tokenization

In [10]:
max_input_length = 2048 * 4
max_target_length = 1024

def preprocess_function(batch):

    # tokenize the inputs and labels
    inputs = tokenizer(
        batch["document"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask

    # create 0 global_attention_mask lists
    batch["global_attention_mask"] = len(batch["input_ids"]) * [
        [0 for _ in range(len(batch["input_ids"][0]))]
    ]

    # since above lists are references, the following line changes the 0 index for all samples
    batch["global_attention_mask"][0][0] = 1
    batch["labels"] = outputs.input_ids

    # We have to make sure that the PAD token is ignored
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in labels]
        for labels in batch["labels"]
    ]

    return batch

In [11]:
processed_data = data.map(preprocess_function, batched=True, remove_columns=["document", "summary"])
processed_data.set_format(type='torch')

processed_data

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'global_attention_mask', 'input_ids', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['attention_mask', 'global_attention_mask', 'input_ids', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['attention_mask', 'global_attention_mask', 'input_ids', 'labels'],
        num_rows: 200
    })
})

# Fine Tuning

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True) # Batch Bucketing by Length

Downloading:   0%|          | 0.00/648M [00:00<?, ?B/s]

In [None]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
batch_size = 2

args = Seq2SeqTrainingArguments(
    "Summarization",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    gradient_accumulation_steps=4,
    fp16 = True,
    predict_with_generate=True
)

# GPU/CPU

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=processed_data["train"],
    eval_dataset=processed_data["validation"],
    # data_collator=data_collator,
    # tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

model.save_pretrained('/content/drive/MyDrive/Summarization/LED')

Using amp fp16 backend
***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 125


Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,No log,2.337468,0.2435,0.0167,0.0311


***** Running Evaluation *****
  Num examples = 200
  Batch size = 2


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/Summarization/mnews_bart
Configuration saved in /content/drive/MyDrive/Summarization/mnews_bart/config.json
Model weights saved in /content/drive/MyDrive/Summarization/mnews_bart/pytorch_model.bin


# Evaluation

In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Summarization/LED") #1000

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:
import gc

eval_dataset = processed_data["validation"]
del processed_data
gc.collect()

batch_size = 1

args = Seq2SeqTrainingArguments(
    "Summarization",
    per_device_eval_batch_size=batch_size,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 1


# Evaluation (Generation)

In [7]:
from datasets import load_dataset, load_metric

eval_dataset = load_dataset("multi_news", split='validation[:1%]').select(range(20))
rouge = load_metric("rouge")

eval_dataset

Downloading:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/918 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset multi_news/default (download: 245.06 MiB, generated: 667.74 MiB, post-processed: Unknown size, total: 912.80 MiB) to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2e145a8e21361ba4ee46fef70640ab946a3e8d425002f104d2cda99a9efca376...


Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset multi_news downloaded and prepared to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2e145a8e21361ba4ee46fef70640ab946a3e8d425002f104d2cda99a9efca376. Subsequent calls will reuse this data.


Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Dataset({
    features: ['document', 'summary'],
    num_rows: 20
})

In [10]:
from transformers import AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Summarization/LED")

tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

In [12]:
max_input_length = 2048 * 4
max_target_length = 1024

def generate_answer(batch):
  inputs_dict = tokenizer(batch["document"], padding="max_length", max_length=max_input_length, return_tensors="pt", truncation=True)
  inputs_dict = {k: inputs_dict[k] for k in inputs_dict}
  predicted_abstract_ids = model.generate(**inputs_dict, max_length=max_target_length, num_beams=3, length_penalty=0.8, no_repeat_ngram_size=2)
  batch["predicted"] = tokenizer.decode(predicted_abstract_ids[0], skip_special_tokens=True)
  print(batch["predicted"])
  return batch

result = eval_dataset.map(generate_answer)

  0%|          | 0/20 [00:00<?, ?ex/s]

– Dan Brown's latest book, "The Lost Symbol," has sold more than 80 million copies worldwide, according to Oxfam, the charity's second-biggest bookseller. "It's always good for an author to know that their books are popular," says the publisher, who says it's "a useful recycling exercise going on – it’s not just people saying 'I've read The Da Vinci Code and now I must get rid of it.'" The book has also been donated to charity by readers keen to make some room on their shelves.
– The GI Bill has been delayed by more than a year, and the VA says it will not be able to make retroactive payments to those who didn't receive their benefits for months due to a glitch in the system, reports the Tennessean. "We're just shocked. We were angry because we figured this would be automatic," says VA spokesman Curtis Cashour, who says the agency has "a legal justification that would allow them to move forward with this decision." "It was just going to hurt me more financially. I was freaking out and 

In [13]:
rouge_output = rouge.compute(
    predictions=result["predicted"], references=result["summary"][:max_target_length], rouge_types=["rouge2"]
)["rouge2"].mid

print({
    "rouge2_precision": round(rouge_output.precision, 4),
    "rouge2_recall": round(rouge_output.recall, 4),
    "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
})

{'rouge2_precision': 0.1559, 'rouge2_recall': 0.0864, 'rouge2_fmeasure': 0.1089}


In [11]:
eval_dataset['document']

['Whether a sign of a good read; or a comment on the \'pulp\' nature of some genres of fiction, the Oxfam second-hand book charts have remained in The Da Vinci Code author\'s favour for the past four years. \n  \n Dan Brown has topped Oxfam\'s \'most donated\' list again, his fourth consecutive year. Having sold more than 80 million copies of The Da Vinci Code and had all four of his novels on the New York Times bestseller list in the same week, it\'s hardly surprising that Brown\'s hefty tomes are being donated to charity by readers keen to make some room on their shelves. \n  \n Another cult crime writer responsible to heavy-weight hardbacks, Stieg Larsson, is Oxfam\'s \'most sold\' author for the second time in a row. Both the \'most donated\' and \'most sold\' lists are dominated by crime fiction, trilogies and fantasy, with JK Rowling the only female author listed in either of the Top Fives. \n  \n Click here or on "View Gallery" to see both charts in pictures ||||| A woman reads 

In [12]:
eval_dataset['summary']

['– The Da Vinci Code has sold so many copies—that would be at least 80 million—that it\'s bound to turn up in book donation piles. But at one charity shop in the UK, it\'s been donated so heavily that the shop has posted a sign propped up on a tower of Da Vinci Code copies that reads: "You could give us another Da Vinci Code... but we would rather have your vinyl!" The manager of the Oxfam shop in Swansea tells the Telegraph that people are laughing and taking pictures of the sizable display: "I would say that we get one copy of the book every day." He says people buy them "occasionally," but with vinyl sales up 25% in the past year, they\'d rather take records. Dan Brown\'s book isn\'t the only one that shops like Oxfam struggle to re-sell. Last year, Oxfam was hit with a large and steady supply of Fifty Shades of Grey, and it similarly begged donors: "Please—no more." But Brown has a particular kind of staying power. The Da Vinci Code was published in 2003, and within six years Brow

# Text Generation

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Summarization/LED")

loading configuration file /content/drive/MyDrive/Summarization/LED/config.json
Model config LEDConfig {
  "_name_or_path": "allenai/led-base-16384",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "gradient_checkpointing": true,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL

In [None]:
ARTICLE = data['test']['document'][0]

ARTICLE

'GOP Eyes Gains As Voters In 11 States Pick Governors \n  \n Enlarge this image toggle caption Jim Cole/AP Jim Cole/AP \n  \n Voters in 11 states will pick their governors tonight, and Republicans appear on track to increase their numbers by at least one, with the potential to extend their hold to more than two-thirds of the nation\'s top state offices. \n  \n Eight of the gubernatorial seats up for grabs are now held by Democrats; three are in Republican hands. Republicans currently hold 29 governorships, Democrats have 20, and Rhode Island\'s Gov. Lincoln Chafee is an Independent. \n  \n Polls and race analysts suggest that only three of tonight\'s contests are considered competitive, all in states where incumbent Democratic governors aren\'t running again: Montana, New Hampshire and Washington. \n  \n While those state races remain too close to call, Republicans are expected to wrest the North Carolina governorship from Democratic control, and to easily win GOP-held seats in Utah, N

In [None]:
inputs = tokenizer.encode(prefix + ARTICLE, return_tensors="pt", max_length=512, truncation=True)

outputs = model.generate(
    inputs, 
    max_length=150, 
    min_length=40,
    num_beams=5, 
    no_repeat_ngram_size=2, 
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Output:
----------------------------------------------------------------------------------------------------
– Republicans appear on track to increase their numbers by at least one tonight, with the potential to extend their hold to more than two-thirds of the nation's top state offices. Eight states will pick their governors tonight; three are now held by Democrats, and Rhode Island's Gov. Lincoln Chafee is an Independent. Republicans currently hold 29 governorships, Democrats have 20 and independents have just one. And that's health care, says political scientist Thad Kousser, co-author of The Power of American Governors. "No matter who wins the presidency, national politics is going to be stalemated on the Affordable Care Act," he says.


Beam Search

In [None]:
outputs = model.generate(
    inputs, 
    max_length=150, 
    min_length=40,
    num_beams=5, 
    no_repeat_ngram_size=2,
    repetition_penalty=2.0, 
    num_return_sequences=3, 
    early_stopping=True
) 

print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

Sampling

In [None]:
outputs = model.generate(
    inputs,
    do_sample=True, 
    max_length=150, 
    top_k=50, 
    top_p=0.95, 
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))