In [1]:
!pip install transformers datasets evaluate rouge_score

[0m

In [2]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")
billsum = billsum.train_test_split(test_size=0.2)

In [3]:
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 48900 of the Education Code is amended to read:\n48900.\nA pupil shall not be suspended from school or recommended for expulsion, unless the superintendent of the school district or the principal of the school in which the pupil is enrolled determines that the pupil has committed an act as defined pursuant to any of subdivisions (a) to (r), inclusive:\n(a) (1) Caused, attempted to cause, or threatened to cause physical injury to another person.\n(2) Willfully used force or violence upon the person of another, except in self-defense.\n(b) Possessed, sold, or otherwise furnished a firearm, knife, explosive, or other dangerous object, unless, in the case of possession of an object of this type, the pupil had obtained written permission to possess the item from a certificated school employee, which is concurred in by the principal or the designee of the principal.\n(c) Unlawfully possessed, used, 

In [4]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [5]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [8]:
import evaluate

rouge = evaluate.load("rouge")

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)



In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./my_awesome_billsum_model/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    # push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjylee9018[0m ([33mupstage6_doc_classification[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.860428,0.1308,0.0375,0.1087,0.1086,19.0
2,No log,2.652555,0.1372,0.0475,0.1149,0.1144,19.0
3,No log,2.592746,0.1409,0.0496,0.1165,0.1165,19.0
4,No log,2.576018,0.1431,0.0522,0.1185,0.1183,19.0


TrainOutput(global_step=248, training_loss=3.0191254154328377, metrics={'train_runtime': 98.6464, 'train_samples_per_second': 40.103, 'train_steps_per_second': 2.514, 'total_flos': 1070824333246464.0, 'train_loss': 3.0191254154328377, 'epoch': 4.0})