In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install rouge-score nltk

In [None]:
import transformers

# Hub identifier of the pretrained checkpoint; the tokenizer, the model and
# the training output directory name are all derived from this value.
model_checkpoint = "google-t5/t5-small"

In [None]:
from datasets import load_dataset
from evaluate import load

# Configuration 'd' of the big_patent dataset; the splits are kept untouched
# under `raw_dataset` so later cells can derive tokenized views from it.
raw_dataset = load_dataset(path='big_patent', name='d')

# ROUGE scorer used by `compute_metrics` during evaluation.
eval_metric = load(path='rouge')

In [None]:
# Display the loaded dataset object (cell output) as a quick sanity check.
raw_dataset

In [None]:
# Peek at the first description. Slice the rows first and pick the column
# second, so only one record is read instead of the entire column.
raw_dataset['train'][:1]['description']

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to `model_checkpoint`, so the text is
# encoded with the same tokenizer files as the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Smoke test: encode a two-sentence batch and show the tokenizer's output.
tokenizer(
    [
        'This is sentence 1',
        'this is sentence 2',
    ]
)

In [None]:
from datasets import concatenate_datasets

# Measure the longest tokenized description (source) and abstract (target)
# over train + test; the results bound `max_length` in `preprocess_function`.
# NOTE(review): truncation=True without an explicit max_length presumably clips
# at the tokenizer's own model_max_length, so the printed maxima are capped at
# that value — confirm this is the intended upper bound.
train_test_dataset = concatenate_datasets([raw_dataset['train'], raw_dataset['test']])

# Source side: tokenize the patent descriptions of each incoming batch `x`.
tokenized_train_test_source = train_test_dataset.map(
    lambda x: tokenizer(x['description'], truncation=True),
    batched=True,
    remove_columns=['description', 'abstract'],
)
max_source_length = max(len(ids) for ids in tokenized_train_test_source['input_ids'])
print(max_source_length)

# Target side: tokenize the abstracts of each incoming batch `x`.
tokenized_train_test_target = train_test_dataset.map(
    lambda x: tokenizer(x['abstract'], truncation=True),
    batched=True,
    remove_columns=['description', 'abstract'],
)
max_target_length = max(len(ids) for ids in tokenized_train_test_target['input_ids'])
print(max_target_length)

In [None]:
def preprocess_function(samples):
  """Tokenize a batch of examples for seq2seq summarization.

  Args:
    samples: batch mapping with a 'description' list (source texts) and an
      'abstract' list (target summaries).

  Returns:
    The tokenizer output for the prefixed descriptions, with an added
    'labels' entry holding the token ids of the abstracts.
  """
  # Prefix every source text with the task instruction "summarize: ".
  inputs = ["summarize: " + text for text in samples['description']]
  model_inputs = tokenizer(inputs, max_length = max_source_length, truncation = True)

  # Targets are encoded through `text_target` so they are treated as labels.
  model_targets = tokenizer(text_target = samples['abstract'], max_length = max_target_length, truncation = True)

  model_inputs['labels'] = model_targets['input_ids']

  return model_inputs

# Apply the preprocessing to the dataset in batches; the result is stored
# under a new name so `raw_dataset` stays available unchanged.
tokenized_dataset = raw_dataset.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence model weights for `model_checkpoint`;
# this is the model instance that the trainer fine-tunes.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the trainer; it is given
# both the tokenizer and the model it will be collating for.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

import dataclasses

# The per-epoch evaluation option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick the
# name the installed version actually exposes so this cell runs on both.
_training_arg_names = {field.name for field in dataclasses.fields(Seq2SeqTrainingArguments)}
_eval_strategy_kwarg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# NOTE(review): fp16 = True presumably requires a CUDA GPU runtime — confirm
# before running this notebook on CPU-only hardware.
training_args = Seq2SeqTrainingArguments(
    # [-1] keeps the model name whether or not the checkpoint id has a
    # "namespace/" prefix ([1] raised IndexError for ids without a slash).
    output_dir = f"{model_checkpoint.split('/')[-1]}-big_patent-d",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    predict_with_generate = True,  # evaluate on generated text, needed for ROUGE
    fp16 = True,
    learning_rate = 3e-5,
    num_train_epochs = 3,
    push_to_hub = False,
    save_total_limit = 3,
    **{_eval_strategy_kwarg: 'epoch'},
)

In [None]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Sentence-splitting data for `sent_tokenize`. Recent NLTK releases look up
# the "punkt_tab" resource, older ones "punkt"; fetch both so the metric
# function works regardless of the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

def compute_metrics(eval_pred):
  """Compute ROUGE scores and the mean generated length for an evaluation pass.

  Args:
    eval_pred: pair ``(predictions, labels)`` of token-id arrays. Positions
      equal to -100 are treated as padding.

  Returns:
    dict with the ROUGE results scaled to percentages (rounded to 4 decimals)
    plus ``gen_len``, the mean number of non-pad tokens per prediction.
  """
  predictions, labels = eval_pred
  # Some models hand back a tuple; the generated ids are its first element.
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  # -100 is not a valid vocabulary id, so swap it for the pad token before
  # decoding (applies to predictions as well as labels).
  pred_ids = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
  label_ids = np.where(labels != -100, labels, tokenizer.pad_token_id)

  decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens = True)
  decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens = True)

  # One sentence per line, so sentence boundaries are visible to the scorer.
  pred_texts = ['\n'.join(sent_tokenize(pred.strip())) for pred in decoded_preds]
  label_texts = ['\n'.join(sent_tokenize(label.strip())) for label in decoded_labels]
  result = eval_metric.compute(predictions=pred_texts, references=label_texts, use_stemmer=True, use_aggregator=True)

  result = {k: round(v * 100, 4) for k, v in result.items()}
  # Count on the token ids, not on the decoded strings: comparing a string
  # with the pad id would yield a constant length of 1 for every prediction.
  prediction_lens = [np.count_nonzero(ids != tokenizer.pad_token_id) for ids in pred_ids]
  result["gen_len"] = np.mean(prediction_lens)
  return result


In [None]:
# Wire the model, its training configuration, the data and the metric
# function together. Training uses the 'train' split; evaluation runs on
# the 'test' split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the configuration held by `trainer`; the call's
# return value is shown as the cell output.
trainer.train()