In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install rouge_score
!pip install evaluate

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [2]:
from datasets import Dataset, DatasetDict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, \
                          AutoModelForSeq2SeqLM, BartForConditionalGeneration, BartTokenizer, BigBirdPegasusModel
import evaluate
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback


In [3]:
from google.colab import files
import io

uploaded = files.upload()

Saving full_text.csv to full_text.csv


# Prep data

In [4]:
data = pd.read_csv('full_text.csv')
new_column_names = {
    'abstract': 'summary',
    'full_text': 'text'
}

data = data.rename(columns=new_column_names)

text = data.drop('summary', axis=1, inplace=False)
labels = data['summary']

text_train, text_test, labels_train, labels_test = train_test_split(text, labels, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(text_train.join(labels_train))
test_dataset = Dataset.from_pandas(text_test.join(labels_test))

# Create a DatasetDict with train and test keys
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

max_lengths = data.applymap(lambda x: len(str(x))).max(axis=0)


In [5]:
data.applymap(lambda x: len(str(x))).max(axis=0)

summary     2244
text       81438
dtype: int64

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 4
    })
    test: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 1
    })
})

In [7]:
class TrainSummarizerModels:

    def __init__(self, data, tokenizer, checkpoint, model):

      self.tokenizer = tokenizer
      self.checkpoint = checkpoint
      self.data = data
      self.data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
      self.rouge = evaluate.load("rouge")
      self.model = model

    def preprocess_function(self, examples):

      prefix = "summarize: "
      inputs = [prefix + doc for doc in examples["text"]]
      model_inputs = self.tokenizer(inputs, max_length=81438, truncation=True)
      labels = self.tokenizer(text_target=examples["summary"], max_length=2244, truncation=True)

      model_inputs["labels"] = labels["input_ids"]
      return model_inputs

    def tokenize_data(self):

      tokenized_data = dataset.map(self.preprocess_function, batched=True)
      return tokenized_data

    def compute_metrics(eval_pred, self):
      predictions, labels = eval_pred
      decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
      labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
      decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

      result = self.rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

      prediction_lens = [np.count_nonzero(pred != self.tokenizer.pad_token_id) for pred in predictions]
      result["gen_len"] = np.mean(prediction_lens)

      return {k: round(v, 4) for k, v in result.items()}

    def train_and_eval(self):

      tokenized_data = self.tokenize_data()

      training_args = Seq2SeqTrainingArguments(
      output_dir=f"{self.checkpoint}_model",
      evaluation_strategy="epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=4,
      per_device_eval_batch_size=1,
      weight_decay=0.01,
      save_total_limit=3,
      num_train_epochs=4,
      predict_with_generate=True,
      fp16=False,
      push_to_hub=False)

      trainer = Seq2SeqTrainer(
      model=self.model,
      args=training_args,
      train_dataset=tokenized_data["train"],
      eval_dataset=tokenized_data["test"],
      tokenizer=self.tokenizer,
      data_collator=self.data_collator,
      compute_metrics=self.compute_metrics)

      trainer.train()
      eval_results = trainer.evaluate()
      print(f"Model {self.checkpoint} evaluation results:")
      print(eval_results)
      return eval_results




In [8]:
checkpoint = "google-t5/t5-small"
t5 = TrainSummarizerModels(dataset,
                          tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small"),
                          checkpoint = checkpoint,
                          model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
                          )
t5_eval = t5.train_and_eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 70.90 GiB. GPU 0 has a total capacity of 14.75 GiB of which 12.63 GiB is free. Process 25637 has 2.11 GiB memory in use. Of the allocated memory 1.96 GiB is allocated by PyTorch, and 32.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
checkpoint = "facebook/bart-large"

BART = TrainSummarizerModels(dataset,
                          tokenizer = BartTokenizer.from_pretrained("facebook/bart-large"),
                          checkpoint = checkpoint,
                          model = BartForConditionalGeneration.from_pretrained(checkpoint, forced_bos_token_id=0)
                          )
BART_eval = BART.train_and_eval()

In [None]:
checkpoint = "google/bigbird-pegasus-large-arxiv"

BBP = TrainSummarizerModels(dataset,
                          tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv"),
                          checkpoint = checkpoint,
                          model = BigBirdPegasusModel.from_pretrained("google/bigbird-pegasus-large-arxiv")
                          )
BBP_eval = BBP.train_and_eval()

In [None]:
def select_best_model(eval1, eval2, eval3):
    best_model = None
    best_score = 0
    for eval_results, model in [(eval1, t5), (BART_eval, BART), (BBP_eval, BBP)]:
        rouge_score = eval_results["test_rouge_score"]
        if rouge_score > best_score:
            best_score = rouge_score
            best_model = model
    return best_model

best_model = select_best_model(t5_eval, BART_eval, BBP_eval)
print(f"Best model: {best_model.checkpoint}")

In [None]:
best_model.save_pretrained(f"{best_model.checkpoint}_best_model")

In [None]:
app_model = AutoModelForSeq2SeqLM.from_pretrained(f"{best_model.checkpoint}_best_model")
app_tokenizer = AutoTokenizer.from_pretrained(f"{best_model.checkpoint}_best_model")

def summarize_paper(text):
    inputs = app_tokenizer(text, return_tensors="pt")
    outputs = app_model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"])
    summary = app_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary