# Libraries & Functions

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
from datasets import Dataset, DatasetDict

In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer

In [6]:
import evaluate
rouge_score = evaluate.load("rouge")

In [8]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
from nltk.tokenize import sent_tokenize

def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

In [10]:
max_input_length = 512
max_target_length = 50


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["Abstract"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["Title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
from sklearn.model_selection import train_test_split

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [12]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["Abstract"]]
    return metric.compute(predictions=summaries, references=dataset["Title"])

In [13]:
"""Connect to Huggingface Hub"""
!git config --global user.email "viktor.domazetoski@hotmail.com"
!git config --global user.name "ViktorDo1"

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Input Data

In [7]:
# from google.colab import drive
# drive.mount("/content/drive")

In [8]:
raw_datasets = dict()

## BioarXiv

In [9]:
data_location_bioarxiv = "../Datasets/Species_Text_Data/"

In [10]:
df_abstracts = pd.read_csv(data_location_bioarxiv + "BioArxiv_abstract.tsv", sep = "\t")
df_titles = pd.read_csv(data_location_bioarxiv + "BioArxiv_title.tsv", sep = "\t")
df_summarization_bioarxiv = pd.DataFrame()
df_summarization_bioarxiv["Abstract"] = df_abstracts.values[:,0]
df_summarization_bioarxiv["Title"] = df_titles.values[:,0]
df_summarization_bioarxiv = df_summarization_bioarxiv[df_summarization_bioarxiv["Abstract"].notna()]

raw_datasets["BioArxiv"] = df_summarization_bioarxiv

In [11]:
df_summarization_bioarxiv

Unnamed: 0,Abstract,Title
0,There is profound interest in knowing the degr...,The Effectiveness of China’s National Forest P...
1,Continuous measurements of sap flow have been ...,Sap flow through petioles and petiolules revea...
2,Sampling is a fundamental tool in ecology and ...,Sampling principles for biodiversity study
3,Biologically meaningful and standardized defin...,"Black rhinoceros demography should be stage, n..."
4,Vaccines are the cornerstone of influenza cont...,Assessing the Use of Antiviral Treatment to Co...
...,...,...
7690,"Predators can alter the abundance, distributio...",The combined impact of killer whale consumptiv...
7691,Extreme droughts can weaken the biotic resista...,Soil legacies of extreme droughts enhance the ...
7692,The insurance effect of biodiversity—that dive...,How to measure response diversity
7693,The CLUE-S model is a popular choice for model...,Improving the predictive performance of CLUE-S...


## Living Planet Index

In [12]:
colab_dir = "../Datasets/Text_BioMacro/"

In [13]:
dataset_name = "LPI"
columns = ["Abstract", "Journal", "Title"]
LPI_negatives = pd.read_csv(colab_dir + "prepared_lpi_negatives.csv", usecols=columns)
LPI_negatives["label"] = 0

LPI_positives = pd.read_csv(colab_dir + "prepared_lpi_positives.csv", usecols=columns)
LPI_positives["label"] = 1


raw_datasets[dataset_name] = pd.concat([LPI_positives, LPI_negatives])
raw_datasets[dataset_name] = raw_datasets[dataset_name].drop(["Journal", "label"], axis = 1)
del LPI_positives
del LPI_negatives

In [14]:
raw_datasets[dataset_name]

Unnamed: 0,Abstract,Title
0,even though intensive aquaculture production o...,aquaculture non native salmonid invasions and ...
1,because sea otters enhydra lutris exert a wide...,bald eagles and sea otters in the aleutian arc...
2,interactions between sea otters enhydra lutris...,changes in sea urchins and kelp following a re...
3,bacterial abundance production and extracellul...,microbial activity and carbon nitrogen and pho...
4,the main objective of many conservation progra...,density dependent productivity depression in p...
...,...,...
4995,for any enzyme catalyzed reaction to occur the...,relationships between protein encoding gene ab...
4996,high intensity functional training hift is a p...,is high intensity functional training hift cro...
4997,the developmental plasticity of plants relies ...,differential tor activation and cell prolifera...
4998,ocean acidification produced by dissolution of...,global declines in oceanic nitrification rates...


## PREDICTS

In [15]:
dataset_name = "PREDICTS"
columns = ["Abstract", "Journal", "Title"]
PREDICTS_negatives = pd.read_csv(colab_dir + "prepared_predicts_negatives.csv", usecols=columns)
PREDICTS_negatives["label"] = 0

PREDICTS_positives = pd.read_csv(colab_dir + "prepared_predicts_positives.csv", usecols=columns)
PREDICTS_positives["label"] = 1

raw_datasets[dataset_name] = pd.concat([PREDICTS_positives, PREDICTS_negatives])
raw_datasets[dataset_name] = raw_datasets[dataset_name].drop(["Journal", "label"], axis = 1)

del PREDICTS_positives
del PREDICTS_negatives

In [16]:
raw_datasets[dataset_name]

Unnamed: 0,Abstract,Title
0,bees are believed to be dominant pollen vector...,bee diversity along a disturbance gradient in ...
1,the maintenance of grasslands as distinct habi...,grazing intensity and the diversity of grassho...
2,male euglossine bees were sampled with chemica...,abundance and diversity of euglossine bees in ...
3,niche breadth of species has been hypothesized...,ecological specialization and susceptibility t...
4,bumblebees hymenoptera apidae are important po...,use of genetic markers to quantify bumblebee f...
...,...,...
4995,we tested the hypothesis that the appearance o...,exo enzymatic activities and dissolved organic...
4996,given a constantly increasing urban population...,numerical study of the impact of vegetation co...
4997,musty and earthy odors frequently characterize...,contribution of streptomyces in sediment to ea...
4998,we selected five typical tree species includin...,utilization of lightflecks by seedlings of fiv...


## Preprocess Datasets

In [42]:
summarization_dataset_dict = dict()

for dataset_name in list(raw_datasets.keys()):
  indices_train, indices_test \
      = train_test_split(raw_datasets[dataset_name].index.values, test_size=0.25, random_state=42)

  df_summarization_train = raw_datasets[dataset_name].loc[indices_train]
  df_summarization_test = raw_datasets[dataset_name].loc[indices_test]

  summarization_dataset_dict[dataset_name] = DatasetDict()
  summarization_dataset_dict[dataset_name]["train"] = Dataset.from_pandas(df_summarization_train)
  summarization_dataset_dict[dataset_name]["test"] = Dataset.from_pandas(df_summarization_test)

# Model Training & Evaluation

## Flan T5

In [26]:
model_checkpoint = "google/flan-t5-base" #t5-small
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [43]:
tokenized_dataset_dict = dict()
for dataset_name in list(raw_datasets.keys()):
  tokenized_dataset_dict[dataset_name] = summarization_dataset_dict[dataset_name].map(preprocess_function, batched=True)

Map:   0%|          | 0/5770 [00:00<?, ? examples/s]

Map:   0%|          | 0/1924 [00:00<?, ? examples/s]

Map:   0%|          | 0/5138 [00:00<?, ? examples/s]

Map:   0%|          | 0/1761 [00:00<?, ? examples/s]

Map:   0%|          | 0/4929 [00:00<?, ? examples/s]

Map:   0%|          | 0/1679 [00:00<?, ? examples/s]

In [28]:
for dataset_name in list(raw_datasets.keys()):
  score = evaluate_baseline(summarization_dataset_dict[dataset_name]["test"], rouge_score)
  rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
  rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
  print(dataset_name, rouge_dict)

BioArxiv {'rouge1': 18.55, 'rouge2': 6.38, 'rougeL': 13.62, 'rougeLsum': 15.66}
LPI {'rouge1': 11.21, 'rouge2': 5.23, 'rougeL': 8.68, 'rougeLsum': 8.67}
PREDICTS {'rouge1': 11.18, 'rouge2': 5.09, 'rougeL': 8.65, 'rougeLsum': 8.65}


In [29]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [34]:
for dataset_name in list(raw_datasets.keys()):
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  batch_size = 8
  num_train_epochs = 3
  # Show the training loss with every epoch
  logging_steps = len(tokenized_dataset_dict[dataset_name]["train"]) // batch_size
  model_name = model_checkpoint.split("/")[-1]

  args = Seq2SeqTrainingArguments(
      output_dir=f"{model_name}-finetuned-summaries-{dataset_name}",
      evaluation_strategy="epoch",
      learning_rate=5.6e-5,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_total_limit=3,
      num_train_epochs=num_train_epochs,
      predict_with_generate=True,
      logging_steps=logging_steps,
      push_to_hub=True,
  )

  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  tokenized_datasets = tokenized_dataset_dict[dataset_name].remove_columns(
    summarization_dataset_dict[dataset_name]["train"].column_names
  )

  trainer = Seq2SeqTrainer(
      model,
      args,
      train_dataset=tokenized_dataset_dict[dataset_name]["train"],
      eval_dataset=tokenized_dataset_dict[dataset_name]["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )
  trainer.train()

  print(dataset_name)
  print(trainer.evaluate())

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.3891,2.102346,37.4416,15.5836,31.3282,31.3557
2,2.1936,2.079256,37.5552,15.5571,31.2356,31.2848
3,2.0965,2.078493,37.6428,15.5821,31.3016,31.3601


BioArxiv


{'eval_loss': 2.0784928798675537, 'eval_rouge1': 37.6428, 'eval_rouge2': 15.5821, 'eval_rougeL': 31.3016, 'eval_rougeLsum': 31.3601, 'eval_runtime': 259.4403, 'eval_samples_per_second': 7.416, 'eval_steps_per_second': 0.929, 'epoch': 3.0}


TypeError: ignored

In [45]:
for dataset_name in list(raw_datasets.keys())[1:]:
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  batch_size = 8
  num_train_epochs = 3
  # Show the training loss with every epoch
  logging_steps = len(tokenized_dataset_dict[dataset_name]["train"]) // batch_size
  model_name = model_checkpoint.split("/")[-1]

  args = Seq2SeqTrainingArguments(
      output_dir=f"{model_name}-finetuned-summaries-{dataset_name}",
      evaluation_strategy="epoch",
      learning_rate=5.6e-5,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_total_limit=3,
      num_train_epochs=num_train_epochs,
      predict_with_generate=True,
      logging_steps=logging_steps,
      push_to_hub=True,
  )

  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  tokenized_datasets = tokenized_dataset_dict[dataset_name].remove_columns(
    summarization_dataset_dict[dataset_name]["train"].column_names
  )

  trainer = Seq2SeqTrainer(
      model,
      args,
      train_dataset=tokenized_dataset_dict[dataset_name]["train"],
      eval_dataset=tokenized_dataset_dict[dataset_name]["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )
  trainer.train()

  print(dataset_name)
  print(trainer.evaluate())

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.0532,1.768475,39.4902,18.2882,33.8007,33.7651
2,1.8524,1.734018,40.3383,19.348,34.8847,34.8498
3,1.7578,1.716138,40.6551,19.5025,35.1574,35.1162


LPI


{'eval_loss': 1.7161375284194946, 'eval_rouge1': 40.6551, 'eval_rouge2': 19.5025, 'eval_rougeL': 35.1574, 'eval_rougeLsum': 35.1162, 'eval_runtime': 235.4239, 'eval_samples_per_second': 7.48, 'eval_steps_per_second': 0.939, 'epoch': 3.0}


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.0902,1.786919,39.395,17.617,33.5651,33.5775
2,1.8862,1.744547,40.1411,18.077,34.1945,34.1813
3,1.7932,1.742668,40.4435,18.3476,34.4782,34.4966


PREDICTS


{'eval_loss': 1.7426677942276, 'eval_rouge1': 40.4435, 'eval_rouge2': 18.3476, 'eval_rougeL': 34.4782, 'eval_rougeLsum': 34.4966, 'eval_runtime': 228.4059, 'eval_samples_per_second': 7.351, 'eval_steps_per_second': 0.919, 'epoch': 3.0}


## BART

In [46]:
model_checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [47]:
tokenized_dataset_dict = dict()
for dataset_name in list(raw_datasets.keys()):
  tokenized_dataset_dict[dataset_name] = summarization_dataset_dict[dataset_name].map(preprocess_function, batched=True)

Map:   0%|          | 0/5770 [00:00<?, ? examples/s]

Map:   0%|          | 0/1924 [00:00<?, ? examples/s]

Map:   0%|          | 0/5138 [00:00<?, ? examples/s]

Map:   0%|          | 0/1761 [00:00<?, ? examples/s]

Map:   0%|          | 0/4929 [00:00<?, ? examples/s]

Map:   0%|          | 0/1679 [00:00<?, ? examples/s]

In [48]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [50]:
for dataset_name in list(raw_datasets.keys())[:]:
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  batch_size = 8
  num_train_epochs = 3
  # Show the training loss with every epoch
  logging_steps = len(tokenized_dataset_dict[dataset_name]["train"]) // batch_size
  model_name = model_checkpoint.split("/")[-1]

  args = Seq2SeqTrainingArguments(
      output_dir=f"{model_name}-finetuned-summaries-{dataset_name}",
      evaluation_strategy="epoch",
      learning_rate=5.6e-5,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_total_limit=3,
      num_train_epochs=num_train_epochs,
      predict_with_generate=True,
      logging_steps=logging_steps,
      push_to_hub=True,
  )

  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  tokenized_datasets = tokenized_dataset_dict[dataset_name].remove_columns(
    summarization_dataset_dict[dataset_name]["train"].column_names
  )

  trainer = Seq2SeqTrainer(
      model,
      args,
      train_dataset=tokenized_dataset_dict[dataset_name]["train"],
      eval_dataset=tokenized_dataset_dict[dataset_name]["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )
  trainer.train()

  print(dataset_name)
  print(trainer.evaluate())

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.8823,2.499151,38.0791,16.5388,32.2686,32.329
2,2.2944,2.42503,38.3288,16.5435,32.3596,32.4185
3,1.9627,2.4201,38.8359,16.7425,32.8002,32.8303


BioArxiv


{'eval_loss': 2.420100450515747, 'eval_rouge1': 38.8359, 'eval_rouge2': 16.7425, 'eval_rougeL': 32.8002, 'eval_rougeLsum': 32.8303, 'eval_runtime': 156.593, 'eval_samples_per_second': 12.287, 'eval_steps_per_second': 1.539, 'epoch': 3.0}


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.6282,2.175644,41.298,20.4257,36.1032,36.05
2,2.0324,2.05302,42.3223,21.6968,37.1757,37.1025
3,1.6874,2.026464,42.8284,22.1656,37.5993,37.541


LPI


{'eval_loss': 2.026463508605957, 'eval_rouge1': 42.8284, 'eval_rouge2': 22.1656, 'eval_rougeL': 37.5993, 'eval_rougeLsum': 37.541, 'eval_runtime': 137.4486, 'eval_samples_per_second': 12.812, 'eval_steps_per_second': 1.608, 'epoch': 3.0}


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.6937,2.211818,41.521,19.5656,35.7046,35.7044
2,2.0774,2.093801,41.5301,20.0789,36.036,36.0283
3,1.7338,2.070686,42.7919,20.9367,36.9789,36.9821


PREDICTS


{'eval_loss': 2.0706863403320312, 'eval_rouge1': 42.7919, 'eval_rouge2': 20.9367, 'eval_rougeL': 36.9789, 'eval_rougeLsum': 36.9821, 'eval_runtime': 130.4759, 'eval_samples_per_second': 12.868, 'eval_steps_per_second': 1.609, 'epoch': 3.0}
