In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback
from transformers import DataCollatorForSeq2Seq
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, Dataset
from datasets import concatenate_datasets
from transformers import T5Tokenizer, LongT5Model, LongT5Config, LongT5ForConditionalGeneration
from transformers import GenerationConfig
from random import randrange
import evaluate
import nltk
from ast import literal_eval
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from collections import Counter
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/risto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# !pip install nltk

In [3]:
# !pip install evaluate

In [4]:
# !pip install rouge_score

In [5]:
# !pip install py7zr

In [6]:
# !pip install peft

In [7]:
# !pip install transformers --upgrade

source: https://towardsdatascience.com/how-to-adapt-a-multilingual-t5-model-for-a-single-language-b9f94f3d9c90

## model

In [8]:
model_id="agemagician/mlong-t5-tglobal-base"

In [9]:
tokenizer = T5Tokenizer.from_pretrained(model_id)
model = LongT5ForConditionalGeneration.from_pretrained(model_id)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [11]:
def msize(m):
    return sum(p.numel() for p in m.parameters())
print(msize(model.shared) / msize(model))   
print(msize(model.lm_head) / msize(model))  

0.3325811738746633
0.3325811738746633


about 33% are embeddings

## data

In [6]:
dataset_id = "riigikogu"
df=pd.read_excel('data/riigikogu/for_summarization_mbart_2048_chunks_summaries.xlsx')
df_train=df[df.split=='train']
df_test=df[df.split=='test']

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 3516
Test dataset size: 308


In [12]:
train_dataset=train_dataset.filter(lambda example, idx: example['summary'] is not None and example['text'] is not None, with_indices=True)
test_dataset = test_dataset.filter(lambda example, idx: example['summary'] is not None and example['text'] is not None, with_indices=True)

Filter:   0%|          | 0/3516 [00:00<?, ? examples/s]

Filter:   0%|          | 0/308 [00:00<?, ? examples/s]

In [21]:
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 3516
Test dataset size: 308


## prep data for training

In [1]:
max_source_length=2048

In [2]:
max_target_length=512

In [36]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = ["summarize: " + item for item in sample["text"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["text", "summary"])
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["text", "summary"])

Map:   0%|          | 0/3516 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

## evaluator

In [29]:
# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [38]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

## training 

In [39]:
early_stop = EarlyStoppingCallback(3, 0.0)

In [40]:
# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"#for some reason this was not working

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=20,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
    # generation_max_length=40
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stop]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [None]:
# Start training
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.6682,1.623124,19.3914,10.5486,17.8641,18.881,19.0
2,2.0278,1.556585,18.8687,10.1349,17.4251,18.4289,19.0
3,1.8577,1.522935,19.5788,10.8327,18.0165,19.07,19.0
4,1.7561,1.490596,19.7577,10.9015,18.2709,19.2736,19.0
5,1.6794,1.474273,19.7409,11.0758,18.3059,19.3082,19.0
6,1.6111,1.479437,19.9551,11.3665,18.6515,19.4783,19.0
7,1.5587,1.463167,20.0035,11.1444,18.5048,19.551,19.0
8,1.5153,1.464189,19.8393,11.1896,18.3865,19.3468,19.0
9,1.4596,1.46467,19.9031,11.3053,18.543,19.5164,19.0
10,1.4136,1.455722,19.6306,11.0157,18.3224,19.2194,19.0




## calculate final metrics

In [5]:
model_id='mlong-t5-tglobal-base-riigikogu/checkpoint-14064/'
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model=model.to('cuda')

In [27]:
def calc_texts_labels_metrics(texts, summaries, max_input_length=2048, max_output_length=512, batch_size = 10):
    true_labels=tokenizer(
        summaries, return_tensors="pt",padding="max_length", truncation=True, max_length=max_input_length
    ).input_ids.cpu()
    input_ids = tokenizer(
        texts, return_tensors="pt", padding="max_length", truncation=True, max_length=max_input_length
    ).input_ids  
    
    predictions = []
    for i in range(0, input_ids.size(0), batch_size):
        batch_input_ids = input_ids[i:i + batch_size].to('cuda')
        batch_outputs = model.generate(input_ids=batch_input_ids, max_length=max_output_length)
        predictions.extend(batch_outputs.cpu().detach().numpy())
    max_length = max(len(p) for p in predictions)
    padded_predictions = [np.pad(p, (0, max_length - len(p)), mode='constant') for p in predictions]
    outputs = torch.tensor(padded_predictions)
    # outputs = torch.tensor(predictions)
    eval_preds = (outputs, true_labels.cpu())
    metrics = compute_metrics(eval_preds)
    return metrics

In [30]:
eval_metrics=calc_texts_labels_metrics(df_test.text.tolist(), df_test.summary.tolist())

In [31]:
eval_metrics

{'rouge1': 36.8194,
 'rouge2': 16.7082,
 'rougeL': 29.155,
 'rougeLsum': 34.3118,
 'gen_len': 109.56818181818181}

## example usage

In [51]:
model_id='mlong-t5-tglobal-base-riigikogu/checkpoint-14064/'
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [52]:
model=model.to('cuda')

In [13]:
def summarize(text, model, tokenizer, max_new_tokens=512):
    input_ids = tokenizer(
         text, return_tensors="pt"
    ).input_ids  # Batch size 1
    outputs = model.generate(input_ids=input_ids.to('cuda'), max_new_tokens=max_new_tokens)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [14]:
summarize(df_test.text.tolist()[0], model, tokenizer)

- Sven Sester annab ülevaate riigieelarve seaduse eelnõu ettevalmistamisest teiseks lugemiseks.
- Eelnõu eesmärk on võetud üle Euroopa Nõukogu direktiiv 2011/85/EL liikmesriikide eelarveraamistiku miinimumnõuete kohta.
- Muudatusettepanekud hõlmavad ka tasakaalureeglit valitsuse ja teiste valitsussektori institutsioonide tegevusele ning kompenseerimismehhanismi leevendamist.


In [17]:
summarize("summarize: Studies have shown that owning a dog is good for you", model, tokenizer)

- Studies have shown that owning a dog is good for you
- Studies have shown that owning a dog is good for you
- A dog is a kind, and a pet is a kind


In [18]:
summarize(df_test.text.tolist()[1], model, tokenizer)

- Sven Sester selgitab Eesti Panga hinnangu andmise aja reguleerimise eelnõu sisu
- Muudatusettepanekud seoses rahastamisotsuste lisamisega arengukavasse
- Riigikogu peaks kaaluma poliitika põhialuste kinnitamist arengudokumendina


In [19]:
summarize(df_test.text.tolist()[2], model, tokenizer)

- Sven Sester rõhutas vajadust tuua riigi eelarvestrateegia riiklikult tähtsa küsimusena Riigikogus arutusele ning selgitada eelarve kulutuste tulemuslikkust.
- Komisjon käsitles töötuskindlustusmakse määrade kehtestamise regulatsiooni ja Euroopa Keskpanga arvamust eelnõu kohta.
- Rahanduskomisjon koostas muudatusettepanekuid eelarvestrateegia ja riigi eelarvestrateegia süsteemi regulatsioonide täiendamiseks ning otsustas võtta eelnõu teiseks lugemiseks täiskogu istungi päevakorda 15. jaanuaril.


In [20]:
summarize(df_test.text.tolist()[3], model, tokenizer)

- Jaak Allik küsib, millise sisu annab eelnõu lausele valdkonna arengukava esitamiseks enne kinnitamist Riigikogule arutamiseks.
- Kalev Kotkas küsib, kuidas tasakaalus on eelarveaasta põhitegevuse tulude ja kulude vahe ning kuidas see tasakaalus on.
- Rannar Vassiljev küsib, miks rahandusministrile antakse piiramatu õigus ajutiselt piirata riigieelarves ettenähtud väljamaksete tegemist.


In [21]:
text="""Veel veebruaris oli Soome kõige populaarsem partei Koonderakond, kuid kaotas märtsis selle tiitli SDP-le. Märtsis langes peaministripartei  toetus ühe protsendi võrra ning Koonderakonna toetus on nüüd 20,6 protsenti.
SDP suurendas toetust naiste ja noorte hulgas. Märtsis tõusis SDP toetus 1,9 protsenti ning erakonna toetus on nüüd 21,7 protsenti. 
Koonderakonna kannul on Põlissoomlased, rahandusminister Riikka Purra kodupartei toetus on 17,4 protsenti."""
summarize(text, model, tokenizer)

- Koonderakond kaotas märtsis tiitli SDP-le
- SDP suurendas toetust naiste ja noorte hulgas
- Põlissoomlased ja rahandusminister Riikka Purra kodupartei toetus on 17,4 protsenti
