In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback, MBartConfig
from transformers import MBartForConditionalGeneration, MBartTokenizerFast,MBartTokenizer,Trainer, TrainingArguments

from transformers import DataCollatorForSeq2Seq
from hftrim.TokenizerTrimmer import TokenizerTrimmer
from hftrim.ModelTrimmers import MBartTrimmer

from transformers import PreTrainedTokenizerFast
# from transformers.convert_slow_tokenizer import convert_slow_tokenizer
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from datasets import concatenate_datasets
from random import randrange
import evaluate
import nltk
import numpy as np
from collections import Counter
from lsg_converter import LSGConverter
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/risto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# !pip install peft

In [4]:
# !pip install nltk

In [5]:
# !pip install evaluate

In [6]:
# !pip install rouge_score

In [7]:
# !pip install py7zr

In [8]:
# !pip install transformers --upgrade

## model

In [5]:
model_id="facebook/mbart-large-cc25"
src_lang = "et_EE" # Example source language code
tgt_lang = "et_EE" # Example target language code

In [9]:
tokenizer_slow = MBartTokenizer.from_pretrained(model_id)
model = MBartForConditionalGeneration.from_pretrained(model_id)

In [10]:
# Important: Set both source and target languages
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

tokenizer_slow.src_lang = src_lang
tokenizer_slow.tgt_lang = tgt_lang

In [11]:
def msize(m):
    return sum(p.numel() for p in m.parameters())
#share of params in embedding
print(msize(model.model.shared) / msize(model))   
print(msize(model.lm_head) / msize(model))  

0.41913215486098887
0.41913215486098887


## data

In [3]:
dataset_id = "TalTechNLP/samsum_ee"
dataset = load_dataset(dataset_id)

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

Train dataset size: 14732
Test dataset size: 819


In [4]:
dataset['train']=dataset['train'].filter(lambda example, idx: example['summary'] is not None and example['dialogue'] is not None, with_indices=True)
dataset['test'] = dataset['test'].filter(lambda example, idx: example['summary'] is not None and example['dialogue'] is not None, with_indices=True)

In [5]:
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

Train dataset size: 13199
Test dataset size: 809


In [6]:
train_dataset = dataset['train'].filter(lambda example, idx: example['summary'] is not None and example['dialogue'] is not None, with_indices=True)
train_dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'en_dialogue', 'en_summary'],
    num_rows: 13199
})

In [7]:
test_dataset = dataset['test'].filter(lambda example, idx: example['summary'] is not None and example['dialogue'] is not None, with_indices=True)
test_dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'en_dialogue', 'en_summary'],
    num_rows: 809
})

## trim models vocab

In [17]:
texts4vocab=train_dataset['dialogue']+train_dataset['summary']
len(texts4vocab)

26398

In [18]:
config = MBartConfig.from_pretrained(model_id)

In [None]:
# trim tokenizer
tokenizer_slow_trim = TokenizerTrimmer(tokenizer_slow)
tokenizer_slow_trim.make_vocab(texts4vocab)
tokenizer_slow_trim.make_tokenizer()

# trim model
model_trim = MBartTrimmer(model, config, tokenizer_slow_trim.trimmed_tokenizer)
model_trim.make_weights(tokenizer_slow_trim.trimmed_vocab_ids)
model_trim.make_model()

Token indices sequence length is longer than the specified maximum sequence length for this model (1981 > 1024). Running this sequence through the model will result in indexing errors
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 250000/250000 [00:00<00:00, 1167225.66it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Save trimmed model and tokenizer

In [None]:
tokenizer_slow_trim.trimmed_tokenizer.save_pretrained('mbart-large-cc25_et')
model_trim.trimmed_model.save_pretrained('mbart-large-cc25_et')

## Load model and tokenizer

In [3]:
# tokenizer = MBartTokenizer.from_pretrained('mbart-large-cc25_et')
tokenizer = MBartTokenizerFast.from_pretrained("mbart-large-cc25_et/", from_slow=True)
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
model = MBartForConditionalGeneration.from_pretrained('mbart-large-cc25_et', max_length=1024)

## Load model and make it accepting longer sequences

In [11]:
converter = LSGConverter(max_sequence_length=2048)

model_id='mbart-large-cc25_et'
model, tokenizer = converter.convert_from_pretrained(model_id, num_global_tokens=7)
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang
print(type(model))

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of LSGMBartForConditionalGeneration were not initialized from the model checkpoint at mbart-large-cc25_et and are newly initialized: ['model.encoder.global_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<class 'lsg_converter.mbart.modeling_lsg_mbart.LSGMBartForConditionalGeneration'>


In [12]:
model.save_pretrained('mbart-large-cc25_lsg_2048_et')

In [26]:
tokenizer.save_pretrained('mbart-large-cc25_lsg_2048_et')

('/opt/work/storage2/localdata/datasience_dev/text/summarization/transformers/mbart-large-cc25_lsg_2048_et/tokenizer_config.json',
 '/opt/work/storage2/localdata/datasience_dev/text/summarization/transformers/mbart-large-cc25_lsg_2048_et/special_tokens_map.json',
 '/opt/work/storage2/localdata/datasience_dev/text/summarization/transformers/mbart-large-cc25_lsg_2048_et/sentencepiece.bpe.model',
 '/opt/work/storage2/localdata/datasience_dev/text/summarization/transformers/mbart-large-cc25_lsg_2048_et/added_tokens.json',
 '/opt/work/storage2/localdata/datasience_dev/text/summarization/transformers/mbart-large-cc25_lsg_2048_et/tokenizer.json')

## prep data for training

In [8]:
max_source_length = 2048

In [9]:
max_target_length=512

In [10]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = ["summarize: " + item for item in sample["dialogue"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])

In [20]:
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])

## evaluator

In [11]:
# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [22]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

## training 

In [23]:
early_stop = EarlyStoppingCallback(3, 0.0)

In [24]:
# Hugging Face repository id
#save to other disc where chmod is supported
repository_id = f"mbart-large-cc25_lsg_2048_et_TalTechNLP/{dataset_id}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    predict_with_generate=True,
    fp16=True,
    learning_rate=5e-5,
    num_train_epochs=10,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stop]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [25]:
trainer.train()

You're using a MBartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.0696,1.933898,36.3398,15.3207,30.8468,33.4511,29.572311
2,1.5221,1.894038,37.5666,15.3059,31.7082,34.7062,26.925834
3,1.0855,1.983595,37.6738,16.1216,32.1039,34.9453,25.63288
4,0.9386,2.337484,34.909,14.002,28.889,32.1824,72.166873
5,0.4993,2.539751,37.4824,15.0629,31.0213,34.4806,31.426452


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=16500, training_loss=1.2926952958540483, metrics={'train_runtime': 18278.2764, 'train_samples_per_second': 7.221, 'train_steps_per_second': 1.805, 'total_flos': 2.8045274990444544e+17, 'train_loss': 1.2926952958540483, 'epoch': 5.0})

## Calc metrics

In [11]:
model_id="mbart-large-cc25_lsg_2048_et_TalTechNLP/TalTechNLP/samsum_ee/checkpoint-6600/"
tokenizer = MBartTokenizerFast.from_pretrained('transformers/mbart-large-cc25_lsg_2048_et',
                                               from_slow=True)
model = MBartForConditionalGeneration.from_pretrained(model_id)

In [12]:
model=model.to('cuda')

In [15]:
def calc_texts_labels_metrics(texts, summaries, model, tokenizer, max_input_length=1024, max_output_length=512, batch_size = 10):
    true_labels=tokenizer(
        summaries, return_tensors="pt",padding="max_length", truncation=True, max_length=max_input_length
    ).input_ids.cpu()
    input_ids = tokenizer(
        texts, return_tensors="pt", padding="max_length", truncation=True, max_length=max_input_length
    ).input_ids  
    
    predictions = []
    for i in range(0, input_ids.size(0), batch_size):
        batch_input_ids = input_ids[i:i + batch_size].to('cuda')
        batch_outputs = model.generate(input_ids=batch_input_ids, max_length=max_output_length)
        predictions.extend(batch_outputs.cpu().detach().numpy())
    max_length = max(len(p) for p in predictions)
    padded_predictions = [np.pad(p, (0, max_length - len(p)), mode='constant') for p in predictions]
    outputs = torch.tensor(padded_predictions)
    eval_preds = (outputs, true_labels.cpu())
    metrics = compute_metrics(eval_preds)
    return metrics

eval_metrics=calc_texts_labels_metrics(dataset['test']['dialogue'], dataset['test']['summary'], model, tokenizer)
eval_metrics

  outputs = torch.tensor(padded_predictions)


{'rouge1': 37.0547,
 'rouge2': 14.8011,
 'rougeL': 30.5113,
 'rougeLsum': 34.1101,
 'gen_len': 456.85784919653895}

## example usage

In [14]:
def summarize(text, model, tokenizer, max_input_length=2048, max_new_tokens=512):
    input_ids = tokenizer(
         text, return_tensors="pt",
        max_length=max_input_length
    ).input_ids  # Batch size 1
    outputs = model.generate(input_ids=input_ids.to('cuda'), max_new_tokens=max_new_tokens)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    
summarize(dataset['test'][0]['dialogue'], model, tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Amanda ei leia Betty telefoninumbrit. Hannah saadab talle s√µnumi Larrylt. Larry helistas talle viimati, kui nad koos pargis oliid. Hannah ei tunne teda h√§sti.


In [30]:
 dataset['test'][0]['dialogue']

'Hannah: Hei, kas sul on Betty number?\nAmanda: Vaatan j√§rele.\nHannah: <file_gif>\nAmanda: Vabandust, ei leia seda.\nAmanda: K√ºsi Larrylt.\nAmanda: Ta helistas talle viimati, kui me koos pargis olime.\nHannah: Ma ei tunne teda h√§sti.\nHannah: <file_gif>\nAmanda: √Ñra ole h√§belik, ta on v√§ga tore.\nHannah: Kui sa √ºtled nii..\nHannah: Ma eelistaksin, et sa talle s√µnumi saadaksid.\nAmanda: Lihtsalt saada talle s√µnum üôÇ\nHannah: Urgh.. Olgu siis\nHannah: N√§gemist\nAmanda: N√§gemist-n√§gemist'

In [18]:
summarize(dataset['test'][1]['dialogue'], model, tokenizer)

Rob ja Eric vaatavad Masin'i. Nad kohtuvad siis.


In [32]:
input_text

"Eric: MASIN!\nRob: See on nii lahe!\nEric: Ma tean! Ja n√§itab, kuidas ameeriklased n√§evad venelasi ;)\nRob: Ja see on t√µesti naljakas!\nEric: Ma tean! Mulle meeldib eriti rongi osa!\nRob: Hahaha! Keegi ei r√§√§gi masinaga nii!\nEric: Kas see on tema ainus etendus?\nRob: Ei tea. Ma vaatan j√§rele.\nEric: Kindlasti.\nRob: Selgub, et ei! M√µned tema etendused on Youtube'is.\nEric: Lahe! Ma vaatan neid kohe!\nRob: Mina ka!\nEric: MASIN!\nRob: MASIN!\nEric: Kuni kohtumiseni?\nRob: Muidugi :)"

In [33]:
dataset['test'][1]['summary']

"Eric ja Rob l√§hevad vaatama stand-up'i YouTube'is."

In [19]:
input_text="""Veel veebruaris oli Soome k√µige populaarsem partei Koonderakond, kuid kaotas m√§rtsis selle tiitli SDP-le. M√§rtsis langes peaministripartei  toetus √ºhe protsendi v√µrra ning Koonderakonna toetus on n√º√ºd 20,6 protsenti.
SDP suurendas toetust naiste ja noorte hulgas. M√§rtsis t√µusis SDP toetus 1,9 protsenti ning erakonna toetus on n√º√ºd 21,7 protsenti. 
Koonderakonna kannul on P√µlissoomlased, rahandusminister Riikka Purra kodupartei toetus on 17,4 protsenti."""
summarize(input_text, model, tokenizer)

M√§rtsis langes peaministripartei toetus √ºhe protsendi v√µrra. M√§rtsis t√µusis SDP toetus 1,9 protsenti ja erakonna toetus 21 21,7 protsenti. M√§rtsis langes kasvas toetust naiste ja Koonderakonna toetus √ºhe protsendi v√µrra. Koonderakonna toetus 1 protsendi v√µrra. M√§rtsis. M√§rtsis langes t√µusis langes SDP toetus √ºhe protsendi v√µrra. M√§rtsis langes M√§rtsis langes M√§rtsis langes SDP toetus √ºhe protsendi v√µrra. M√§rtsis langes SDP toetus 1, M√§rtsis t√µusis SDP toetus 1, M√§rtsis t√µusis SDP toetus 1, SDP toetus 1, SDP toetus 1, M√§rtsis 1, SDP toetus 1, M√§rtsis 1, M√§rtsis 1, M√§rtsis 1, M√§rtsis 1 9 protsenti ja erakonna toetus 1, M√§rtsis 1, SDP toetus 1, SDP toetus 1, M√§rtsis 1, M√§rtsis 1, M√§rtsis 1, M√§rtsis 1, M√§rtsis 1, M√§rtsis 1, M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis. M√§rtsis 1, SDP suurendas 1, SDP s

In [15]:
%time summarize(dataset['test'][2]['dialogue'], model, tokenizer)

Lenny peaks valima esimese v√µi kolmanda paari. Bob saadab talle pilte. Lennyle meeldib k√µige rohkem esimesed p√ºksid, kuid talle meeldivad k√µige rohkem esimesed lillad p√ºksid.
CPU times: user 481 ms, sys: 271 ¬µs, total: 482 ms
Wall time: 480 ms


In [16]:
%time summarize(dataset['test'][3]['dialogue'], model, tokenizer)

Emma ja Will s√∂√∂vad t√§na √µhtust. Emma valmistab toitu. Emma j√µuab koju varsti. Will ei taha, et ta teda j√§rele tulla.
CPU times: user 267 ms, sys: 0 ns, total: 267 ms
Wall time: 266 ms
