In [2]:
from tabulate import tabulate
import nltk
from datetime import datetime
import torch
import numpy as np
import datasets
from datasets import load_dataset

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [4]:
lang = 'english'

In [5]:
model_name = "sshleifer/distilbart-xsum-12-3"

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
max_enxoder_length =1024
max_decoder_length = 256


In [8]:
data = load_dataset('wiki_lingua', name = lang, split = 'train[:2000]')

Found cached dataset wiki_lingua (C:/Users/ermil/.cache/huggingface/datasets/wiki_lingua/english/1.1.1/6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e)


In [9]:
data

Dataset({
    features: ['url', 'article'],
    num_rows: 2000
})

In [10]:
def preprocess_corpus(corpus) -> dict:
    return {"document": corpus["article"]["document"],
        "summary": corpus["article"]["summary"]}

In [11]:
dataset = data.map(preprocess_corpus, remove_columns=['article', 'url'])

Loading cached processed dataset at C:\Users\ermil\.cache\huggingface\datasets\wiki_lingua\english\1.1.1\6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e\cache-e02963387f4edd20.arrow


In [12]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [13]:
dataset

Dataset({
    features: ['document', 'summary'],
    num_rows: 2000
})

In [14]:
dataset['summary'][0]

['Walk to the venue where you will be drinking if it is close enough. Take public transit. Show up in style by hiring a limo or black car service. Flag a taxi cab for a convenient option to get where you’re going. Request a rideshare service like Uber or Lyft using an app on your phone. Reserve a designated driver service.',
 'Plan in advance. Assign a designated driver. Leave your car at home. Leave the venue with your designated driver.',
 'Pay attention to your body. Give up your keys. Listen to other people. Accept help. Stay where you are. Have an emergency back-up plan. Make sure that your phone is charged.']

In [15]:
def list2samples(example):
    documents = []
    summaries = []
    for sample in zip(example["document"], example["summary"]):
        if len(sample[0]) > 0:
            documents += sample[0]
            summaries += sample[1]
    return {"document": documents, "summary": summaries}

In [16]:
dataset = dataset.map(list2samples, batched=True)

Loading cached processed dataset at C:\Users\ermil\.cache\huggingface\datasets\wiki_lingua\english\1.1.1\6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e\cache-4a248abe970dba8d.arrow


In [17]:
train_data_txt, validation_data_txt = dataset.train_test_split(test_size=0.1).values()

In [18]:
train_data_txt, validation_data_txt

(Dataset({
     features: ['document', 'summary'],
     num_rows: 4351
 }),
 Dataset({
     features: ['document', 'summary'],
     num_rows: 484
 }))

In [19]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["document"], batch["summary"]
    source_tokenized = tokenizer(source, padding="max_length", truncation=True, max_length=max_source_length)
    target_tokenized = tokenizer(target, padding="max_length", truncation=True, max_length=max_target_length)
    batch = {k: v for k, v in source_tokenized.items()}
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

In [20]:
train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, max_enxoder_length, max_decoder_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
         batch, tokenizer, max_enxoder_length, max_decoder_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

                                                                

In [21]:
nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge")

  metric = datasets.load_metric("rouge")


In [22]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [23]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir ="results" ,
    num_train_epochs=1,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=3, 
    per_device_eval_batch_size=3,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
    gradient_checkpointing=True
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msouth_memphis[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
50,6.4684
100,5.3833
150,4.8779
200,4.736
250,4.6259
300,4.5986
350,4.4561
400,4.4681
450,4.4255
500,4.4483


TrainOutput(global_step=1451, training_loss=4.428775562244312, metrics={'train_runtime': 21565.4451, 'train_samples_per_second': 0.202, 'train_steps_per_second': 0.067, 'total_flos': 5387912164147200.0, 'train_loss': 4.428775562244312, 'epoch': 1.0})

In [26]:
trainer.evaluate()

{'eval_loss': 3.950443983078003,
 'eval_rouge1': 35.3713,
 'eval_rouge2': 13.777,
 'eval_rougeL': 27.9843,
 'eval_rougeLsum': 34.0308,
 'eval_gen_len': 26.6281,
 'eval_runtime': 1212.9193,
 'eval_samples_per_second': 0.399,
 'eval_steps_per_second': 0.134,
 'epoch': 1.0}

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [29]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["document"],
        truncation=True,
        max_length=max_enxoder_length,
        return_tensors="pt",
        padding=True
    ).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str


model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)
test_samples = validation_data_txt.select(range(5))
summaries_after_tuning = generate_summary(test_samples, model)[1]

In [30]:
print(
    tabulate(
        zip(
            range(len(summaries_after_tuning)),
            summaries_after_tuning,
            
        ),
        headers=["Id", "Summary after"],
    )
)
print("\nTarget summaries:\n")
print(
    tabulate(list(enumerate(test_samples["summary"])), headers=["Id", "Target summary"])
)
print("\nSource documents:\n")
print(tabulate(list(enumerate(test_samples["document"])), headers=["Id", "Document"]))

  Id  Summary after
----  ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   0  Start working on your project in advance. Use a step sequencer. Select a mixer. Use the Riff Machine.
   1  Decide whether you need to say anything. Avoid criticism that goes for someone's personality. Adjust your expectations. Avoid making assumptions about someone's character.
   2  Use green tea bags. Apply peppermint oil. Apply a cold compress.
   3  Understand the purpose of a medical second opinion. Ask for a second opinion if you are concerned about your child’s condition.
   4  Check the color inversion. Rasterize a layer. Select the layer.

Target summaries:

  Id  Target summary
----  --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------