In [1]:
from tabulate import tabulate
import nltk
from datetime import datetime
import torch
import numpy as np
import datasets
from datasets import load_dataset
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
torch.cuda.is_available()

True

In [22]:
lang = 'english'

In [3]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer
)

In [24]:
model_name = "sshleifer/distilbart-xsum-6-6"

In [25]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [26]:
max_enxoder_length =1024
max_decoder_length = 256


In [27]:
data = load_dataset('wiki_lingua', name = lang, split = 'train[:2000]')

Found cached dataset wiki_lingua (C:/Users/yaroslav/.cache/huggingface/datasets/wiki_lingua/english/1.1.1/6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e)


In [28]:
data

Dataset({
    features: ['url', 'article'],
    num_rows: 2000
})

In [29]:
data['url'][0]

'https://www.wikihow.com/Avoid-Drinking-and-Driving'

In [30]:
data['article'][0]

{'section_name': ['Finding Other Transportation',
  'Designating a Driver',
  'Staying Safe'],
 'document': ['make sure that the area is a safe place, especially if you plan on walking home at night.  It’s always a good idea to practice the buddy system.  Have a friend meet up and walk with you. Research the bus, train, or streetcar routes available in your area to find safe and affordable travel to your destination.  Make sure you check the schedule for your outgoing and return travel.  Some public transportation will cease to run late at night.  Be sure if you take public transportation to the venue that you will also be able to get home late at night. Check the routes.  Even if some public transit is still running late at night, the routing may change.  Some may run express past many of the stops, or not travel all the way to the ends.  Be sure that your stop will still be available when you need it for your return trip. If you are taking public transit in a vulnerable state after d

In [31]:
def preprocess_corpus(corpus) -> dict:
    return {"document": corpus["article"]["document"],
        "summary": corpus["article"]["summary"]}

In [32]:
dataset = data.map(preprocess_corpus, remove_columns=['article', 'url'])

Loading cached processed dataset at C:\Users\yaroslav\.cache\huggingface\datasets\wiki_lingua\english\1.1.1\6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e\cache-874ab2fc9f5c9bca.arrow


In [33]:
torch.cuda.is_available()

True

In [34]:
torch.cuda.current_device()

0

In [35]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [36]:
dataset

Dataset({
    features: ['document', 'summary'],
    num_rows: 2000
})

In [37]:
dataset['summary'][0]

['Walk to the venue where you will be drinking if it is close enough. Take public transit. Show up in style by hiring a limo or black car service. Flag a taxi cab for a convenient option to get where you’re going. Request a rideshare service like Uber or Lyft using an app on your phone. Reserve a designated driver service.',
 'Plan in advance. Assign a designated driver. Leave your car at home. Leave the venue with your designated driver.',
 'Pay attention to your body. Give up your keys. Listen to other people. Accept help. Stay where you are. Have an emergency back-up plan. Make sure that your phone is charged.']

In [38]:
def list2samples(example):
    documents = []
    summaries = []
    for sample in zip(example["document"], example["summary"]):
        if len(sample[0]) > 0:
            documents += sample[0]
            summaries += sample[1]
    return {"document": documents, "summary": summaries}

In [39]:
dataset = dataset.map(list2samples, batched=True)

Loading cached processed dataset at C:\Users\yaroslav\.cache\huggingface\datasets\wiki_lingua\english\1.1.1\6fdaa844abe35a3a2a79e5a1cf9e546f32ad234d59756bcf9cfeadff6c89240e\cache-97845262552237d9.arrow


In [40]:
train_data_txt, validation_data_txt = dataset.train_test_split(test_size=0.1).values()

In [41]:
train_data_txt, validation_data_txt

(Dataset({
     features: ['document', 'summary'],
     num_rows: 4351
 }),
 Dataset({
     features: ['document', 'summary'],
     num_rows: 484
 }))

In [42]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["document"], batch["summary"]
    source_tokenized = tokenizer(source, padding="max_length", truncation=True, max_length=max_source_length)
    target_tokenized = tokenizer(target, padding="max_length", truncation=True, max_length=max_target_length)
    batch = {k: v for k, v in source_tokenized.items()}
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

In [43]:
train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, max_enxoder_length, max_decoder_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
         batch, tokenizer, max_enxoder_length, max_decoder_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

                                                                

In [44]:
nltk.download("punkt", quiet=True)
metric = datasets.load_metric("rouge")

  metric = datasets.load_metric("rouge")


In [45]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [46]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [47]:
training_args = Seq2SeqTrainingArguments(
    output_dir ="results" ,
    num_train_epochs=1,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [48]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,5.7466
100,4.626
150,4.2953
200,4.2414
250,4.0437
300,4.095
350,4.0593
400,4.0105
450,3.9907
500,4.0488


TrainOutput(global_step=1088, training_loss=4.059148788452148, metrics={'train_runtime': 9442.0973, 'train_samples_per_second': 0.461, 'train_steps_per_second': 0.115, 'total_flos': 4714590810144768.0, 'train_loss': 4.059148788452148, 'epoch': 1.0})

### TrainOutput(global_step=1088, training_loss=4.116244386224186, metrics={'train_runtime': 18348.2797, 'train_samples_per_second': 0.237, 'train_steps_per_second': 0.059, 'total_flos': 2357295405072384.0, 'train_loss': 4.116244386224186, 'epoch': 1.0})

In [49]:
trainer.evaluate()

{'eval_loss': 3.7342700958251953,
 'eval_rouge1': 34.1167,
 'eval_rouge2': 13.6136,
 'eval_rougeL': 27.1978,
 'eval_rougeLsum': 33.0099,
 'eval_gen_len': 32.8657,
 'eval_runtime': 1487.0012,
 'eval_samples_per_second': 0.325,
 'eval_steps_per_second': 0.081,
 'epoch': 1.0}

In [50]:
trainer.save_model("C:/Users/yaroslav/Desktop/model")


In [51]:
tokenizer.save_pretrained("C:/Users/yaroslav/Desktop/model")

('C:/Users/yaroslav/Desktop/model\\tokenizer_config.json',
 'C:/Users/yaroslav/Desktop/model\\special_tokens_map.json',
 'C:/Users/yaroslav/Desktop/model\\vocab.json',
 'C:/Users/yaroslav/Desktop/model\\merges.txt',
 'C:/Users/yaroslav/Desktop/model\\added_tokens.json',
 'C:/Users/yaroslav/Desktop/model\\tokenizer.json')

In [4]:
model = AutoModelForSeq2SeqLM.from_pretrained("C:/Users/yaroslav/Desktop/model")

In [5]:
tokenizer = AutoTokenizer.from_pretrained("C:/Users/yaroslav/Desktop/model")

In [32]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["document"],
        truncation=True,
        max_length=max_enxoder_length,
        return_tensors="pt",
        padding=True
    ).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str


model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)
test_samples = validation_data_txt.select(range(5))
summaries_after_tuning = generate_summary(test_samples, model)[1]

In [33]:
print(
    tabulate(
        zip(
            range(len(summaries_after_tuning)),
            summaries_after_tuning,
            
        ),
        headers=["Id", "Summary after"],
    )
)
print("\nTarget summaries:\n")
print(
    tabulate(list(enumerate(test_samples["summary"])), headers=["Id", "Target summary"])
)
print("\nSource documents:\n")
print(tabulate(list(enumerate(test_samples["document"])), headers=["Id", "Document"]))

  Id  Summary after
----  ---------------------------------------------------------------------------------------------------------------------------------------
   0  Ask a hypnotherapist to guide you through past life regression.
   1  Put the scrap wood on top of the tree stump. Light the fire. Let the fire burn until it's gone.
   2  Turn on the wireless function switch.
   3  Go over your relationship to find the answers you need. Identify what went wrong. Learn from your relationship. Look back on your past.
   4  Exercise to improve mobility, posture, and balance. Consider deep brain stimulation.

Target summaries:

  Id  Target summary
----  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   0  Visit a hypnotherapist. Congratulations!
   1  Construct a teepee-like shape over the stump with scrap woo