In [None]:
! pip install datasets evaluate transformers rouge-score nltk py7zr

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a9a7b559a70ea8d01e05c355b5cbfe1d2bb55955d98bcb5583056f90381d7c5e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.2 rouge-score-0.1.2


In [None]:
import numpy as np
from tqdm import tqdm
from datasets import load_dataset, load_metric
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq,  \
        Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
# Sentence splitting needs the Punkt resources. Older NLTK releases read the
# "punkt" package, newer ones look up "punkt_tab", so fetch both.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Seed handed to get_random_sample so the sample shown in this notebook is reproducible.
random_state = 42

### Define Metric for Evaluating Performance

In [None]:
# Load the ROUGE metric used by every evaluation in this notebook.
# NOTE(review): `load_metric` appears to be deprecated in favour of the `evaluate`
# package installed above; the cells below read scores via `.mid.fmeasure`, so
# confirm the returned score type before switching loaders.
rouge_metric = load_metric("rouge", trust_remote_code=True)

  rouge_metric = load_metric("rouge", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

### Load Dataset

In [None]:
# Load the SAMSum dataset; it is indexed below by its "train", "validation" and
# "test" splits, each with "dialogue" and "summary" columns.
dataset = load_dataset("samsum", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

### View Sample

In [None]:
def get_random_sample(random_state=None):
    """Pick one example from a shuffled copy of the training split.

    Args:
        random_state: Seed forwarded to ``shuffle``. ``None`` leaves the
            shuffle unseeded.

    Returns:
        The example found at index 1 of the shuffled ``dataset["train"]``.
    """
    shuffled_train = dataset["train"].shuffle(seed=random_state)
    return shuffled_train[1]

In [None]:
# Draw the seeded sample and print its dialogue followed by the reference summary.
sample = get_random_sample(random_state=random_state)
print("Dialogue:", sample["dialogue"], sep="\n")
print(f'\nSummary: {sample["summary"]}')

Dialogue:
Wendy: What's up?
Simon: Nothing much. I'm painting my cupboards. 
Angela: Cool what colour?
Simon: Green.
Ben: I'm just chilling in the garden. 
Angela: Nice weekend! I'm about to meet Chris.
Wendy: Say hello from me!
Angela: Will do! And how is your weekend, Wendy?
Wendy: Very lazy... The week was hard at work, I really needed some rest. 
Ben: We should all come and visit Simon in his new apartment!
Simon: You are welcome, guys! Whenever you wish.
Ben: I should be in Bournemouth next week. 
Simon: I'm not going anywhere :-)
Ben: Cool, I'll call you next week. 

Summary: This weekend Wendy is very lazy because she worked hard at work, and Angela is meeting Chris. Simon is chilling in the garden and painting his cupboards green. Next week, Ben, Angela, Chris and Wendy will visit him in his new apartament.


### Set Up Evaluation and Preprocessing Functions

In [None]:
def three_sentence_summary(text):
    """Return the first three sentences of ``text`` joined by newlines.

    This is the naive baseline summarizer: sentences are split with NLTK's
    ``sent_tokenize`` and everything after the third sentence is dropped.
    Texts with fewer than three sentences are returned in full.
    """
    return "\n".join(sent_tokenize(text)[:3])


def evaluate_baseline_summaries(dataset, metric, column_text="dialogue",
                                column_summary="summary"):
    """Score the three-sentence baseline against the reference summaries.

    Args:
        dataset: Mapping-like split; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must yield aligned sequences of strings.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Name of the column holding the texts to summarize.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns for the accumulated batch.
    """
    summaries = [three_sentence_summary(text) for text in dataset[column_text]]
    metric.add_batch(predictions=summaries, references=dataset[column_summary])
    return metric.compute()

def chunks(list_of_elements, batch_size):
    """Lazily split ``list_of_elements`` into consecutive slices of ``batch_size`` items.

    The last slice is shorter when the length is not a multiple of ``batch_size``.
    """
    starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start : start + batch_size] for start in starts)

def evaluate_summaries_baseline(dataset, metric,
                                column_text="article",
                                column_summary="highlights"):
    """Score ``three_sentence_summary`` predictions against reference summaries.

    The default column names ("article" / "highlights") differ from the
    "dialogue" / "summary" columns used elsewhere in this notebook, so callers
    working with that data must pass the column names explicitly.

    Args:
        dataset: Mapping-like split indexed by column name.
        metric: Object exposing ``add_batch`` and ``compute``.
        column_text: Column holding the texts to summarize.
        column_summary: Column holding the reference summaries.

    Returns:
        The result of ``metric.compute()``.
    """
    predictions = []
    for text in dataset[column_text]:
        predictions.append(three_sentence_summary(text))
    references = dataset[column_summary]
    metric.add_batch(predictions=predictions, references=references)
    return metric.compute()

def evaluate_model_summaries(dataset, metric, model, tokenizer, batch_size=16,
                             device="cuda", column_text="dialogue", column_summary="summary"):
    """Generate summaries batch by batch and score them with ``metric``.

    Args:
        dataset: Mapping-like split indexed by column name.
        metric: Object exposing ``add_batch`` and ``compute``.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used both to encode the inputs and decode the outputs.
        batch_size: Number of examples generated per step.
        device: Device the encoded inputs are moved to before generation.
        column_text: Column holding the texts to summarize.
        column_summary: Column holding the reference summaries.

    Returns:
        The result of ``metric.compute(use_stemmer=True, use_aggregator=True)``.
    """
    dialogue_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))
    batch_pairs = zip(dialogue_batches, target_batches)

    for dialogues, targets in tqdm(batch_pairs, total=len(dialogue_batches)):
        encoded = tokenizer(dialogues, max_length=1024, padding=True,
                            truncation=True, return_tensors="pt")
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        generated = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                   length_penalty=0.8, num_beams=8, max_length=128)

        predictions = []
        for token_ids in generated:
            text = tokenizer.decode(token_ids, skip_special_tokens=True,
                                    clean_up_tokenization_spaces=True)
            # Decoded text may contain the literal "<n>" marker; flatten it to a space.
            predictions.append(text.replace("<n>", " "))

        metric.add_batch(predictions=predictions, references=targets)

    return metric.compute(use_stemmer=True, use_aggregator=True)

def compute_metrics(eval_pred):
    """Turn generated token ids and label ids into ROUGE scores for the trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. When
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure scaled to 0-100, plus
        ``"gen_len"``, the mean count of non-padding tokens per prediction.
        Every value is rounded to four decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded, so swap it for
    # the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_metric.compute(predictions=decoded_preds,
                                  references=decoded_labels, use_stemmer=True,
                                  use_aggregator=True)

    # Keep the mid F-measure of every ROUGE variant, expressed as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding, including former -100 slots, is not counted).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

def preprocess_function(example_batch, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogues and summaries into model inputs and labels.

    Args:
        example_batch: Batch with "dialogue" and "summary" columns.
        max_input_length: Truncation length for the dialogues.
        max_target_length: Truncation length for the summaries.

    Returns:
        The dialogue encodings with an added "labels" entry holding the
        summary token ids.
    """
    model_inputs = tokenizer(example_batch["dialogue"], max_length=max_input_length,
                             truncation=True)

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=example_batch["summary"],
                                 max_length=max_target_length, truncation=True)

    model_inputs["labels"] = target_encodings["input_ids"]
    return model_inputs

### Evaluate Baseline performance on the test set
For comparisons with fine-tuned model.

In [None]:
# NOTE(review): this code cell contained only stray prose, which raises a
# SyntaxError when executed; the text is preserved below as comments. No
# baseline evaluation is actually run in this cell.
# Evaluate Model's performance on the test set
# For comparisons with fine-tuned model.

### Try out a Pretrained Model

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Checkpoint name used below to load the tokenizer, the model and the
# summarization pipeline, and to derive the fine-tuned model's name.
model_ckpt = "google/pegasus-cnn_dailymail"

In [None]:
# Load the tokenizer and model once, then hand those same objects to the
# pipeline. Passing the checkpoint name instead would load a second copy of the
# model onto the device.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show the dialogue of the sample drawn earlier, as input for the pretrained model.
print(sample["dialogue"])

Wendy: What's up?
Simon: Nothing much. I'm painting my cupboards. 
Angela: Cool what colour?
Simon: Green.
Ben: I'm just chilling in the garden. 
Angela: Nice weekend! I'm about to meet Chris.
Wendy: Say hello from me!
Angela: Will do! And how is your weekend, Wendy?
Wendy: Very lazy... The week was hard at work, I really needed some rest. 
Ben: We should all come and visit Simon in his new apartment!
Simon: You are welcome, guys! Whenever you wish.
Ben: I should be in Bournemouth next week. 
Simon: I'm not going anywhere :-)
Ben: Cool, I'll call you next week. 


In [None]:
# Reference summary of the same sample, for comparison with the generated one.
sample["summary"]

'This weekend Wendy is very lazy because she worked hard at work, and Angela is meeting Chris. Simon is chilling in the garden and painting his cupboards green. Next week, Ben, Angela, Chris and Wendy will visit him in his new apartament.'

In [None]:
# Summarize the sample dialogue with the pipeline and display the raw output.
pipe_out = pipe(sample["dialogue"])
pipe_out

[{'summary_text': 'Simon is painting his cupboards. Ben is chilling in the garden. Angela is about to meet Chris. Wendy is lazy. Simon should be in Bournemouth next week.'}]

### Evaluate Model's performance on the test set

For comparisons with fine-tuned model.

In [None]:
# Score the current model on the test split and keep the mid F-measure of each ROUGE variant.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
score = evaluate_model_summaries(
    dataset["test"],
    rouge_metric,
    model,
    tokenizer,
    batch_size=2,
    device=device,
    column_text="dialogue",
    column_summary="summary",
)
rouge_dict = {rouge_name: score[rouge_name].mid.fmeasure for rouge_name in rouge_names}
rouge_dict

100%|██████████| 410/410 [11:39<00:00,  1.71s/it]


{'rouge1': 0.3056591654023485,
 'rouge2': 0.09154776946834249,
 'rougeL': 0.23558434106338993,
 'rougeLsum': 0.23538792149845683}

### Fine-Tuning Model

In [None]:
# Tokenize every split in batches, then expose only the model-facing columns
# as torch tensors.
preprocess_kwargs = dict(max_input_length=1024, max_target_length=128)
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs=preprocess_kwargs,
)
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
# The fine-tuned model's name doubles as the trainer's output directory.
batch_size = 4
model_name = model_ckpt.split("/")[-1]
fine_tuned_model_name = f"{model_name}-finetuned-samsum"
args = Seq2SeqTrainingArguments(
    fine_tuned_model_name,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Generate summaries during evaluation so compute_metrics receives token ids.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; this notebook falls back to the CPU
    # when none is present, so only enable fp16 when CUDA is available.
    fp16=torch.cuda.is_available(),
    report_to="none"
)



In [None]:
# setup data collator
# Set up the data collator; it is built from the tokenizer and the model and is
# passed to the Seq2SeqTrainer as `data_collator`.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# create a smaller subset of the dataset to speed up the fine tuning
# Shuffle with the notebook-wide seed (random_state) instead of a second
# hard-coded 42, then keep 1000 training and 500 validation examples.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=random_state).select(range(1000))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=random_state).select(range(500))

In [None]:
# setup trainer
# Wire the model, arguments, data subsets, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=seq2seq_data_collator,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# train and save model
# `train` must be called: a bare `trainer.train` only references the method,
# so the save below would write the model without any fine-tuning.
trainer.train()
trainer.save_model(fine_tuned_model_name)

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.782825,38.6421,17.0625,30.9643,35.3967,42.252


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


TrainOutput(global_step=250, training_loss=2.220920166015625, metrics={'train_runtime': 446.6151, 'train_samples_per_second': 2.239, 'train_steps_per_second': 0.56, 'total_flos': 708336399384576.0, 'train_loss': 2.220920166015625, 'epoch': 1.0})

### Trying out fine-tuned Model

In [None]:
# Reload the saved fine-tuned tokenizer and model, and build the pipeline from
# those same objects so the model is not loaded onto the device twice.
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_model_name).to(device)
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

In [None]:
# Show the same sample dialogue again, as input for the fine-tuned model.
print(sample["dialogue"])

Wendy: What's up?
Simon: Nothing much. I'm painting my cupboards. 
Angela: Cool what colour?
Simon: Green.
Ben: I'm just chilling in the garden. 
Angela: Nice weekend! I'm about to meet Chris.
Wendy: Say hello from me!
Angela: Will do! And how is your weekend, Wendy?
Wendy: Very lazy... The week was hard at work, I really needed some rest. 
Ben: We should all come and visit Simon in his new apartment!
Simon: You are welcome, guys! Whenever you wish.
Ben: I should be in Bournemouth next week. 
Simon: I'm not going anywhere :-)
Ben: Cool, I'll call you next week. 


In [None]:
# Reference summary of the sample, for comparison with the fine-tuned output.
sample["summary"]

'This weekend Wendy is very lazy because she worked hard at work, and Angela is meeting Chris. Simon is chilling in the garden and painting his cupboards green. Next week, Ben, Angela, Chris and Wendy will visit him in his new apartament.'

In [None]:
# Summarize the sample dialogue with the fine-tuned pipeline and display the raw output.
pipe_out = pipe(sample["dialogue"])
pipe_out

[{'summary_text': 'Simon is painting his cupboards. Ben is chilling in the garden. Angela is about to meet Chris. Wendy is lazy. Simon should be in Bournemouth next week.'}]

### Evaluate Fine-tuned model's performance on the test set

In [None]:
# Score the reloaded fine-tuned model on the test split with the same settings
# as the earlier evaluation, keeping the mid F-measure of each ROUGE variant.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
score = evaluate_model_summaries(
    dataset["test"],
    rouge_metric,
    model,
    tokenizer,
    batch_size=2,
    device=device,
    column_text="dialogue",
    column_summary="summary",
)
rouge_dict = {rouge_name: score[rouge_name].mid.fmeasure for rouge_name in rouge_names}
rouge_dict

100%|██████████| 410/410 [08:05<00:00,  1.18s/it]


{'rouge1': 0.3830135144299003,
 'rouge2': 0.16436031776335036,
 'rougeL': 0.30285047062777837,
 'rougeLsum': 0.3033148106739478}

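The results are better than those of the pretrained model on every ROUGE variant (for example, ROUGE-1 rises from about 0.306 to 0.383 and ROUGE-2 from about 0.092 to 0.164).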

#### Play Around

In [None]:
# Summarize a fresh, unseeded training sample, passing a shorter max_length to the pipeline.
max_length = 48
sample = get_random_sample()
print("Dialogue:", sample["dialogue"], sep="\n")
print("\nSummary:", sample["summary"], sep="\n")
print("\n\nGenerated Summary")
pipe_out = pipe(sample["dialogue"], max_length=max_length)
# The generated text may contain the literal "<n>" marker; print it as a line break.
print(pipe_out[0]["summary_text"].replace("<n>", "\n"))


Dialogue:
Aline: Ali, where did you put the keys to the basement?
Ali: Oh, I have them with me... Sorry, I forgot to out them back...
Aline: ...
Aline: What time are you coming back?
Ali: 7:00
Aline: Don't do that again, please...

Summary:
Aline wonders where Ali left the keys to the basement, and he has them with him. Ali is coming back at 7:00, which Aline isn't happy about. 


Generated Summary
Ali forgot to put the keys to the basement in the basement. He will return to the basement at 7 p.m. on Friday. Aline will be waiting for him at the basement.
