In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, Trainer, TrainingArguments
import torch
import time
import evaluate
import pandas as pd
import numpy as np

# Load dataset and LLM

In [3]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
dataset

Found cached dataset csv (/home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [5]:
model_name = 'google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) # bfloat16 is faster than fp32
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
original_model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [7]:
def count_parameters(model):
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    def num_to_str(num):
        return format(num, ',')

    return f"trainable_params: {num_to_str(trainable_params)}\nall_params: {num_to_str(all_params)}\npercentage of trainable params: {100*trainable_params/all_params}%"
print(count_parameters(original_model))

trainable_params: 247,577,856
all_params: 247,577,856
percentage of trainable params: 100.0%


# Test the model with Zero Shot inferencing

In [8]:
index = 201

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: {summary}
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True,
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Where to, miss?
#Person2#: Hi! Crenshaw and Hawthorne, at the Holiday Inn that is on that corner.
#Person1#: Sure thing. So, where are you flying in from?
#Person2#: From China.
#Person1#: Really? You don't look very Chinese to me, if you don't mind me saying so.
#Person2#: It's fine. I am actually from Mexico. I was in China on a business trip, visiting some local companies that manufacture bathroom products.
#Person1#: Wow sounds interesting! Excuse me if I am being a bit nosy but, how old are you?
#Person2#: Don't you know it's rude to ask a lady her age?
#Person1#: Don't get me wrong! It's just that you seem so young and already doing business overseas!
#Person2#: Well thank you! In that case, I am 26 years old, and what about yourself?
#Person1#: I am 40 years old and was born and raised here in the good old U. S of A,

# Preprocess the Dialog-Summary dataset
- We need to convert the dialog-summary (propmpt-response) pair into explicit instructions for the LLM.
- Then, preprocess it into tokens.

In [19]:
def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary:"
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, return_tensors='pt', padding="max_length", truncation=True).input_ids
    example['labels'] = tokenizer(example['summary'], return_tensors='pt', padding="max_length", truncation=True).input_ids
    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['id', 'topic', 'dialogue', 'summary'])

Loading cached processed dataset at /home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0bf82545fc4d7f13.arrow
Loading cached processed dataset at /home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-2b45718e91643ac2.arrow
Loading cached processed dataset at /home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-3d46fa1def5e1a17.arrow


In [20]:
print(f'original shape ...')
print(f'training shape: {tokenized_dataset["train"].shape}')
print(f'validation shape: {tokenized_dataset["validation"].shape}')
print(f'test shape: {tokenized_dataset["test"].shape}')

original shape ...
training shape: (12460, 2)
validation shape: (500, 2)
test shape: (1500, 2)


In [21]:
# To save some time in this notebook, we will only use a subset of the training data
tokenized_dataset = tokenized_dataset.filter(lambda example, idx: idx % 100 == 0, with_indices=True)

Loading cached processed dataset at /home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-a5e66bd7fe194863.arrow
Loading cached processed dataset at /home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-926f9493df5b3a3d.arrow
Loading cached processed dataset at /home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-6c33a48e0488a674.arrow


In [22]:
# checking size of the dataset
print(f'training shape: {tokenized_dataset["train"].shape}')
print(f'validation shape: {tokenized_dataset["validation"].shape}')
print(f'test shape: {tokenized_dataset["test"].shape}')

training shape: (125, 2)
validation shape: (5, 2)
test shape: (15, 2)


# Fine-Tune the model with the preprocessed dataset

In [29]:
output_dir = f'./weights/dialogue-summary-training-{str(int(time.time()))}'

# training arguments: also show evaluation loss
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=100, # epochs vs steps
    weight_decay=0.01,
    # logging_steps=5,
    # eval_steps=5,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [30]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,47.8906,51.25
2,46.875,50.0
3,45.7969,48.75
4,45.0938,47.75
5,44.6562,46.75
6,43.8594,46.0
7,43.0156,45.25
8,42.4375,44.75
9,41.7656,44.0
10,41.375,43.25


TrainOutput(global_step=1600, training_loss=35.875703125, metrics={'train_runtime': 545.4709, 'train_samples_per_second': 22.916, 'train_steps_per_second': 2.933, 'total_flos': 8559466905600000.0, 'train_loss': 35.875703125, 'epoch': 100.0})

In [32]:
weight_path = "/net/papilio/storage6/phusaeng/fun/speech-tutorial/instructGPT/weights/dialogue-summary-training-1690377246/checkpoint-1500/"
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(weight_path, torch_dtype=torch.bfloat16).cuda()

# Evaluate the Model Qualitatively (Human Evaluation)

In [33]:
orginal_model = original_model.cuda()

In [35]:
index = 500
dialogue = dataset['test'][index]['dialogue'] 
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCTED MODEL:\n{instruct_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person2# tells David the plan for a tour and #Person2# will celebrate #Person2#'s brother's fortieth birthday when at Salt Lake City.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
David and his family are going on a four day drive to Salt Lake City this Friday. They'll spend the nights in hotels and enjoy local food as they pass by. They'll also visit Five Lake Strict and the Wall Street.
---------------------------------------------------------------------------------------------------
INSTRUCTED MODEL:
David and his family are going on a four day drive to Salt Lake City this Friday to celebrate his brother's fortieth birthday. They'll spend the nights in hotels and enjoy local food as they pass by.


# Evaluate the model quatitatively using ROUGE

In [36]:
rogue = evaluate.load('rouge')

In [37]:
# Generate the outputs for the sample of the test dataset (only 10 dialogues and summaries to save time), and save the results
dialogues = dataset['test'][:10]['dialogue']
human_baseline_summaries = dataset['test'][:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
    Summarize the following conversation.

    {dialogue}

    Summary:
    """

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,This memo is to be distributed to all employee...,This memo should go out as an intra-office mem...
1,In order to prevent employees from wasting tim...,This memo is to be distributed to all employee...,This memo should go out as an intra-office mem...
2,Ms. Dawson takes a dictation for #Person1# abo...,This memo is to be distributed to all employee...,This memo should go out as an intra-office mem...
3,#Person2# arrives late because of traffic jam....,Taking public transport to work is a good idea.,Taking public transport to work is a good idea.
4,#Person2# decides to follow #Person1#'s sugges...,Taking public transport to work is a good idea.,Taking public transport to work is a good idea.
5,#Person2# complains to #Person1# about the tra...,Taking public transport to work is a good idea.,Taking public transport to work is a good idea.
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,"#Person1#: Happy Birthday, Brian. #Person2#: I...","#Person1#: Happy Birthday, Brian. #Person2#: I..."


In [39]:
original_model_results = rogue.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
instruct_model_results = rogue.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAl: MODEL:')
print(original_model_results)
print('INSTRUCTED MODEL:')
print(instruct_model_results)

ORIGINAl: MODEL:
{'rouge1': 0.27762474437974993, 'rouge2': 0.13537012263099218, 'rougeL': 0.22084590778973423, 'rougeLsum': 0.22203391888875756}
INSTRUCTED MODEL:
{'rouge1': 0.2442821159449488, 'rouge2': 0.1254253996877685, 'rougeL': 0.1960773044171789, 'rougeLsum': 0.19803018042938708}


In [40]:
print("Absolute percentage improvement of Instructed Model over Human Baseline:")

improvement = (np.array(list(instruct_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(instruct_model_results.keys(), improvement):
    print(f"{key}: {value*100}%")

Absolute percentage improvement of Instructed Model over Human Baseline:
rouge1: -3.3342628434801123%
rouge2: -0.9944722943223683%
rougeL: -2.4768603372555322%
rougeLsum: -2.4003738459370485%


# Perform Parameter Efficient Fine-tuning (PEFT)

In [44]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [46]:
# Get the PEFT version of this model
peft_model = get_peft_model(original_model,
                            lora_config)
print(count_parameters(peft_model))

trainable_params: 3,538,944
all_params: 251,116,800
percentage of trainable params: 1.4092820552029972%


In [49]:
# Train PEFT adapter
output_dir = f'./weights/peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir, 
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=50,
    logging_steps=1,
    # max_steps=1
)
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [50]:
peft_trainer.train()

Step,Training Loss
1,32.0
2,30.375
3,27.375
4,24.375
5,21.25
6,17.625
7,14.125
8,12.5
9,6.6875
10,5.5312


TrainOutput(global_step=800, training_loss=0.4731878662109375, metrics={'train_runtime': 240.7234, 'train_samples_per_second': 25.963, 'train_steps_per_second': 3.323, 'total_flos': 4347681177600000.0, 'train_loss': 0.4731878662109375, 'epoch': 50.0})

In [54]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_weight = "/net/papilio/storage6/phusaeng/fun/speech-tutorial/instructGPT/weights/peft-dialogue-summary-training-1690379059/checkpoint-500"
peft_model = PeftModel.from_pretrained(peft_model_base, 
                                        peft_weight,
                                        torch_dtype=torch.bfloat16,
                                        is_trainable=False).cuda()

In [55]:
print(count_parameters(peft_model))

trainable_params: 0
all_params: 251,116,800
percentage of trainable params: 0.0%


# Evaluate the model qualitatively (Human Evaluation)

In [56]:
index = 500
dialogue = dataset['test'][index]['dialogue'] 
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCTED MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL:\n{peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person2# tells David the plan for a tour and #Person2# will celebrate #Person2#'s brother's fortieth birthday when at Salt Lake City.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
David and #Person2# are going to have a tour in Salt Lake City, where they will join David's brother and his family on their fortieth birthday. #Person1# and #Person2# plan a four day drive to Salt Lake City.
---------------------------------------------------------------------------------------------------
INSTRUCTED MODEL:
David and his family are going on a four day drive to Salt Lake City this Friday to celebrate his brother's fortieth birthday. They'll spend the nights in hotels and enjoy local food as they pass by.
--------------------------------------------------------------------------------------------------

# Evaluate the model quantitatively (ROUGE)

In [57]:
# Generate the outputs for the sample of the test dataset (only 10 dialogues and summaries to save time), and save the results
dialogues = dataset['test'][:10]['dialogue']
human_baseline_summaries = dataset['test'][:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
    Summarize the following conversation.

    {dialogue}

    Summary:
    """

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=4))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))

df_with_peft = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])
df_with_peft

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1# asks Ms. Dawson to take a dictation ...,This memo should go out as an intra-office mem...,Ms. Dawson asks #Person1# to take a dictation ...
1,In order to prevent employees from wasting tim...,#Person1# tells #Person2# #Person1# restricts ...,This memo should go out as an intra-office mem...,Ms. Dawson asks #Person1# to take a dictation ...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1# asks #Person2# to send an intra-offi...,This memo should go out as an intra-office mem...,Ms. Dawson asks #Person1# to take a dictation ...
3,#Person2# arrives late because of traffic jam....,#Person1# tells #Person2# #Person2# is stuck i...,Taking public transport to work is a good idea.,"#Person2# got stuck in traffic again, because ..."
4,#Person2# decides to follow #Person1#'s sugges...,#Person1# is stuck in a traffic jam near the C...,Taking public transport to work is a good idea.,"#Person2# got stuck in traffic again, because ..."
5,#Person2# complains to #Person1# about the tra...,#Person1# tells #Person2#'s car is adding to t...,Taking public transport to work is a good idea.,"#Person2# got stuck in traffic again, because ..."
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced. Masha and...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. They are ...
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced. #Person1#...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. They are ...
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced. Masha and...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. They are ...
9,#Person1# and Brian are at the birthday party ...,#Person1# and #Person2# are celebrating Brian'...,"#Person1#: Happy Birthday, Brian. #Person2#: I...",Brian is celebrating his birthday. #Person1# a...


In [58]:
# with rogue metric
original_model_results = rogue.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
instruct_model_results = rogue.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
peft_model_results = rogue.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAl: MODEL:')
print(original_model_results)
print('INSTRUCTED MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAl: MODEL:
{'rouge1': 0.32481948192935695, 'rouge2': 0.08787961412574757, 'rougeL': 0.24721236103577302, 'rougeLsum': 0.2466490743589574}
INSTRUCTED MODEL:
{'rouge1': 0.2442821159449488, 'rouge2': 0.1254253996877685, 'rougeL': 0.1960773044171789, 'rougeLsum': 0.19803018042938708}
PEFT MODEL:
{'rouge1': 0.3252612237395953, 'rouge2': 0.11332108390433336, 'rougeL': 0.2526098179382635, 'rougeLsum': 0.2516474235225047}


In [59]:
print("Absolute percentage improvement of Instructed Model over Human Baseline:")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f"{key}: {value*100}%")

Absolute percentage improvement of Instructed Model over Human Baseline:
rouge1: 0.04417418102383719%
rouge2: 2.5441469778585786%
rougeL: 0.5397456902490466%
rougeLsum: 0.49983491635473%
