In [None]:
# Install the pinned packages this notebook needs for evaluation (evaluate,
# rouge_score) and LoRA fine-tuning (loralib, peft).
# NOTE(review): transformers, datasets, torch and nltk are imported by later
# cells but are not installed here - presumably the runtime image provides
# them; confirm before running elsewhere.
# The commented-out commands below are earlier install variants kept for reference.
# %pip install -U datasets==2.17.0

# %pip install --upgrade pip
# %pip install --disable-pip-version-check \
#     torch==1.13.1 \
#     torchdata==0.5.1 --quiet
# !pip install peft==0.13.0
%pip install \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
# !pip uninstall transformers torch
# !pip install transformers torch

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
from datasets import load_dataset

In [None]:
# Dataset identifier on the Hugging Face Hub.
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(path=huggingface_dataset_name)

# Display the splits and their columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [None]:
# Checkpoint shared by the tokenizer and the model.
model_name = 'google/flan-t5-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model weights in bfloat16.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)



In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name, this function does not print anything; it returns the
    report so the caller decides how to display it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage
        (``0.00%`` when the model has no parameters at all).
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # A model without parameters would otherwise raise ZeroDivisionError.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [None]:
index = 200

# Fetch the test example once and read both fields from it.
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

# Zero-shot prompt: instruction, the dialogue, then a cue for the summary.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

In [None]:
# Encode the prompt as PyTorch tensors so it can be passed to generate().
inputs = tokenizer(prompt, return_tensors='pt')

In [None]:
# Number of token ids in the encoded prompt (first and only sequence).
len(inputs['input_ids'][0])

231

In [None]:
# Generate up to 200 new tokens for the prompt and keep the first
# (and only) returned sequence of token ids.
generated_sequences = original_model.generate(inputs["input_ids"], max_new_tokens=200)
output_token = generated_sequences[0]

output_token

tensor([    0,  1713,   345, 13515,   536,  4663,    10,    27,    31,    51,
         1631,    13, 21066,    82,  1218,     5,     1])

In [None]:
# Convert the generated token ids back to text, dropping special tokens.
output = tokenizer.decode(output_token, skip_special_tokens=True)

output

"#Person1#: I'm thinking of upgrading my computer."

In [None]:
# Separator of 99 dashes (joining 100 empty strings with '-' yields 99 of them).
dash_line = '-' * 99

# Each section is preceded by the separator line.
report_sections = (
    f'INPUT PROMPT:\n{prompt}',
    f'BASELINE HUMAN SUMMARY:\n{summary}\n',
    f'MODEL GENERATION - ZERO SHOT:\n{output}',
)
for section in report_sections:
    print(dash_line)
    print(section)

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [None]:
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
# Batch collator for seq2seq training; it is handed to the Seq2SeqTrainer
# further down together with the same tokenizer and model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=original_model)

In [None]:
def tokenize_function(example):
    """Tokenize a batch of dialogues and summaries for seq2seq training.

    Each dialogue is wrapped in the summarization prompt and encoded into
    ``input_ids``; each reference summary is encoded into ``labels``. Both
    are padded to the tokenizer's maximum length and truncated if longer.

    Padding positions in ``labels`` are replaced with -100 so that the
    cross-entropy loss ignores them; otherwise the model is mostly trained
    to predict pad tokens. ``compute_metrics`` maps -100 back to the pad
    token before decoding.

    Args:
        example: A batch from ``datasets.map(..., batched=True)`` with the
            keys ``"dialogue"`` and ``"summary"``, each a list of strings.

    Returns:
        The same mapping with ``"input_ids"`` and ``"labels"`` added.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompts = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    # NOTE(review): only input_ids are kept, so no attention mask reaches the
    # model and padded input positions are attended to; consider storing
    # attention_mask as well.
    example['input_ids'] = tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt").input_ids

    labels = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    # Mask label padding so it does not contribute to the loss.
    labels[labels == tokenizer.pad_token_id] = -100
    example['labels'] = labels

    return example

# `dataset` holds three splits (train, validation, test); `map` applies
# tokenize_function to every split, one batch of examples at a time.
raw_text_columns = ['id', 'topic', 'dialogue', 'summary']
# Once tokenized, the original text columns are dropped.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(raw_text_columns)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [None]:
def _is_every_50th_row(example, index):
    """Return True for rows whose position in the split is a multiple of 50."""
    return index % 50 == 0


# Keep every 50th row of each split to shrink the data.
# NOTE(review): this cell reassigns tokenized_datasets from itself, so running
# it again subsamples the already-reduced data; re-run the tokenization cell
# first to start from the full splits.
tokenized_datasets = tokenized_datasets.filter(_is_every_50th_row, with_indices=True)


Filter:   0%|          | 0/623 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25 [00:00<?, ? examples/s]

Filter:   0%|          | 0/75 [00:00<?, ? examples/s]

In [None]:
# reduced the size
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 13
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2
    })
})

In [None]:
# Report the shape of each split, then the full DatasetDict summary.
print("Shapes of the datasets:")
for label, split_name in (("Training", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{label}: {tokenized_datasets[split_name].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (13, 2)
Validation: (1, 2)
Test: (2, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 13
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2
    })
})


In [None]:
import nltk
# Set up Rouge score for evaluation.
# compute_metrics splits text into sentences with nltk.sent_tokenize, which
# needs the Punkt resources: "punkt" on older NLTK releases and "punkt_tab"
# on newer ones, so fetch both.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_preds):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds
            the token ids. Positions equal to -100 are treated as padding
            in both arrays.

    Returns:
        The dictionary returned by ``metric.compute`` (ROUGE with stemming).
    """
    preds, labels = eval_preds
    # Some models return extra outputs alongside the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id; map it back to the pad token so decoding
    # does not fail. Predictions can carry -100 when batches of different
    # lengths are padded together, not just the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [None]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer


# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 5

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # Save a checkpoint after every epoch. The default step-based schedule
    # saves every 500 steps, which this short run never reaches, so no
    # checkpoint would exist for the later cell that reloads one.
    save_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,  # keep at most this many checkpoints on disk
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,  # evaluate on generated text so ROUGE can be computed
    push_to_hub=False
)



In [None]:
# Set up trainer
trainer = Seq2SeqTrainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Evaluate on the validation split during training; the test split is
    # reserved for the final comparison so it does not influence training.
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
import torch
# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

# Trigger the model training
# Note: this updates `original_model` itself, because that object was passed
# to the trainer as its model.
trainer.train()

In [None]:
import locale
# Monkey-patch locale.getpreferredencoding so it always reports "UTF-8".
# NOTE(review): presumably a workaround for shell (`!`) commands failing in
# the hosted notebook runtime after training - confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# !zip -r results.zip results

In [None]:
# !ls results

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest checkpoint written by the trainer instead of hard-coding
# an absolute, machine-specific path and step number.
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {training_args.output_dir!r}; run the training cell "
        "with checkpoint saving enabled first."
    )

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
# Note: this rebinds the notebook-wide `tokenizer` to the one stored with the checkpoint.
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Test the prompt

In [None]:
index = 200

# Same test example as in the zero-shot check; read the row once.
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

# Instruction, the dialogue, then a cue for the summary.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

In [None]:
prompt

"\nSummarize the following conversation.\n\n#Person1#: Have you considered upgrading your system?\n#Person2#: Yes, but I'm not sure what exactly I would need.\n#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.\n#Person2#: That would be a definite bonus.\n#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.\n#Person2#: How can we do that?\n#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?\n#Person2#: No.\n#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.\n#Person2#: That sounds great. Thanks.\n\nSummary:\n"

In [None]:
# Encode the prompt as PyTorch tensors; `tokenizer` here is the one reloaded
# from the checkpoint in the cell above.
inputs = tokenizer(prompt, return_tensors="pt")

In [None]:
# Generate with the model's default generation settings.
outputs = finetuned_model.generate(**inputs)
first_sequence = outputs[0]
# Special tokens are kept here, so markers such as <pad> and </s> show up in the text.
answer = tokenizer.decode(first_sequence)

print("Answer:", answer)

Answer: <pad> Upgrade your computer.</s>


In [None]:
# Same prompt again, this time allowing up to 200 new tokens and stripping
# special tokens from the decoded text.
inputs = tokenizer(prompt, return_tensors='pt')
generated_sequences = finetuned_model.generate(inputs["input_ids"], max_new_tokens=200)
output_token = generated_sequences[0]

output = tokenizer.decode(output_token, skip_special_tokens=True)
output

'Upgrade your computer.'

## Set up the PEFT/LoRA model for fine-tuning

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings, collected in one place and passed as keyword arguments.
lora_settings = dict(
    r=32,  # Rank
    lora_alpha=32,
    target_modules=["q", "v"],  # modules that receive LoRA adapters
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
)
lora_config = LoraConfig(**lora_settings)

In [None]:
# get_peft_model modifies the model it receives in place, and `original_model`
# has already been updated by the full fine-tuning run above. Wrap a freshly
# loaded copy of the pretrained checkpoint so LoRA starts from the pretrained
# weights and `original_model` is left untouched.
peft_base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
peft_model = get_peft_model(peft_base_model,
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [None]:
output_dir = './peft-dialogue-summary-training'

# Training length is controlled by max_steps alone. The former
# num_train_epochs=100 was overridden by max_steps and only produced a
# warning, so it is omitted.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    logging_steps=1,  # log the training loss at every step
    max_steps=100
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# peft_model = PeftModel.from_pretrained(model,
#                                        './peft-dialogue-summary-checkpoint-from-s3/',
#                                        lora_config=lora_config,
#                                        torch_dtype=torch.bfloat16,
#                                        device_map="auto",
#                                        is_trainable=True)

In [None]:
import torch
# Release cached, currently unused GPU memory before the PEFT training run.
torch.cuda.empty_cache()

In [None]:
# Run LoRA fine-tuning; the run length is bounded by max_steps in peft_training_args.
peft_trainer.train()

Step,Training Loss
1,3.9688
2,4.5312
3,3.4062
4,3.6719
5,3.2031
6,2.8594
7,2.7188
8,2.6406
9,2.4219
10,2.2969


TrainOutput(global_step=100, training_loss=0.776796875, metrics={'train_runtime': 231.1561, 'train_samples_per_second': 3.461, 'train_steps_per_second': 0.433, 'total_flos': 543981868941312.0, 'train_loss': 0.776796875, 'epoch': 6.25})

In [None]:
# Directory that receives both the trained PEFT model and the tokenizer files.
peft_model_path = "./peft-dialogue-summary-checkpoint-5"

trained_peft_model = peft_trainer.model
trained_peft_model.save_pretrained(peft_model_path)
# Last expression of the cell: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(peft_model_path)

('./peft-dialogue-summary-checkpoint-5/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-5/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-5/spiece.model',
 './peft-dialogue-summary-checkpoint-5/added_tokens.json')

In [None]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [None]:
index = 207
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """


# Ensure the model is on the correct device (e.g., 'cuda' if using GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
peft_model = peft_model.to(device)
# The model has just been trained and generate() does not change the module's
# mode, so switch to eval mode explicitly. Otherwise dropout (including
# lora_dropout=0.05) stays active and the output varies between runs.
peft_model.eval()

# Move input_ids to the same device as the model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Generate the summary using the model; no gradients are needed for inference.
with torch.no_grad():
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))

# Decode the output
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

# Print the results
dash_line = "-" * 50
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

--------------------------------------------------
BASELINE HUMAN SUMMARY:
James reserves a dining room for eight at a restaurant. #Person1# will ask the waitress to show him the way.
--------------------------------------------------
PEFT MODEL: The waiter will show the waiter the dinning room.


In [None]:
# ROUGE metric used for the baseline vs. PEFT comparison in the cells that follow.
rouge = evaluate.load('rouge')

In [None]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

device = "cuda" if torch.cuda.is_available() else "cpu"

# `original_model` was handed to the full fine-tuning trainer and to
# get_peft_model earlier in the notebook, so it no longer holds the pretrained
# weights. Reload an untouched copy of the checkpoint as the baseline so the
# comparison really is "pretrained vs. PEFT".
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)
peft_model = peft_model.to(device)  # Ensure peft_model is also on the same device

# Disable dropout in both models; generate() does not do this on its own.
baseline_model.eval()
peft_model.eval()

comparison_generation_config = GenerationConfig(max_new_tokens=200)


def _generate_summary(model, input_ids):
    """Generate one summary with `model` and return it as decoded text."""
    with torch.no_grad():
        generated = model.generate(input_ids=input_ids, generation_config=comparison_generation_config)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


original_model_summaries = []
peft_model_summaries = []

for dialogue in dialogues:
    # Use exactly the prompt format the training data was tokenized with
    # (no indentation inside the prompt text).
    prompt = 'Summarize the following conversation.\n\n' + dialogue + '\n\nSummary: '

    # Move input_ids to the same device as the models
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    original_model_summaries.append(_generate_summary(baseline_model, input_ids))
    peft_model_summaries.append(_generate_summary(peft_model, input_ids))

# Combine summaries and create a DataFrame
zipped_summaries = list(zip(original_model_summaries, peft_model_summaries))
df = pd.DataFrame(zipped_summaries, columns=['original_model_summaries', 'peft_model_summaries'])

In [None]:
df.head()

Unnamed: 0,original_model_summaries,peft_model_summaries
0,#Person1# needs to take a dictation for #Perso...,#Person1#: #Person1# should go out as an intra...
1,@Person1#: #Person1#: #Person2# needs to take ...,#Person1# and #Person2# are ready to take a memo.
2,Your memo is required to be sent to all employ...,#Person1# needs to take a dictation for #Perso...
3,#Person1# thinks it's better to take public tr...,#Person1# got stuck in traffic again. #Person2...
4,#Person1# is stuck in traffic. #Person2# think...,#Person1#: #Person2# says it's a good idea to ...


In [None]:
# Score both sets of generated summaries against the human references.
# The reference list is sliced to the number of predictions so both lists
# passed to ROUGE have the same length.
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
# These are the PEFT model's scores; the previous label said 'INSTRUCT MODEL:'.
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.2747984016649684, 'rouge2': 0.09269802188031745, 'rougeL': 0.24171404248401465, 'rougeLsum': 0.24418600378227706}
INSTRUCT MODEL:
{'rouge1': 0.29361520826062987, 'rouge2': 0.07750834428843217, 'rougeL': 0.24291335678922354, 'rougeLsum': 0.2432030364058525}


In [None]:
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

# Element-wise difference of the two score vectors; this relies on both result
# dictionaries listing their metrics in the same order.
peft_scores = np.array(list(peft_model_results.values()))
original_scores = np.array(list(original_model_results.values()))
improvement = np.subtract(peft_scores, original_scores)
for key, value in zip(peft_model_results, improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: 1.88%
rouge2: -1.52%
rougeL: 0.12%
rougeLsum: -0.10%
