In [1]:
# Upgrade pip itself before installing the pinned dependencies.
%pip install --upgrade pip
# PyTorch and torchdata, pinned to exact versions.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet
# Hugging Face libraries, the ROUGE scorer and the LoRA/PEFT packages, all pinned.
# NOTE(review): the following cell replaces datasets 2.11.0 with a newer release.
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -U datasets

Collecting datasets
  Using cached datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Using cached datasets-2.16.1-py3-none-any.whl (507 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.11.0
    Uninstalling datasets-2.11.0:
      Successfully uninstalled datasets-2.11.0
Successfully installed datasets-2.16.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np



Init Plugin
Init Graph Optimizer
Init Kernel


#### 1.2 - Load Dataset and LLM

In [4]:
# Hub identifier of the dialogue-summary dataset used throughout the notebook.
huggingface_dataset_name = "knkarthick/dialogsum"
# Load the dataset identified by that name.
dataset = load_dataset(path=huggingface_dataset_name)

In [5]:
# Display the loaded DatasetDict (splits, feature names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [6]:
# Checkpoint from which both the tokenizer and the baseline model are loaded.
model_name = 'google/flan-t5-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The model weights are requested in bfloat16.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [7]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name, this function prints nothing; it returns the report as
    a string and leaves printing to the caller.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, parameter)``
            pairs; each parameter must provide ``numel()`` and
            ``requires_grad``.

    Returns:
        A three-line string holding the trainable parameter count, the total
        parameter count, and the trainable share as a percentage rounded to
        two decimals.

    Raises:
        ZeroDivisionError: If the model exposes no parameters at all.
    """
    parameters = [parameter for _, parameter in model.named_parameters()]
    total = sum(parameter.numel() for parameter in parameters)
    trainable = sum(parameter.numel() for parameter in parameters if parameter.requires_grad)
    percentage = round((trainable / total) * 100, 2)
    return (
        f"Trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {percentage} %"
    )

# Report the parameter counts of the baseline model loaded above.
print(print_number_of_trainable_model_parameters(original_model))

Trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.0 %


#### 1.3 - Test the Model with Zero-Shot Inference

In [8]:
# Pick one test example and keep its human-written summary as the reference.
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize the prompt, generate up to 200 new tokens, and decode the first
# returned sequence without special tokens.
inputs = tokenizer(prompt, return_tensors='pt')
generated_ids = original_model.generate(inputs['input_ids'], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Separator line; later cells reuse it when printing their own comparisons.
horizontal = '-' * 100
report_sections = (
    f"INPUT PROMPT:\n{prompt}\n",
    f"BASELINE HUMAN SUMMARY:\n{summary}\n",
    f"MODEL GENERATION - ZERO SHOT:\n{output}\n",
)
for section in report_sections:
    print(horizontal)
    print(section)

----------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:


-----------------------------------------------------------------

#### 2 - Perform Full Fine-Tuning

##### 2.1 - Preprocess the Dialog-Summary Dataset

In [9]:
def tokenize_function(example):
    """Tokenize a batch of dialogues and summaries for seq2seq fine-tuning.

    Every dialogue is wrapped in the summarization instruction and tokenized
    into ``input_ids``; every reference summary is tokenized into ``labels``.
    Both are padded with ``padding="max_length"`` and truncated.

    Padding positions in ``labels`` are replaced by -100 so that they are
    ignored by the loss instead of being trained on as if they were targets.

    Args:
        example: Batch mapping with ``'dialogue'`` and ``'summary'`` sequences
            of strings (the function is applied with ``batched=True``).

    Returns:
        The same mapping with ``'input_ids'`` and ``'labels'`` added.
    """
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary:"
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors='pt').input_ids

    labels = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids
    # Mask the padding: label positions equal to -100 are excluded from the loss.
    labels[labels == tokenizer.pad_token_id] = -100
    example['labels'] = labels

    return example

# The dataset holds three splits (train, validation, test); map() applies
# tokenize_function to all of them in batches. The raw columns are dropped
# afterwards so that only the tokenized columns remain.
tokenize_datasets = dataset.map(tokenize_function, batched=True).remove_columns(
    ['id', 'topic', 'dialogue', 'summary']
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
# Subsample the data to shorten full fine-tuning: only every 100th row is kept.
# NOTE(review): this cell overwrites tokenize_datasets, so re-running it
# subsamples the already subsampled data again.
def _is_every_hundredth(example, index):
    """Return True for rows whose position in the split is divisible by 100."""
    return index % 100 == 0

tokenize_datasets = tokenize_datasets.filter(_is_every_hundredth, with_indices=True)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [13]:
# Report the (rows, columns) shape of each split, then the DatasetDict itself.
print("Shapes of the datasets:")
for label, split in (("Training", "train"), ("Validation", "validation")):
    print(f"{label}: {tokenize_datasets[split].shape}")
print(f"Test: {tokenize_datasets['test'].shape}\n")

print(tokenize_datasets)

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


##### 2.2 - Full Fine-Tuning Model with the Preprocessed Dataset

In [14]:
# Every run writes its checkpoints to its own timestamped directory.
output_dir = f'./dialogue-summary-training-{int(time.time())}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,  # a positive max_steps takes precedence over num_train_epochs
)

# Fine-tune a freshly loaded copy of the checkpoint. The Trainer updates the
# model it receives in place, so handing it original_model would overwrite the
# baseline weights that later cells still report as "ORIGINAL MODEL".
full_finetune_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

trainer = Trainer(
    model=full_finetune_model,
    args=training_args,
    train_dataset=tokenize_datasets['train'],
    eval_dataset=tokenize_datasets['validation'],
)

In [63]:
# Run the fine-tuning job configured above.
trainer.train()

# Directory that receives both the fine-tuned weights and the tokenizer files.
trainer_model_path = './flan-dialogue-summary-checkpoint-local'

trainer.save_model(trainer_model_path)
# Tokenizers are persisted with save_pretrained(); they have no save_model() method.
tokenizer.save_pretrained(trainer_model_path)



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Load the fully fine-tuned checkpoint from its local directory, in bfloat16.
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(
    './flan-dialogue-summary-checkpoint-local',
    torch_dtype=torch.bfloat16,
)

##### 2.3 - Evaluate the Model Qualitatively (Human Evaluation)

In [None]:
# Compare the original and the fully fine-tuned model on one test example.
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
human_baseline_summary = test_example['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids
# Both models generate with the same settings: at most 200 new tokens, one beam.
single_beam_config = GenerationConfig(max_new_tokens=200, num_beams=1)

original_model_outputs = original_model.generate(input_ids, generation_config=single_beam_config)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids, generation_config=single_beam_config)
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

for heading, text in (
    ("BASELINE HUMAN SUMMARY", human_baseline_summary),
    ("ORIGINAL MODEL", original_model_text_output),
    ("INSTRUCT MODEL", instruct_model_text_output),
):
    print(horizontal)
    print(f"{heading}:\n{text}")

##### 2.4 - Evaluate the Model Quantitatively (with ROUGE Metric)

In [None]:
# Load the ROUGE metric used for the quantitative comparison below.
rouge = evaluate.load("rouge")

In [None]:
# Generate summaries for a sample of the test dataset (only 10 dialogues and summaries to save time)
test_sample = dataset['test'][0:10]
dialogues = test_sample['dialogue']
human_baseline_summaries = test_sample['summary']

original_model_summaries = []
instruct_model_summaries = []

# Both models generate with the same settings: at most 200 new tokens.
sample_generation_config = GenerationConfig(max_new_tokens=200)

for dialogue in dialogues:
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=sample_generation_config)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=sample_generation_config)
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

# One row per dialogue: the human reference next to each model's summary.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

summary_columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries']
df = pd.DataFrame(zipped_summaries, columns=summary_columns)
df

In [None]:
# Score each model's summaries against the human references with ROUGE.
rouge_options = dict(use_aggregator=True, use_stemmer=True)

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    # Trim the references to the number of predictions that were generated.
    references=human_baseline_summaries[:len(original_model_summaries)],
    **rouge_options,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[:len(instruct_model_summaries)],
    **rouge_options,
)

for heading, results in (
    ('ORIGINAL MODEL:', original_model_results),
    ('INSTRUCT MODEL:', instruct_model_results),
):
    print(heading)
    print(results)

#### 3 - Perform Parameter Efficient Fine-Tuning (PEFT)

##### 3.1 - Setup the PEFT/LoRA model for Fine Tuning

In [21]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings for the adapter that is attached to the model in the next cell.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    target_modules=["q", "v"],
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [24]:
# Add LoRA adapter layers/parameters to a freshly loaded base model.
# get_peft_model() injects the adapter layers into the model it is given, so
# wrapping original_model would alter the baseline that later cells still use
# for their "ORIGINAL MODEL" output. Training on a fresh base also matches the
# later reload, which attaches the saved adapter to a fresh checkpoint.
peft_model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16),
    lora_config,
)
print(print_number_of_trainable_model_parameters(peft_model))

Trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41 %


##### 3.2 - Train PEFT Adapter

In [25]:
# Define training arguments and create the Trainer instance for the PEFT model.
# Every run writes its checkpoints to its own timestamped directory.
output_dir = f'./peft-dialogue-summary-training-{int(time.time())}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,  # Higher learning rate than full fine-tuning
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1,
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenize_datasets["train"],
)

In [26]:
# Train the PEFT adapter, then save it together with the tokenizer.
peft_model_path = "./peft-dialogue-summary-checkpoint-local"

peft_trainer.train()

trained_adapter_model = peft_trainer.model
trained_adapter_model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from peft import PeftModel, PeftConfig

# Reload a base model and tokenizer, then attach the adapter saved on disk.
base_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint, torch_dtype=torch.bfloat16)

peft_model = PeftModel.from_pretrained(
    peft_model_base,
    './peft-dialogue-summary-checkpoint-local',
    torch_dtype=torch.bfloat16,
    is_trainable=False,  # the adapter is only used for inference from here on
)

In [None]:
# The number of trainable parameters will be 0 because the adapter was loaded with is_trainable=False
print(print_number_of_trainable_model_parameters(peft_model))

##### 3.3 - Evaluate the Model Qualitatively (Human Evaluation)

In [None]:
# Make inferences for the same example as in section 1.3 and 2.3, with the original model, fully fine-tuned and PEFT model
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
human_baseline_summary = test_example['summary']

prompt=f"""
Summarize the following conversation.

{dialogue}

Summary:"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids
# All three models generate with the same settings: at most 200 new tokens, one beam.
single_beam_config = GenerationConfig(max_new_tokens=200, num_beams=1)

original_model_outputs = original_model.generate(input_ids, generation_config=single_beam_config)
original_model_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids, generation_config=single_beam_config)
instruct_model_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

# The PEFT wrapper's generate() accepts keyword arguments only in the pinned
# peft release, so input_ids must be passed by name.
peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=single_beam_config)
peft_model_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(horizontal)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(horizontal)
print(f'MODEL SUMMARY:\n{original_model_output}')
print(horizontal)
print(f'INSTRUCT MODEL SUMMARY:\n{instruct_model_output}')
print(horizontal)
print(f'PEFT MODEL SUMMARY:\n{peft_model_output}')

##### 3.4 - Evaluate the Model Quantitatively (with ROUGE Metric)

In [None]:
# Perform inferences for the sample of the test dataset (only 10 dialogues and summaries to save time)

test_sample = dataset['test'][0:10]
dialogues = test_sample['dialogue']
human_baseline_summaries = test_sample['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for dialogue in dialogues:
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids

    # generate() returns a batch of sequences while decode() takes a single
    # sequence, so the first (and only) row is selected with [0] each time.
    original_model_outputs = original_model.generate(input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

    # The PEFT wrapper's generate() accepts keyword arguments only in the
    # pinned peft release, so input_ids is passed by name. The tokens produced
    # in this iteration are decoded, with special tokens stripped like the
    # other two models.
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)

# One row per dialogue: the human reference next to each model's summary.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries,
                  columns = ['human_baseline_summaries','original_model_summaries','instruct_model_summaries','peft_model_summaries'])
df

In [None]:
# Compute ROUGE scores for this subset of the data, one result per model.
rouge = evaluate.load('rouge')
rouge_options = dict(use_aggregator=True, use_stemmer=True)

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    # Trim the references to the number of predictions that were generated.
    references=human_baseline_summaries[:len(original_model_summaries)],
    **rouge_options,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[:len(instruct_model_summaries)],
    **rouge_options,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[:len(peft_model_summaries)],
    **rouge_options,
)

for heading, results in (
    ('ORIGINAL MODEL', original_model_results),
    ('INSTRUCT MODEL', instruct_model_results),
    ('PEFT MODEL', peft_model_results),
):
    print(heading)
    print(results)