<a href="https://colab.research.google.com/github/ShilpaNipunage/Learning_AI/blob/main/Lab2_PEFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Fine-Tuning (PEFT) a Generative AI Model for Dialogue Summarization

* This notebook uses the existing FLAN-T5 model from Hugging Face.
* It performs Parameter-Efficient Fine-Tuning (PEFT) and evaluates the results with ROUGE metrics.

In [None]:
# Environment setup: upgrade pip itself, then install the notebook's
# dependencies at exact pinned versions (--quiet suppresses most pip output).
!pip install --upgrade pip
# Pinned torch / torchdata.
!pip install --disable-pip-version-check \
  torch==1.13.1 \
  torchdata==0.5.1 --quiet
# Pinned Hugging Face libraries, the ROUGE scorer and the LoRA/PEFT packages.
!pip install transformers==4.27.2 \
  datasets==2.11.0 \
  evaluate==0.4.0 \
  rouge_score==0.1.2 \
  loralib==0.1.1 \
  peft==0.3.0 --quiet

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K 

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import  evaluate
import pandas as pd
import numpy as np

In [None]:
# Load the "knkarthick/dialogsum" dialogue-summarization dataset.
# The bare name on the last line displays the splits and their columns.
dataset = load_dataset("knkarthick/dialogsum")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading and preparing dataset csv/knkarthick--dialogsum to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [None]:
# Load the base LLM and its matching tokenizer from the same checkpoint name.
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
#Prepare the dataset for fine tuning
def tokenize_data(input):
  """Tokenize a batch of dialogues and summaries for seq2seq fine-tuning.

  Each dialogue is wrapped in an instruction prompt and tokenized into
  `input_ids`; each reference summary is tokenized into `labels`.

  Args:
    input: A batch mapping with a "dialogue" list and a "summary" list of
      strings, as passed by `dataset.map(..., batched=True)`. The parameter
      name shadows the `input` builtin inside this function; it is kept so
      existing callers are unaffected.

  Returns:
    The same mapping with two keys added: "input_ids" (token ids of the
    prompted dialogues) and "labels" (token ids of the summaries, with
    padding positions replaced by -100).
  """
  instruction = "Summarize the below dialogue:\n\n"
  # The leading blank line keeps the marker from being glued onto the last
  # word of the dialogue.
  summary = "\n\nSummary:\n\n"

  prompt = [instruction + dialogue + summary for dialogue in input["dialogue"]]

  input['input_ids'] = tokenizer(prompt,
                                 padding = "max_length",
                                 truncation = True,
                                 return_tensors = 'pt').input_ids

  labels = tokenizer(input["summary"],
                     padding = "max_length",
                     truncation = True,
                     return_tensors = 'pt').input_ids
  # -100 is the label value the loss ignores; without this the model is also
  # trained to predict the padding that fills every summary up to max length.
  labels[labels == tokenizer.pad_token_id] = -100
  input['labels'] = labels

  return input

# Apply the tokenizer function to every split, one batch of rows at a time,
# then display the resulting splits.
tokenized_data = dataset.map(tokenize_data, batched=True)
tokenized_data

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 500
    })
})

In [None]:
# Remove the unnecessary columns: drop the raw text/metadata fields so that
# only the tokenized 'input_ids' and 'labels' remain.
tokenized_data = tokenized_data.remove_columns(['id', 'dialogue', 'summary', 'topic'])
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
})

In [None]:
# Subsample the dataset to save time: keep only the rows whose index is a
# multiple of 100 in every split.
tokenized_data = tokenized_data.filter(
    lambda _example, idx: idx % 100 == 0,
    with_indices=True,
)
tokenized_data

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
})

### Parameter Efficient Fine Tuning
Perform PEFT on the model


In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter configuration.
# NOTE(review): the saved output of this cell shows r=32 and lora_dropout=0.05,
# so it predates the values below; re-run the cell to refresh it.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5 is loaded as a seq2seq LM
    r=8,  # rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"],
    bias="none",
)
lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path=None, task_type=<TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>, inference_mode=False, r=32, target_modules=['q', 'v'], lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True)

In [None]:
def print_trainable_model_parameters(model):
  """Print how many of the model's parameters are trainable.

  Walks `model.named_parameters()`, totals the element counts of all
  parameters and of those with `requires_grad` set, then prints both totals
  followed by the trainable share as a percentage.

  Args:
    model: Any object exposing `named_parameters()` whose items are
      (name, parameter) pairs offering `numel()` and `requires_grad`.
  """
  # One pass over the parameters: remember each size with its trainable flag.
  sizes_and_flags = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
  total = sum(size for size, _ in sizes_and_flags)
  trainable = sum(size for size, needs_grad in sizes_and_flags if needs_grad)

  print(f"trainable model params:{trainable}, \n all model params:{total}")
  print(f"% of trainable params:{100 * trainable / total}")

# Baseline: report the parameter counts of the base model as loaded.
print_trainable_model_parameters(model)

trainable model params:247577856, 
 all model params:247577856
% of trainable params:100.0


In [None]:
# Add the LoRA adapter/parameters to the original LLM to be trained, then
# report the trainable vs. total parameter counts of the wrapped model.
peft_model = get_peft_model(model, lora_config)
print_trainable_model_parameters(peft_model)

trainable model params:3538944, 
 all model params:251116800
% of trainable params:1.4092820552029972


In [None]:
# Train the PEFT adapter.
output_dir = "./peft_dialog_summary_training_model"

# NOTE(review): in TrainingArguments a positive max_steps takes precedence over
# num_train_epochs, so this configuration stops after one step. Confirm that
# such a short run is intended.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1,
)

# Train on the subsampled "train" split; the "validation" split is the eval set.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
)

peft_trainer.train()

NameError: name 'TrainingArguments' is not defined

In [None]:
# Persist the trainer's model to a local directory; the same path is used
# later in the notebook to reload it.
peft_model_path = "./peft_saved_new_model/"
peft_trainer.save_model(peft_model_path)

### Re-build the PEFT model from the adapter saved above

In [None]:
from peft import PeftModel, PeftConfig

# Load a fresh copy of the base checkpoint, then attach the adapter stored at
# peft_model_path. is_trainable=False: the adapter is loaded for inference,
# not for further training.
peft_base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
peft_model = PeftModel.from_pretrained(
    peft_base_model,
    peft_model_path,
    is_trainable=False,
)

#### 2.3 Evaluate the model using ROUGE metrics

In [None]:
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load('rouge')

In [None]:
# Generate summaries with the fine-tuned model for the first ten test
# dialogues and print them next to the human-written reference summaries.
sample_dialogues = dataset['test'][:10]['dialogue']
sample_summaries = dataset['test'][:10]['summary']
new_model_generated_summaries = []

# Visual separator for the printed comparison.
dash = "-" * 100

# Send the inputs to whichever device the model's weights are on instead of
# hard-coding 'cuda'; the reloaded model is not necessarily on the GPU.
device = next(peft_model.parameters()).device

for i, dialogue in enumerate(sample_dialogues):

  # Must be an f-string: otherwise the literal text "{dialogue}" is sent to
  # the model and every example gets the same prompt.
  prompt = f"""
  Summarize the following conversation:

{dialogue}

  summary: """

  # Named `inputs` so the `input` builtin is not shadowed.
  inputs = tokenizer(prompt,
                     return_tensors = 'pt').to(device)

  # Use the fine-tuned PEFT model, and pass input_ids by keyword because the
  # PEFT seq2seq wrapper's generate() takes keyword arguments.
  output_ids = peft_model.generate(input_ids = inputs['input_ids'],
                                   max_new_tokens = 50)

  output = tokenizer.decode(output_ids[0],
                            skip_special_tokens = True)

  new_model_generated_summaries.append(output)
  print(dash)
  print(f"Human Summary: {sample_summaries[i]}\n{dash}\n")
  print(f"Generated Text: fine tuned: {output}\n{dash}\n")
  print(dash)

In [None]:
# Evaluate the fine-tuned model by computing ROUGE between the generated
# summaries and the human-written reference summaries.
fine_tuned_model_results = rouge.compute(
    predictions = new_model_generated_summaries,
    references = sample_summaries,  # keyword was misspelled "referencs"
    use_aggregator = True,
    use_stemmer = True
)

print(f"results: {fine_tuned_model_results}")