## 1. Installing relevant packages

https://github.dev/amitkumaryada/BIA/blob/main/Fine_tune_FLAN_T5_with_PEFT_LoRA_(deeplearning_ai).ipynb

In [1]:
!pip install -r requirements.txt

[0m

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

  _torch_pytree._register_pytree_node(
W0810 07:02:04.553000 6840 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


#### a. Loading datasets from hugging face

In [3]:
# huggingface_dataset_name = "knkarthick/dialogsum"

# dataset = load_dataset(huggingface_dataset_name)

# dataset
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)


#### b. Loading Google's Flaunt T5 model from hugging face

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

original_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")





#### c. Providing input

In [5]:
# Step 1: Convert the input prompt into token IDs that the model can understand
inputs = tokenizer("A step by step recipe to make pasta:", return_tensors="pt")

# Step 2: Use the model to generate a continuation of the prompt
# This outputs a tensor containing token IDs of the generated text
outputs = original_model.generate(**inputs)

# Step 3: Convert the output token IDs back into readable text, removing special tokens
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


['To make pasta, you can use a steamed pasta.']




#### d. 

In [6]:
def print_number_of_trainable_model_parameters(model):
    """
    Returns the number and percentage of trainable parameters in a PyTorch model.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\n" \
           f"all model parameters: {all_model_params}\n" \
           f"percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

# Call the function on your model
print(print_number_of_trainable_model_parameters(original_model))


trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


#### e. 

In [7]:
# Select the 200th test example from the dataset
index = 200

# Extract the dialogue (input text) and summary (ground truth) from the test set
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

# Create a prompt in instruction format to guide the model for summarization
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize the prompt and return it as PyTorch tensors (required for model input)
inputs = tokenizer(prompt, return_tensors='pt')

# Generate a summary using the model
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],     # Provide tokenized input IDs to the model
        max_new_tokens=200       # Set a cap on how many tokens the model should generate
    )[0],                        # Get the first (and only) generated sequence
    skip_special_tokens=True     # Remove tokens like <pad>, <eos> from output
)

# Create a visual separator for better readability of printed output
dash_line = '-' * 100  # Corrected from original code

# Print the original input prompt
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)

# Print the human-written (reference) summary
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)

# Print the summary generated by the model (zero-shot)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')


----------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

------------------------------------------------------------------

#### f. 

In [9]:
# Define a custom tokenization function to prepare the input and target sequences for training
def tokenize_function(example):
    # Define the beginning and end of the prompt to frame the input
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '

    # Combine the prompt with each dialogue in the batch
    # Note: example["dialogue"] is a list when batched=True
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # Tokenize the input prompts with padding and truncation
    # Output is a dict with keys like input_ids, attention_mask etc.
    # We're extracting only input_ids here and assigning them to the example
    example['input_ids'] = tokenizer(
        prompt,                            # list of full prompts
        padding="max_length",              # pad all to max length in tokenizer config
        truncation=True,                   # truncate if it exceeds max model input
        return_tensors="pt"                # return PyTorch tensors
    ).input_ids

    # Tokenize the reference summaries (targets) similarly
    example['labels'] = tokenizer(
        example["summary"],                # list of ground truth summaries
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).input_ids

    # Return the modified example with input_ids and labels
    return example


In [10]:

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)


In [11]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [12]:
import os
os.environ["WANDB_DISABLED"] = "true"

# output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
output_dir="./results"

from transformers import TrainingArguments, Trainer

# Set up training arguments for the Trainer API
training_args = TrainingArguments(
    output_dir=output_dir,       # Where to save checkpoints and logs
    learning_rate=1e-5,          # Learning rate for optimizer
    num_train_epochs=2,          # Total number of training epochs
    weight_decay=0.01,           # Regularization to avoid overfitting
    logging_steps=1,             # Log every 1 step (frequent logging for debugging)
    max_steps=1,                 # üîÅ WARNING: Only runs 1 training step! Use for quick test only!
    report_to=None,              # Disables logging to WandB, TensorBoard etc.
    # save = "./checkpoint"      # ‚ùå Not a valid Trainer arg; remove or replace with `save_strategy`
)

# Initialize the Trainer class with model, training arguments, and datasets
trainer = Trainer(
    model=original_model,                     # The model to fine-tune
    args=training_args,                       # Training configurations
    train_dataset=tokenized_datasets['train'],   # Training data
    eval_dataset=tokenized_datasets['validation'] # Validation data for evaluation
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


#### g. 

In [13]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [14]:
# --------------------------
# Define Global Parameters
# --------------------------
L_RATE = 3e-4                  # Learning rate for the AdamW optimizer
BATCH_SIZE = 8                 # Training batch size per device (GPU/CPU)
PER_DEVICE_EVAL_BATCH = 4      # Evaluation batch size per device
WEIGHT_DECAY = 0.01            # L2 regularization strength to prevent overfitting
SAVE_TOTAL_LIM = 3             # Keep only the latest 3 saved checkpoints
NUM_EPOCHS = 3                 # Total number of training epochs


from transformers import Seq2SeqTrainingArguments

# --------------------------
# Set up Training Arguments
# --------------------------
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",             # Directory to save checkpoints and logs
   save_strategy="epoch",              # Save model at the end of every epoch
   evaluation_strategy="epoch",        # Run evaluation at the end of every epoch
   learning_rate=L_RATE,               # Learning rate for training
   per_device_train_batch_size=BATCH_SIZE,  # Training batch size per GPU/CPU
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,  # Evaluation batch size
   weight_decay=WEIGHT_DECAY,          # Weight decay for regularization
   save_total_limit=SAVE_TOTAL_LIM,    # Keep only last N checkpoints (avoid clutter)
   num_train_epochs=NUM_EPOCHS,        # Number of epochs to train the model
   predict_with_generate=True,         # Generate predictions during evaluation (needed for seq2seq)
   push_to_hub=False                   # Don‚Äôt push model to Hugging Face Hub after training
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
from transformers import Seq2SeqTrainer

# Initialize the trainer for sequence-to-sequence fine-tuning
trainer = Seq2SeqTrainer(
   model=original_model,                # The pre-trained model to fine-tune (e.g., FLAN-T5)
   args=training_args,                  # TrainingArguments or Seq2SeqTrainingArguments configured earlier
   train_dataset=tokenized_datasets["train"],        # Tokenized training dataset
   eval_dataset=tokenized_datasets["validation"],    # Tokenized validation dataset
   tokenizer=tokenizer,                # Required so `generate()` can decode input/output properly

   # OPTIONAL: You can use this if you're manually defining a data collator
   # This is especially useful if using mixed-length input sequences and want dynamic padding
   # data_collator=data_collator,

   # OPTIONAL: Use this to compute custom metrics like ROUGE during evaluation
   # Needs to be a function that takes EvalPrediction and returns a dict of metric values
   # compute_metrics=compute_metrics
)


In [16]:
trainer.train()

  0%|          | 0/48 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/2 [00:00<?, ?it/s]

Checkpoint destination directory ./results/checkpoint-16 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 5.053508281707764, 'eval_runtime': 0.2883, 'eval_samples_per_second': 17.342, 'eval_steps_per_second': 6.937, 'epoch': 1.0}




  0%|          | 0/2 [00:00<?, ?it/s]

Checkpoint destination directory ./results/checkpoint-32 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 3.704914093017578, 'eval_runtime': 0.1838, 'eval_samples_per_second': 27.2, 'eval_steps_per_second': 10.88, 'epoch': 2.0}




  0%|          | 0/2 [00:00<?, ?it/s]

Checkpoint destination directory ./results/checkpoint-48 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 3.192478656768799, 'eval_runtime': 0.1836, 'eval_samples_per_second': 27.231, 'eval_steps_per_second': 10.893, 'epoch': 3.0}
{'train_runtime': 55.3771, 'train_samples_per_second': 6.772, 'train_steps_per_second': 0.867, 'train_loss': 9.056327819824219, 'epoch': 3.0}


TrainOutput(global_step=48, training_loss=9.056327819824219, metrics={'train_runtime': 55.3771, 'train_samples_per_second': 6.772, 'train_steps_per_second': 0.867, 'train_loss': 9.056327819824219, 'epoch': 3.0})

In [17]:
!ls

1. FineTuning & Eval.ipynb
2. Fine tuning_PEFT_LoRA - Google's Flaunt T5.ipynb
[34mDataSet[m[m
[34mpeft-dialogue-summary-checkpoint-local[m[m
[34mpeft-dialogue-summary-training-1753710836[m[m
[34mpeft-dialogue-summary-training-1753795343[m[m
requirements.txt
[34mresults[m[m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
original_model = original_model.to('cpu') #manually moving the model to the CPU

In [19]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("/Users/sidharthrao/Documents/Documents - Sidharth‚Äôs MacBook Pro/GitHub/Project-Dash/2. GenAI and AgenticAI/ii. Class Sessions/d. Finetuning & Evaluation/results/checkpoint-16", torch_dtype=torch.bfloat16).to('cpu')

In [20]:
# --------------------------------------------------
# Step 1: Select test example from dataset
# --------------------------------------------------
index = 200  # Index of the sample you want to evaluate

# Extract the dialogue and its human-written summary
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

# --------------------------------------------------
# Step 2: Prepare the summarization prompt
# --------------------------------------------------
# This prompt works well with instruction-tuned models like FLAN-T5
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# --------------------------------------------------
# Step 3: Tokenize the prompt to get input IDs
# --------------------------------------------------
# Convert the prompt into PyTorch tensors compatible with the model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# --------------------------------------------------
# Step 4: Generate output from the original model
# --------------------------------------------------
from transformers import GenerationConfig

original_model_outputs = original_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(
        max_new_tokens=200,     # Cap the number of generated tokens
        num_beams=1             # Greedy decoding (no beam search)
    )
)

# Decode model output (token IDs) to readable text
original_model_text_output = tokenizer.decode(
    original_model_outputs[0], skip_special_tokens=True
).strip()

# --------------------------------------------------
# Step 5: Generate output from the instruction-tuned model
# --------------------------------------------------
instruct_model_outputs = instruct_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(
        max_new_tokens=200,
        num_beams=1
    )
)

# Decode instruct model output
instruct_model_text_output = tokenizer.decode(
    instruct_model_outputs[0], skip_special_tokens=True
).strip()

# --------------------------------------------------
# Step 6: Print results side-by-side for qualitative comparison
# --------------------------------------------------

# Create a horizontal line separator for cleaner printing
dash_line = '-' * 100

# Print the human reference summary
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')

# Print the output from the original model
print(dash_line)
print(f'ORIGINAL MODEL (e.g., base FLAN-T5):\n{original_model_text_output}')

# Print the output from the instruction-tuned model
print(dash_line)
print(f'INSTRUCT MODEL (e.g., fine-tuned FLAN-T5):\n{instruct_model_text_output}')


----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
----------------------------------------------------------------------------------------------------
ORIGINAL MODEL (e.g., base FLAN-T5):
Sharing programs are a great way to make up your own flyers and banners. #Person1#Person2#Person2#Person3#Person3#Person3#Person3#Person3#Person3#Person3#Person2#Person3#Person3#Person3#Person3#Person2#Person3#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#P
----------------------------------------------------------------------------------------------------
INSTRUCT MODEL (e.g., fine-tuned FLAN-T5):
Person is a person.


#### Evaluation

In [21]:
rouge = evaluate.load('rouge')
rouge

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value('string'), 'references': List(Value('string'))}, {'predictions': Value('string'), 'references': Value('string')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
 

https://github.dev/amitkumaryada/BIA/blob/main/Fine_tune_FLAN_T5_with_PEFT_LoRA_(deeplearning_ai).ipynb

In [22]:
# Extract the first 10 dialogues from the test dataset
dialogues = dataset['test'][0:10]['dialogue']

# Extract the corresponding first 10 human-written summaries (baseline)
human_baseline_summaries = dataset['test'][0:10]['summary']

# Initialize empty lists to store summaries generated by each model
original_model_summaries = []
instruct_model_summaries = []

# Loop through each dialogue to generate summaries
for _, dialogue in enumerate(dialogues):
    
    # Create a summarization prompt for the current dialogue
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    
    # Tokenize the prompt into input IDs for the model
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # ---------- Original Model Summarization ----------
    # Generate output tokens from the original model
    original_model_outputs = original_model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(max_new_tokens=200)  # Limit output length
    )
    # Decode the token IDs into human-readable text
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    # Append the generated summary to the original model's list
    original_model_summaries.append(original_model_text_output)

    # ---------- Instruct Model Summarization ----------
    # Generate output tokens from the instruct-tuned model
    instruct_model_outputs = instruct_model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(max_new_tokens=200)  # Limit output length
    )
    # Decode the token IDs into human-readable text
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    # Append the generated summary to the instruct model's list
    instruct_model_summaries.append(instruct_model_text_output)

# Combine human, original model, and instruct model summaries into tuples
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

# Convert the combined list into a pandas DataFrame for comparison and analysis
df = pd.DataFrame(zipped_summaries, columns=[
    'human_baseline_summaries',
    'original_model_summaries',
    'instruct_model_summaries'
])

# Display the DataFrame to compare all three summary types
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,@Person1#Person1#Person2#Person2#Person3#Perso...,Employees should be allowed to use Instant Mes...
1,In order to prevent employees from wasting tim...,@Person1#Person1#Person2#Person2#Person3#Perso...,Employees should be allowed to use Instant Mes...
2,Ms. Dawson takes a dictation for #Person1# abo...,@Person1#Person1#Person2#Person2#Person3#Perso...,Employees should be allowed to use Instant Mes...
3,#Person2# arrives late because of traffic jam....,@Person1#Person2#Person2#Person2#Person2#Perso...,Person is a person.
4,#Person2# decides to follow #Person1#'s sugges...,@Person1#Person2#Person2#Person2#Person2#Perso...,Person is a person.
5,#Person2# complains to #Person1# about the tra...,@Person1#Person2#Person2#Person2#Person2#Perso...,Person is a person.
6,#Person1# tells Kate that Masha and Hero get d...,@Person1#Person2#Person2#Person2#Person3#Perso...,Masha and Hero are getting divorced. #Person
7,#Person1# tells Kate that Masha and Hero are g...,@Person1#Person2#Person2#Person2#Person3#Perso...,Masha and Hero are getting divorced. #Person
8,#Person1# and Kate talk about the divorce betw...,@Person1#Person2#Person2#Person2#Person3#Perso...,Masha and Hero are getting divorced. #Person
9,#Person1# and Brian are at the birthday party ...,@Person1#Person2#Person2#Person3#Person3#Perso...,"Happy Birthday, Brian. #Person"


In [23]:
# ---------- ROUGE Evaluation for Original Model ----------
# Compute ROUGE scores comparing:
# - predictions: summaries generated by the original model
# - references: corresponding human-written summaries (baseline)
# The use of `use_aggregator=True` returns the average score across all samples,
# and `use_stemmer=True` ensures words are reduced to their base/root form for better matching.
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# ---------- ROUGE Evaluation for Instruct Model ----------
# Same process as above, but evaluating the instruct-tuned model's summaries.
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# ---------- Display Results ----------
# Print the ROUGE scores for both models for comparison.
print('ORIGINAL MODEL:')
print(original_model_results)

print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.04119581122195633), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.033130979994113734), 'rougeLsum': np.float64(0.03301162146846246)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.22960386633398955), 'rouge2': np.float64(0.09172222222222223), 'rougeL': np.float64(0.19423905073401154), 'rougeLsum': np.float64(0.19505676605340655)}


#### Performing PEFT (Above steps we did Full fine tuning)

In [27]:
from peft import LoraConfig, get_peft_model, TaskType

# Define the LoRA (Low-Rank Adaptation) configuration for fine-tuning
lora_config = LoraConfig(
    r=32,                  # Rank of the low-rank matrices.
                           # Higher values allow capturing more task-specific information,
                           # but increase the number of trainable parameters.

    lora_alpha=32,         # Scaling factor for LoRA updates.
                           # Controls how much the LoRA weights influence the final output.

    target_modules=["q", "v"],  # List of target submodules inside the model where LoRA will be applied.
                                # "q" and "v" refer to query and value projection layers in the attention mechanism.

    lora_dropout=0.05,     # Dropout probability applied to LoRA layers during training to reduce overfitting.

    bias="none",           # Specifies how to handle biases in target modules:
                           #   - "none" means no bias parameters are adapted.
                           #   - "all" means all bias parameters are adapted.
                           #   - "lora_only" adapts only biases for LoRA layers.

    task_type=TaskType.SEQ_2_SEQ_LM  # Task type ‚Äî here it's Sequence-to-Sequence Language Modeling
                                     # which matches FLAN-T5 (encoder-decoder architecture).
)

#### Reducing parameters to fine tune (Earlier - We took entire 78337408 parameters)

In [28]:
# Apply the LoRA configuration to the original model
# This wraps the base model with LoRA layers so only selected parameters are fine-tuned.
peft_model = get_peft_model(original_model, lora_config)

# Print the number of trainable parameters vs total parameters
# This helps confirm that LoRA is freezing most of the model's weights,
# and only a small percentage is actually trainable (reducing compute needs).
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 1376256
all model parameters: 78337408
percentage of trainable model parameters: 1.76%


In [30]:
# Create a unique output directory for saving model checkpoints
# The directory name includes a Unix timestamp so each run gets its own folder.
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

# Define training arguments for the Hugging Face Trainer
peft_training_args = TrainingArguments(
    output_dir=output_dir,        # Where to save checkpoints and logs
    auto_find_batch_size=True,    # Automatically find the largest batch size that fits in GPU memory
    learning_rate=1e-3,           # Higher learning rate because LoRA fine-tunes fewer parameters
    num_train_epochs=1,           # Number of training epochs (full passes through dataset)
    logging_steps=1,               # Log metrics after every step (useful for debugging)
    max_steps=1                    # Limit total steps to 1 (likely for a quick smoke test)
)

# Create a Trainer instance for PEFT fine-tuning
peft_trainer = Trainer(
    model=peft_model,                              # LoRA-wrapped base model
    args=peft_training_args,                       # Training configuration
    train_dataset=tokenized_datasets["train"],     # Tokenized training dataset
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [33]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

adapter_dir = "./peft-dialogue-summary-checkpoint-local"

# 1) Read adapter metadata to get the original base checkpoint
peft_cfg = PeftConfig.from_pretrained(adapter_dir)
print("Adapter expects base:", peft_cfg.base_model_name_or_path)

# 2) Load the exact same base + tokenizer used for training
base_id = peft_cfg.base_model_name_or_path  # e.g., "google/flan-t5-base"
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(base_id)
tokenizer = AutoTokenizer.from_pretrained(base_id)

# 3) Stitch the adapter onto the correct base
peft_model = PeftModel.from_pretrained(
    peft_model_base,
    adapter_dir,
    is_trainable=False
)

# 4) Device for Mac Silicon (MPS) or CPU fallback
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
peft_model = peft_model.to(device)
peft_model.eval()

tokenizer.padding_side = "left"
tokenizer.truncation_side = "right"

Adapter expects base: google/flan-t5-base


In [34]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable model parameters: 0.00%


In [35]:
# Move LoRA-adapted PEFT model to CPU
peft_model = peft_model.to('cpu')

# Move instruct-tuned model to CPU
instruct_model = instruct_model.to('cpu')

# Move original base model to CPU
original_model = original_model.to('cpu')

In [36]:
# Select an example index from the test dataset
index = 200

# Retrieve the dialogue text for the selected index
dialogue = dataset['test'][index]['dialogue']

# Retrieve the human-written baseline summary for the selected index
baseline_human_summary = dataset['test'][index]['summary']

# Build the prompt that will be passed to each model for summarization
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

# Tokenize the prompt into input IDs for the models
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# ---------------- ORIGINAL MODEL ----------------
# Generate the summary using the original model with specified generation settings
original_model_outputs = original_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1)  # max tokens and beam search config
)
# Decode the generated token IDs back into text
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# ---------------- INSTRUCT MODEL ----------------
# Generate the summary using the instruct-tuned model
instruct_model_outputs = instruct_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1)
)
# Decode the generated token IDs into text
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

# ---------------- PEFT (LoRA) MODEL ----------------
# Generate the summary using the LoRA fine-tuned model
peft_model_outputs = peft_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(max_new_tokens=200, num_beams=1)
)
# Decode the generated token IDs into text
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

# Print a dashed line separator
print(dash_line)

# Print the human baseline summary
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')  # NOTE: variable name should match baseline_human_summary above

# Print separator
print(dash_line)

# Print original model's generated summary
print(f'ORIGINAL MODEL:\n{original_model_text_output}')

# Print separator
print(dash_line)

# Print instruct model's generated summary
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

# Print separator
print(dash_line)

# Print PEFT model's generated summary
print(f'PEFT MODEL: {peft_model_text_output}')

----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
----------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Sharing programs are a great way to make up your own flyers and banners. #Person1#Person2#Person2#Person3#Person3#Person3#Person3#Person3#Person3#Person3#Person2#Person3#Person3#Person3#Person3#Person2#Person3#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#Person3#Person3#Person2#P
----------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
Person is a person.
-------------------------------------------------------------------------------------------------

In [37]:
# Extract the first 10 dialogues from the test dataset
dialogues = dataset['test'][0:10]['dialogue']

# Extract the corresponding first 10 human-written summaries (baseline)
human_baseline_summaries = dataset['test'][0:10]['summary']

# Initialize empty lists to store model-generated summaries
original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

# Loop through each dialogue to generate summaries from all models
for idx, dialogue in enumerate(dialogues):

    # Build the summarization prompt for the current dialogue
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    # Tokenize the prompt into input IDs for the models
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # Retrieve the corresponding human-written summary for reference
    human_baseline_text_output = human_baseline_summaries[idx]

    # ---------------- ORIGINAL MODEL ----------------
    # Generate summary using the original (base) model
    original_model_outputs = original_model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(max_new_tokens=200)
    )
    # Decode tokens to text
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    # ---------------- INSTRUCT MODEL ----------------
    # Generate summary using the instruct-tuned model
    instruct_model_outputs = instruct_model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(max_new_tokens=200)
    )
    # Decode tokens to text
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    # ---------------- PEFT (LoRA) MODEL ----------------
    # Generate summary using the LoRA fine-tuned model
    peft_model_outputs = peft_model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(max_new_tokens=200)
    )
    # Decode tokens to text
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    # Append results to respective lists
    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

# Combine all summaries into a single list of tuples
zipped_summaries = list(zip(
    human_baseline_summaries,
    original_model_summaries,
    instruct_model_summaries,
    peft_model_summaries
))

# Convert the zipped summaries into a DataFrame for easy comparison
df = pd.DataFrame(
    zipped_summaries,
    columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries']
)

# Display the DataFrame
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,@Person1#Person1#Person2#Person2#Person3#Perso...,Employees should be allowed to use Instant Mes...,This memo is to be distributed to all employee...
1,In order to prevent employees from wasting tim...,@Person1#Person1#Person2#Person2#Person3#Perso...,Employees should be allowed to use Instant Mes...,This memo is to be distributed to all employee...
2,Ms. Dawson takes a dictation for #Person1# abo...,@Person1#Person1#Person2#Person2#Person3#Perso...,Employees should be allowed to use Instant Mes...,This memo is to be distributed to all employee...
3,#Person2# arrives late because of traffic jam....,@Person1#Person2#Person2#Person2#Person2#Perso...,Person is a person.,Take public transport to work.
4,#Person2# decides to follow #Person1#'s sugges...,@Person1#Person2#Person2#Person2#Person2#Perso...,Person is a person.,Take public transport to work.
5,#Person2# complains to #Person1# about the tra...,@Person1#Person2#Person2#Person2#Person2#Perso...,Person is a person.,Take public transport to work.
6,#Person1# tells Kate that Masha and Hero get d...,@Person1#Person2#Person2#Person2#Person3#Perso...,Masha and Hero are getting divorced. #Person,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,@Person1#Person2#Person2#Person2#Person3#Perso...,Masha and Hero are getting divorced. #Person,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,@Person1#Person2#Person2#Person2#Person3#Perso...,Masha and Hero are getting divorced. #Person,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,@Person1#Person2#Person2#Person3#Person3#Perso...,"Happy Birthday, Brian. #Person",Brian's birthday is coming up.


In [38]:
# Load the ROUGE evaluation metric from Hugging Face's 'evaluate' library
rouge = evaluate.load('rouge')

# ---------------- ORIGINAL MODEL ----------------
# Compute ROUGE scores comparing:
# - predictions: summaries generated by the original model
# - references: corresponding human-written baseline summaries
# The slicing ensures matching lengths between predictions and references.
# 'use_aggregator=True' ‚Üí returns average scores across all samples.
# 'use_stemmer=True' ‚Üí applies stemming to handle different word forms (e.g., "run" vs. "running").
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# ---------------- INSTRUCT MODEL ----------------
# Compute ROUGE scores for the instruct-tuned model vs. human summaries
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# ---------------- PEFT (LoRA) MODEL ----------------
# Compute ROUGE scores for the LoRA fine-tuned model vs. human summaries
peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

# ---------------- PRINT RESULTS ----------------
# Print the aggregated ROUGE scores for each model
print('ORIGINAL MODEL:')
print(original_model_results)

print('INSTRUCT MODEL:')
print(instruct_model_results)

print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.04119581122195633), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.033130979994113734), 'rougeLsum': np.float64(0.03301162146846246)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.22960386633398955), 'rouge2': np.float64(0.09172222222222223), 'rougeL': np.float64(0.19423905073401154), 'rougeLsum': np.float64(0.19505676605340655)}
PEFT MODEL:
{'rouge1': np.float64(0.29970979020979016), 'rouge2': np.float64(0.14344664031620552), 'rougeL': np.float64(0.24626456876456876), 'rougeLsum': np.float64(0.24932465682465677)}
