### 1. Hugging Face Token

In [None]:
# Set your Hugging Face token as an environment variable
# For Linux/Mac: export HF_TOKEN="your_hf_token_here"
# For Windows: set HF_TOKEN=your_hf_token_here
# Or use: os.environ['HF_TOKEN'] = "your_hf_token_here"

zsh:1: bad assignment


For window - 
import os
os.environ['HF_TOKEN'] = "token"

### 2. Check for latest version of datasets library in pypl, if it is different from the one in requriements.txt upgrade the same

### 3.A loading datasets from hugging face

Link - https://huggingface.co/datasets


Dataset used - knkarthick/dialogsum (Check in search bar)


**Reason for selecting this dataset →**  
a. Data contains long-form conversations paired with human-written summaries, which act as reference sentences. Using these references,  
b. we evaluate Large Language Models (LLMs) on their ability to generate accurate summaries.  
c. The LLM-generated summaries (candidate sentences) are compared against the human references.  
d. BLEU and ROUGE scores are then calculated to measure similarity and assess the quality of the model’s summarization.  

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset("knkarthick/dialogsum")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [5]:
# Training dataset
dataset['train'][0]

{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

In [6]:
# Validation dataset
dataset["validation"][0]

{'id': 'dev_0',
 'dialogue': "#Person1#: Hello, how are you doing today?\n#Person2#: I ' Ve been having trouble breathing lately.\n#Person1#: Have you had any type of cold lately?\n#Person2#: No, I haven ' t had a cold. I just have a heavy feeling in my chest when I try to breathe.\n#Person1#: Do you have any allergies that you know of?\n#Person2#: No, I don ' t have any allergies that I know of.\n#Person1#: Does this happen all the time or mostly when you are active?\n#Person2#: It happens a lot when I work out.\n#Person1#: I am going to send you to a pulmonary specialist who can run tests on you for asthma.\n#Person2#: Thank you for your help, doctor.",
 'summary': '#Person2# has trouble breathing. The doctor asks #Person2# about it and will send #Person2# to a pulmonary specialist.',
 'topic': 'see a doctor'}

### 3.B loading model from hugging face to help summarize the conversation 

We use model from Hugging face -> Flan-T5 (Base model)

In [7]:
model_name = "google/flan-t5-base"

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
#This will load the model from hugging face and convert it to bfloat16 precision, which is faster for training. without any changes to the model architecture or parameters.

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name) # this is the tokenizer related to flanT5 model (we know tokenizer is different for each model)




### 4. Finetuning

In [8]:
# This def function to check the percentage of trainable model parameters is finetuned

def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [9]:
index = 200

dialogue = dataset["test"][index]["dialogue"]
summary = dataset["test"][index]["summary"] # Reference sentence

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""


inputs = tokenizer(prompt, return_tensors="pt") # Converting the prompt into tokens
output = tokenizer.decode(original_model.generate(inputs["input_ids"], max_new_tokens=200)[0], skip_special_tokens=True) 

print(prompt)
print(output)


Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

#Person1#: I'm thinking of upgrading my computer.


In [10]:
dash_line = '-'.join('' for x in range(100))


print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')

print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

#### a. Calcualte BLEU and ROUGE score directly on the dataset

#### b. Perform Full Finetuning and Calcualte BLEU and ROUGE score

#### c. Perform PEFT and Calcualte BLEU and ROUGE score

In [11]:
#parameter Efficient Fine-Tuning (PEFT)
#Setup the PEFT/LoRA model for Fine-Tuning

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05, # Dropping out 5% of the neurons in DL network
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)



In [12]:
#Add LoRA adapter layers/parameters to the original LLM to be trained.
peft_model = get_peft_model(original_model,
                            lora_config)

print(print_number_of_trainable_model_parameters(peft_model)) # This is fine tuning only 1.41% of the model parameters.

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [13]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]



    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example



tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])
    

Map: 100%|██████████| 500/500 [00:00<00:00, 3538.70 examples/s]


In [14]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True) # this code helps to filter the dataset to only 100th dialogue in the dataset. This is done to reduce the dataset size to 100.


Filter: 100%|██████████| 500/500 [00:00<00:00, 6527.86 examples/s]


In [15]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [16]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

# Define the fraction of the dataset you want to use (e.g., 10%)
subset_fraction = 0.1

# Calculate the number of samples to use
subset_size = int(len(dataset["train"]) * subset_fraction)

# Create a subset of the dataset
subset_dataset = dataset["train"].shuffle(seed=42).select(range(subset_size))

# Tokenize the subset
tokenized_subset = subset_dataset.map(tokenize_function, batched=True)


print(tokenized_subset)
# Define training arguments and create Trainer instance.

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=4,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=10,
    logging_steps=1,
    max_steps=10
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_subset,
)


Dataset({
    features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
    num_rows: 1246
})


In [None]:
peft_trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint-local"





peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

 10%|█         | 1/10 [09:39<1:26:55, 579.52s/it]

{'loss': 49.75, 'learning_rate': 0.0009000000000000001, 'epoch': 0.01}


 20%|██        | 2/10 [19:38<1:18:49, 591.25s/it]

{'loss': 48.0, 'learning_rate': 0.0008, 'epoch': 0.01}


 30%|███       | 3/10 [29:16<1:08:15, 585.05s/it]

{'loss': 42.0, 'learning_rate': 0.0007, 'epoch': 0.02}


 40%|████      | 4/10 [38:53<58:10, 581.68s/it]  

{'loss': 40.0, 'learning_rate': 0.0006, 'epoch': 0.03}


 50%|█████     | 5/10 [48:27<48:15, 579.04s/it]

{'loss': 34.75, 'learning_rate': 0.0005, 'epoch': 0.03}


 60%|██████    | 6/10 [58:02<38:30, 577.70s/it]

{'loss': 33.25, 'learning_rate': 0.0004, 'epoch': 0.04}


 70%|███████   | 7/10 [1:17:59<38:59, 779.99s/it]

{'loss': 31.0, 'learning_rate': 0.0003, 'epoch': 0.04}


 80%|████████  | 8/10 [1:37:26<30:06, 903.24s/it]

{'loss': 29.375, 'learning_rate': 0.0002, 'epoch': 0.05}


 90%|█████████ | 9/10 [2:05:10<19:01, 1141.17s/it]

{'loss': 28.875, 'learning_rate': 0.0001, 'epoch': 0.06}


100%|██████████| 10/10 [2:14:53<00:00, 809.35s/it]

{'loss': 29.125, 'learning_rate': 0.0, 'epoch': 0.06}
{'train_runtime': 8093.451, 'train_samples_per_second': 0.01, 'train_steps_per_second': 0.001, 'train_loss': 36.6125, 'epoch': 0.06}





('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [18]:
from peft import PeftModel, PeftConfig

# Base model
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

#Finetuned model
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       './peft-dialogue-summary-checkpoint-local',
                                       torch_dtype=torch.bfloat16,
                                       is_traiable=False)

In [20]:
# We will use the same 200th Index to find the results of the finetuned model. and calculate the BLEU and ROUGE scores.

index = 200

dialogue = dataset["test"][index]["dialogue"]
summary = dataset["test"][index]["summary"] # Reference sentence

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize the input prompt and make sure it's on the same device as the model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(original_model.device)

# Generate text from the original model
original_model_outputs = original_model.generate(
    input_ids=input_ids,
    generation_config=GenerationConfig(
        max_new_tokens=200,
        num_beams=2,
        do_sample=False
    )
)

# Decode the output text
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

# Print results
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')


---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1#: I'd like to upgrade my computer. #Person2#: I'm not sure what exactly I'd need. #Person1#: I'd like to add a painting program to my software. #Person2#: I'd like to add a CD-ROM drive.


In [22]:
# We will use the same 200th Index to find the results of the finetuned model. and calculate the BLEU and ROUGE scores.

index = 200

dialogue = dataset["test"][index]["dialogue"]
summary = dataset["test"][index]["summary"] # Reference sentence

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""



# Tokenize the input prompt and make sure it's on the same device as the models
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(original_model.device)

# Move the PEFT model to the same device as the original model
peft_model = peft_model.to(original_model.device)

# Generate summaries using the original model and PEFT model
with torch.no_grad():
    original_model_outputs = original_model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(
            max_new_tokens=200,
            num_beams=2,
            do_sample=False
        )
    )

    peft_model_outputs = peft_model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(
            max_new_tokens=200,
            num_beams=1,
            do_sample=False
        )
    )

# Decode the model outputs
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

# Print all results
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'PEFT MODEL:\n{peft_model_text_output}')
print(dash_line)

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1#: I'd like to upgrade my computer. #Person2#: I'm not sure what exactly I'd need. #Person1#: I'd like to add a painting program to my software. #Person2#: I'd like to add a CD-ROM drive.
---------------------------------------------------------------------------------------------------
PEFT MODEL:
Upgrade your computer.
---------------------------------------------------------------------------------------------------


In [24]:
# Calculating BLEU and ROUGE scores

rouge_score = evaluate.load('rouge')

print(rouge_score)


EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value('string'), 'references': List(Value('string'))}, {'predictions': Value('string'), 'references': Value('string')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
 