### Lab2 - Fine tuning an LLM with instruction aka supervised learning 
We provide the model with instructions which comprise of dialogue - summary pairs, to fully or partiall fine tune its weights to perform better on the desired data set. 

In [73]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, Trainer, TrainingArguments
import torch
import time
import evaluate
import numpy as np
import pandas as pd

In [74]:
dash_line = "_".join("" for _ in range(100))

#### Lets load and preview the dataset

In [75]:
dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

#### Load the LLM 

In [87]:
# NOTE: This is a smaller model than its larger sibling "google/flan-t5-base" which has 3.2 times more parameters. 
# We will use this instead of "google/flan-t5-base" used in the coursera lab, to avoid memory shortage
# We will also use mps - Apple’s Metal Performance Shaders which is a high-performance framework for GPU-accelerated computations on iOS and macOS.

model_name = "google/flan-t5-small"

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

#### Analyze the model size and number of parameters

In [88]:
def print_trainable_parameters(model):
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Total model parameters: {total_params}")
    print(f"Trainable model parameters: {trainable_params}")
    print(f"Trainable parameters percentage: {100 * trainable_params / total_params:.2f}%")

print_trainable_parameters(original_model)


Total model parameters: 76961152
Trainable model parameters: 76961152
Trainable parameters percentage: 100.00%


#### Perform zero-shot inference 

In [89]:
index = 200

prompt = f"""Summarize the following dialogue:
{dataset["test"][index]["dialogue"]}
"""
encoded_prompt = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

generation_config = GenerationConfig(
    num_beams=1,
    max_new_tokens=64,
)

output = original_model.generate(
    encoded_prompt,
    generation_config=generation_config,
)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Input Prompt: {prompt}")
print(dash_line)
print(f"Human baseline summary: {dataset['test'][index]['summary']}")
print(dash_line)
print(f"Generated summary: {decoded_output}")



Input Prompt: Summarize the following dialogue:
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

___________________________________________________________________________________________________
Human baseline summary: #Person1# teaches #Person2# how to upgrade software and har

#### Tokenize the dataset into an explicit instruction data set with dialog-summary pair 
Put the tokenized prompt under the key "input_ids" and tokenized summary under the key "labels"

In [None]:
def tokenize_function(example):
    prompt_start = "Summarize the following conversation \n:"
    prompt_end = "Summary: \n"

    prompt = [prompt_start + dialogue + prompt_end for dialogue in example["dialogue"]]
    tokenized_prompt = tokenizer(prompt, padding="max_length", truncation=True, max_length=512).input_ids
    tokenized_summary = tokenizer(example["summary"], padding="max_length", truncation=True, max_length=512).input_ids

    example['input_ids'] = tokenized_prompt
    example['labels'] = tokenized_summary

    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["dialogue", "summary", "id", "topic"])

print(f"Tokenized dataset: {tokenized_dataset}")


Map:   0%|          | 0/12460 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
})


#### Take only a small subsample from tokenized_dataset to fine tune the model to save time

In [100]:
tokenized_dataset = tokenized_dataset.filter(lambda example, index: index % 50 == 0, with_indices=True)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Check the shapes of all the three parts of the dataset

In [101]:
print("Shapes of the tokenized dataset:")
print(f"Train shape: {tokenized_dataset['train'].shape}")
print(f"Validation shape: {tokenized_dataset['validation'].shape}")
print(f"Test shape: {tokenized_dataset['test'].shape}")

tokenized_dataset

Shapes of the tokenized dataset:
Train shape: (250, 3)
Validation shape: (10, 3)
Test shape: (30, 3)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30
    })
})

In [102]:
fine_tuned_model_dir = "./fine_tuned_model"

training_args = TrainingArguments(
    output_dir=fine_tuned_model_dir,
    learning_rate=3e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=1,
    # load_best_model_at_end=True,
    # max_steps=1000,
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

trainer.train()

Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


TrainOutput(global_step=320, training_loss=0.0, metrics={'train_runtime': 280.5946, 'train_samples_per_second': 8.91, 'train_steps_per_second': 1.14, 'total_flos': 464726261760000.0, 'train_loss': 0.0, 'epoch': 10.0})

In [93]:
# Load the fine-tuned model 
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_model/checkpoint-625", torch_dtype=torch.float16).to(device)

#### Evaluate the Model Qualitatively (Human Evaluation)

In [103]:
index = 200

prompt = f"""Summarize the following dialogue:
{dataset["test"][index]["dialogue"]}
"""
encoded_prompt = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

original_model_output = original_model.generate(encoded_prompt, generation_config=generation_config)
original_decoded_output = tokenizer.decode(original_model_output[0], skip_special_tokens=True)

instruct_model_output = instruct_model.generate(encoded_prompt, generation_config=generation_config)
instruct_decoded_output = tokenizer.decode(instruct_model_output[0], skip_special_tokens=True)

print(f"Input Prompt: {prompt}")
print(dash_line)
print(f"Human baseline summary: {dataset['test'][index]['summary']}")
print(dash_line)
print(f"Original model generated summary:")
print(dash_line)
print(f"{original_decoded_output}")
print(dash_line)
print(f"Fine-tuned model generated summary:")
print(dash_line)
print(f"{instruct_decoded_output}")


Input Prompt: Summarize the following dialogue:
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

___________________________________________________________________________________________________
Human baseline summary: #Person1# teaches #Person2# how to upgrade software and har