## Play with dataset & few-shot learning

In [None]:
!pip install --upgrade pip
!pip install --disable-pip-version-check \
torch==1.13.1 \
torchdata==0.5.1 --quiet

! pip install \
transformers==4.27.2 \
datasets==2.11.0 --quiet

In [None]:
from datasets import load_datasets
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

In [None]:
hugging_face_dataset_name = 'knkarthick/dialogsum'
dataset = load_datasets(hugging_face_dataset_name)

In [4]:
# Print two records of the test split: each dialogue next to its human-written summary.
example_indices = [40, 100]
dash_line = "-"*150

for example_no, index in enumerate(example_indices, start=1):
    record = dataset["test"][index]
    print(
        dash_line,
        f"example  {example_no}",
        dash_line,
        "Input Dialog",
        record["dialogue"],
        dash_line,
        "Baseline Human summary",
        record["summary"],
        dash_line,
        sep="\n",
    )
    print()

'------------------------------------------------------------------------------------------------------------------------------------------------------'

In [None]:
# Load the seq2seq checkpoint and the tokenizer that belongs to it.
# The classmethod is spelled `from_pretrained` (with an underscore).
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
## check tokenizer
# Round-trip one sentence: text -> token ids -> text.
sentence = "what time is it, tom?"
# The keyword is `return_tensors` (plural). With 'pt' the ids come back as a
# batch of one sequence, which is why the code below indexes with [0].
sentence_encoded = tokenizer(sentence, return_tensors='pt')
sentence_decoded = tokenizer.decode(sentence_encoded['input_ids'][0], skip_special_tokens=True)

print("encoded sent")
print(sentence_encoded['input_ids'][0])

print("\ndecoded sent")
print(sentence_decoded)

In [None]:
# Baseline: hand the raw dialogue to the model without any instruction text.
for i, index in enumerate(example_indices):
    dialog = dataset["test"][index]["dialogue"]
    summary = dataset["test"][index]["summary"]
    # `return_tensors='pt'` (plural) so that generate() receives a tensor batch.
    inputs = tokenizer(dialog, return_tensors='pt')
    generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)[0]
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(dash_line)
    print("example ", i+1)
    print(dash_line)
    print("Input prompt")
    print(dialog)
    print(dash_line)
    print("Baseline Human summary")
    print(summary)
    print(dash_line)
    print("Model generation without prompt: ", output)
    print()

In [None]:
# Zero-shot: wrap each dialogue in an explicit summarization instruction.
for i, index in enumerate(example_indices):
    dialog = dataset["test"][index]["dialogue"]
    summary = dataset["test"][index]["summary"]
    prompt = f"""
    Summarize the following conversation.
    {dialog}

    Summary:
    """
    # Alternative template to experiment with:
    # prompt = f"""
    # Dialogue:
    # {dialog}
    #
    # What was going on?:
    # """
    # `return_tensors='pt'` (plural) so that generate() receives a tensor batch.
    inputs = tokenizer(prompt, return_tensors='pt')
    generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)[0]
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(dash_line)
    print("example ", i+1)
    print(dash_line)
    print("Input prompt")
    # Show what was actually sent to the model, instruction included.
    print(prompt)
    print(dash_line)
    print("Baseline Human summary")
    print(summary)
    print(dash_line)
    # This cell does use an instruction prompt, so label the output accordingly.
    print("Model generation zero shot: ", output)
    print()

In [None]:
def make_prompt(example_indices_full, example_index_to_summarize):
    """Build a one-/few-shot prompt from records of ``dataset["test"]``.

    Every index in ``example_indices_full`` contributes a solved example
    (dialogue followed by its summary). The dialogue at
    ``example_index_to_summarize`` is appended once at the end with the
    answer left open for the model to complete.

    Args:
        example_indices_full: iterable of test-split indices used as solved examples.
        example_index_to_summarize: test-split index of the dialogue to summarize.

    Returns:
        The assembled prompt string. With no example indices this is the
        open question alone.
    """
    prompt = ''
    # Solved examples: each dialogue together with its reference summary.
    for index in example_indices_full:
        dialog = dataset["test"][index]["dialogue"]
        summary = dataset["test"][index]["summary"]

        prompt += f"""
        Dialogue:

        {dialog}

        What was going on?
        {summary}
        """

    # The open question goes after ALL examples (outside the loop) and must be
    # looked up with the single target index, not with the list of examples.
    dialog = dataset["test"][example_index_to_summarize]["dialogue"]
    prompt += f"""
        Dialogue:

        {dialog}

        What was going on?
        """
    return prompt


In [None]:
# Arguments for the one-shot prompt, passed by keyword so each role is explicit.
example_indices_full = [40]
example_indices_to_summarize = 200
one_shot_prompt = make_prompt(
    example_indices_full=example_indices_full,
    example_index_to_summarize=example_indices_to_summarize,
)
one_shot_prompt

In [None]:
# One-shot inference: generate from `one_shot_prompt` and print the result next
# to the human summary.
# To sample instead of using the default decoding, build
# GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.2)
# and pass it to model.generate(..., generation_config=...).
summary = dataset['test'][example_indices_to_summarize]['summary']
# Store the encoding in `inputs` (not `input`, which shadows the builtin) so
# generate() really sees the one-shot prompt rather than a stale encoding
# left behind by an earlier cell.
inputs = tokenizer(one_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(inputs['input_ids'], max_new_tokens=50)[0],
    skip_special_tokens=True,
)

print(dash_line)
print(f"baseline human summary \n{summary}")
print(dash_line)
print(f"Model generation one shot \n {output}")

## Fine-tuning

In [None]:
!pip install --upgrade pip
!pip install --disable-pip-version-check \
torch==1.13.1 \
torchdata==0.5.1 --quiet

! pip install \
transformers==4.27.2 \
datasets==2.11.0\
evaluate==0.4.0 \
rouge_score==0.1.1 \
peft==0.3.0 --quiet

In [None]:
from datasets import load_datasets
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
hugging_face_dataset_name = 'knkarthick/dialogsum'
dataset = load_datasets(hugging_face_dataset_name)
dadaset

In [None]:
# Load the checkpoint that the following cells fine-tune, plus its tokenizer.
# The classmethod is spelled `from_pretrained` (with an underscore).
model_name = "google/flan-t5-base"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
def no_trainable_param(model):
    """Count the parameters of ``model``.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a boolean ``requires_grad`` attribute.

    Returns:
        A ``(trainable_param, all_param)`` tuple of element counts.
    """
    trainable_param = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        # The attribute is `requires_grad`; `require_grad` does not exist on parameters.
        if param.requires_grad:
            trainable_param += count
    return trainable_param, all_param

# Display the (trainable, total) parameter counts of `original_model`.
no_trainable_param(original_model)

In [None]:
def tokenize_function(example):
    """Attach ``input_ids`` and ``labels`` to a batch of records.

    Each entry of ``example['dialogue']`` is wrapped between a fixed
    instruction prefix and suffix and tokenized into ``input_ids``;
    ``example['summary']`` is tokenized into ``labels``. Both tokenizer calls
    use max-length padding, truncation and ``return_tensors='pt'``.

    Args:
        example: mapping with ``'dialogue'`` (iterable of strings) and
            ``'summary'`` entries; it is updated in place.

    Returns:
        The same ``example`` object with the two new keys set.
    """
    prefix = "Summarize the following convo \n\n"
    suffix = "\n\nSummary"
    tokenizer_options = dict(padding='max_length', truncation=True, return_tensors='pt')

    wrapped_dialogues = []
    for dialogue in example['dialogue']:
        wrapped_dialogues.append(prefix + dialogue + suffix)

    encoded_prompts = tokenizer(wrapped_dialogues, **tokenizer_options)
    example['input_ids'] = encoded_prompts.input_ids

    encoded_summaries = tokenizer(example['summary'], **tokenizer_options)
    example['labels'] = encoded_summaries.input_ids
    return example

# Tokenize in batches: `tokenize_function` iterates over example['dialogue'],
# so it needs batched input, and the keyword for that is `batched`.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Drop the raw columns; the text column is named 'dialogue', as read by
# `tokenize_function`.
tokenized_dataset = tokenized_dataset.remove_columns(['id', 'topic', 'dialogue', 'summary'])
# Keep only every 100th record.
tokenized_dataset = tokenized_dataset.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [None]:
# Full fine-tuning setup. The output directory name carries the current Unix
# time (whole seconds).
run_stamp = int(time.time())
out_dir = f"./dialog_summary_traing_{run_stamp}"

# NOTE(review): max_steps=1 presumably limits the run to a single step
# regardless of num_train_epochs — confirm against the Trainer documentation.
training_settings = dict(
    output_dir=out_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
)
train_args = TrainingArguments(**training_settings)

trainer = Trainer(
    model=original_model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [None]:
# Start training with the `trainer` configured in the previous cell.
trainer.train()

In [None]:
# ROUGE: load the 'rouge' metric through the `evaluate` library.
rouge = evaluate.load('rouge')

In [None]:
# Generate summaries for the first 10 test dialogues with `original_model`.
# The results are stored under the names the ROUGE computation reads:
# `human_baseline_summaries` and `original_model_summaries`.
dialogs = dataset["test"][0:10]["dialogue"]
human_baseline_summaries = dataset["test"][0:10]["summary"]

# Built once; it is identical for every dialogue.
generation_config = GenerationConfig(max_new_tokens=200)

original_model_summaries = []
for dialog in dialogs:
    prompt = f"""
    Summarize the following convo:
    {dialog}

    Summary:"""
    # `return_tensors` (plural) yields the tensor batch generate() needs; the
    # ids are moved to the model's device in case training relocated the model.
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(original_model.device)
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=generation_config)
    original_model_text_outputs = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_outputs)

In [None]:
# Score the generated summaries against the human references.
# The metric's keyword arguments are `predictions` and `references` (plural).
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    # Trim the references to the number of predictions so the lists pair up.
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
original_model_results

# PEFT

In [3]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings applied to the modules named 'q' and 'v'.
lora_config = LoraConfig(
    r=32,  # Rank
    lora_alpha=32,
    target_modules=['q', 'v'],
    lora_dropout=0.05,
    bias="none",
    # Enum members are upper-case: TaskType.SEQ_2_SEQ_LM.
    task_type=TaskType.SEQ_2_SEQ_LM,  # Flan-T5
)

# Wrap `original_model` with the adapter configuration.
peft_model = get_peft_model(original_model, lora_config)

[(9, 1), (8, 2), (7, 3)]

In [None]:
# PEFT training setup. The output directory name carries the current Unix time
# (whole seconds).
peft_run_stamp = int(time.time())
out_dir = f"./peft_dialog_summary_traing_{peft_run_stamp}"

peft_training_settings = dict(
    output_dir=out_dir,
    learning_rate=1e-3,
    auto_find_batch_size=True,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1,
)
peft_train_args = TrainingArguments(**peft_training_settings)

# Only a training split is supplied to this trainer.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_train_args,
    train_dataset=tokenized_dataset['train'],
)

In [None]:
# Train via `peft_trainer`, then save the trained model and the tokenizer into
# the same local directory.
peft_trainer.train()

peft_model_path = "./peft-dialog-summary-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
# Reload: put the saved PEFT adapter on top of a freshly loaded base model
# (bfloat16 weights, adapter marked as not trainable).
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-base",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(
    peft_model_base,
    "./peft-dialog-summary-checkpoint-local",
    torch_dtype=torch.bfloat16,
    is_trainable=False,
)

In [None]:
# Inferences