In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [None]:
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np


In [None]:
# Load the local JSON file as a dataset. Later cells read it through its
# "train" split and expect every record to carry 'input' and 'output' fields.
dataset = load_dataset('json', data_files='flan_t5_q2f_dataset.json')

In [None]:
# Shuffle with a fixed seed so the row order is the same on every run.
q2f_datasets=dataset.shuffle(seed=42)
# Bare expression: shows the shuffled result as the cell output.
q2f_datasets

In [None]:
# Carve fixed-size test and validation splits out of the single "train" split.
# An integer test_size is an absolute row count: 80 rows are held out for test,
# then 50 of the remaining rows for validation.
#
# The explicit seed makes the split reproducible. Later cells pick test rows by
# fixed position (9, 40, 50, 60); without a seed those positions would refer to
# different rows on every run.
datasets_train_test = q2f_datasets["train"].train_test_split(test_size=80, seed=42)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=50, seed=42)

# NOTE: this overwrites q2f_datasets["train"] in place, so re-running this cell
# on its own splits the already-reduced train split again. Re-run the shuffle
# cell first to start from the full data.
q2f_datasets["train"] = datasets_train_validation["train"]
q2f_datasets["validation"] = datasets_train_validation["test"]
q2f_datasets["test"] = datasets_train_test["test"]
q2f_datasets

In [None]:
# Positions (within the test split) of the examples to preview.
example_indices = [9, 40, 50]

# Horizontal separator: a run of 99 dashes.
dash_line = '-' * 99

for example_number, index in enumerate(example_indices, start=1):
    # Fetch the row once and print its two text fields.
    row = q2f_datasets['test'][index]
    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print('INPUT:')
    print(row['input'])
    print(dash_line)
    print('OUTPUT:')
    print(row['output'])
    print(dash_line)
    print()

In [None]:
# Checkpoint identifier; the tokenizer cell below loads from the same name.
model_name='google/flan-t5-base'

# Load the pretrained seq2seq model. It is later passed to get_peft_model()
# to attach the LoRA adapter.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Load the tokenizer for the same checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite its name this function does not print anything; it returns the
    report as a string for the caller to print.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals.
    """
    total = 0
    trainable = 0
    for _name, parameter in model.named_parameters():
        count = parameter.numel()
        total += count
        if parameter.requires_grad:
            trainable += count

    share = 100 * trainable / total
    return (
        f"trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {share:.2f}%"
    )

# Report the parameter counts of the base model, before any adapter is attached.
print(print_number_of_trainable_model_parameters(original_model))

In [None]:
# Round-trip a short phrase through the tokenizer as a sanity check.
sentence = "miles to kilometers"

sentence_encoded = tokenizer(sentence, return_tensors='pt')
# First (and only) sequence of token ids in the encoded batch.
token_ids = sentence_encoded["input_ids"][0]

sentence_decoded = tokenizer.decode(token_ids, skip_special_tokens=True)

print('ENCODED SENTENCE:')
print(token_ids)
print('\nDECODED SENTENCE:')
print(sentence_decoded)

In [None]:
# Zero-shot baseline: prompt the base model with each preview example and show
# its generation next to the reference output.
for i, index in enumerate(example_indices):
    example = q2f_datasets['test'][index]
    # Named input_text / expected_output so the builtin input() is not
    # shadowed at notebook scope for every later cell.
    input_text = example['input']
    expected_output = example['output']

    prompt = f"""
Input:

{input_text}

Output: ?
    """

    # Tokenize the constructed prompt and let the model generate a completion.
    inputs = tokenizer(prompt, return_tensors='pt')
    # Generate with original_model: it is the model this notebook loads, and no
    # variable named `model` is defined, so `model.generate` raised NameError
    # on a fresh kernel.
    generated = tokenizer.decode(
        original_model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True
    )

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT:\n{input_text}')
    print(dash_line)
    print(f'BASELINE OUTPUT:\n{expected_output}')
    print(dash_line)
    print(f'MODEL GENERATION - OUTPUT:\n{generated}\n')

# Few-shot prompting

In [None]:
def make_prompt(example_indices_full, example_index_to_translate):
    """Build a few-shot prompt from rows of the test split.

    Args:
        example_indices_full: positions in ``q2f_datasets['test']`` whose
            input/output pairs are included as solved examples, in the order
            given.
        example_index_to_translate: position in ``q2f_datasets['test']`` of
            the row whose input is appended last, with its output left blank
            for the model to complete.

    Returns:
        The solved examples followed by the open query, as one string.
    """
    test_split = q2f_datasets['test']

    # Each solved example ends with '{output}\n\n\n'. That stop sequence is
    # important for FLAN-T5; other models may prefer a different one.
    solved_examples = [
        f"\nInput:\n\n{test_split[idx]['input']}\n\nOutput:\n{test_split[idx]['output']}\n\n\n"
        for idx in example_indices_full
    ]

    # The final block repeats the layout but stops right after "Output:".
    open_query = f"\nInput:\n\n{test_split[example_index_to_translate]['input']}\n\nOutput:\n"

    return ''.join(solved_examples) + open_query

In [None]:
# Test-split positions of the solved examples and of the row left for the
# model to complete.
example_indices_full = [9, 40, 50]
example_index_to_translate = 60

few_shot_prompt = make_prompt(
    example_indices_full=example_indices_full,
    example_index_to_translate=example_index_to_translate,
)

print(few_shot_prompt)

In [None]:
# Reference output for the row the few-shot prompt asks the model to complete.
output = q2f_datasets['test'][example_index_to_translate]['output']

# Greedy decoding (do_sample=False) is deterministic. It replaces sampling with
# a near-zero temperature, which only approximated greedy decoding.
generation_config = GenerationConfig(max_new_tokens=50, do_sample=False)
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
# Generate with original_model: no variable named `model` is defined in this
# notebook, so `model.generate` raised NameError on a fresh kernel.
translate = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f'BASELINE OUTPUT:\n{output}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOTS:\n{translate}')

# Preprocess Datasets

In [None]:
def tokenize_function(example):
    """Tokenize a batch of records into model inputs and training labels.

    Args:
        example: a batch mapping with 'input' and 'output' keys, each holding
            a list of strings (the function is applied with ``batched=True``).

    Returns:
        The same mapping with three keys added:
        'input_ids' (the tokenized prompts), 'attention_mask' (marks the
        non-padding prompt positions) and 'labels' (the tokenized outputs with
        padding positions replaced by -100).
    """
    start_prompt = 'Input:\n\n'
    end_prompt = '\n\nOutput: '
    prompts = [start_prompt + text + end_prompt for text in example["input"]]

    model_inputs = tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
    example['input_ids'] = model_inputs.input_ids
    # Keep the mask: inputs are padded to max_length, and without it the model
    # treats the padding positions as real tokens.
    example['attention_mask'] = model_inputs.attention_mask

    labels = tokenizer(example["output"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    # -100 is the label index the loss ignores. Leaving the pad ids in place
    # would make the padding positions count toward the training loss.
    example['labels'] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    return example

# q2f_datasets holds three splits (train, validation, test). map() runs
# tokenize_function over every split in batches; the raw text columns are
# dropped afterwards.
tokenized_datasets = (
    q2f_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['input', 'output'])
)

In [None]:
# Bare expression: shows the tokenized splits as the cell output.
tokenized_datasets

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings, targeting the modules named "q" and "v".
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    r=32,  # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "v"],
    bias="none",
)

# Wrap the base model with the adapter, then report the trainable share.
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

<a name='3.2'></a>
### 3.2 - Train PEFT Adapter

Define training arguments and create `Trainer` instance.

In [None]:
# Timestamped output directory, so each run writes to its own folder.
output_dir = f'./peft-quary-fucntion-training-{int(time.time())}'

# NOTE(review): max_steps=1 is set together with num_train_epochs=5. Per the
# TrainingArguments documentation a positive max_steps overrides
# num_train_epochs, so this run would stop after a single step. Confirm that
# is intended.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    num_train_epochs=5,
    max_steps=1,
    logging_steps=1,
    auto_find_batch_size=True,
)

# Only a training set is supplied; no evaluation dataset is passed here.
peft_trainer = Trainer(
    args=peft_training_args,
    model=peft_model,
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Run training with the arguments configured on the trainer.
peft_trainer.train()

# Fixed local directory that receives both the trained model and the tokenizer.
peft_model_path="./peft-quary-fucntion-checkpoint-local"

# Save the trainer's model and the tokenizer side by side in that directory.
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)