In [None]:
! rm -r /content/SearchQuery2FuncCall
!git clone https://github.com/XiaoLIUau/SearchQuery2FuncCall.git

In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1

%pip install --upgrade accelerate\
             --upgrade huggingface_hub

%pip install \
    bitsandbytes>=0.39.0 \
    transformers==4.28.1 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet


In [None]:
import os
import time
import pandas as pd
import torch
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig, TrainingArguments, Trainer

from peft import PeftModel, LoraConfig, get_peft_model, TaskType

In [None]:
from SearchQuery2FuncCall.setup_dataset import text2json, load_n_process_data

# Convert the raw text dataset shipped with the cloned repository.
# NOTE(review): presumably text2json writes /content/q2f_dataset.json, which is
# what gets loaded below — confirm against setup_dataset.
text2json('/content/SearchQuery2FuncCall/Dataset.txt')
# Alternative dataset file, kept for reference:
# q2f_datasets = load_n_process_data('/content/non_search_examples.json')
q2f_datasets = load_n_process_data('/content/q2f_dataset.json')
# Bare expression so the notebook displays the loaded dataset object.
q2f_datasets

In [None]:
""" # Get model api key """
def load_api_key_from_file(file_path):
    """Read an API key from a text file.

    Args:
        file_path: Path of the file holding the key.

    Returns:
        The file's full contents with leading and trailing whitespace removed.
    """
    with open(file_path, 'r') as key_file:
        raw_text = key_file.read()
    return raw_text.strip()

# Read the token from a local file and expose it as an environment variable,
# so the token text itself never appears in the notebook source.
os.environ["HUGGINGFACE_TOKEN"] = load_api_key_from_file('/content/api_key_huggingface.txt')

# Log the Hugging Face CLI in with that token; $HUGGINGFACE_TOKEN refers to the
# environment variable set on the line above.
!huggingface-cli login --token $HUGGINGFACE_TOKEN

In [None]:
# Release cached GPU memory left over from earlier cells before loading the model.
torch.cuda.empty_cache()

# 8-bit (LLM.int8) quantization. BitsAndBytesConfig has no `bnb_8bit_*` options:
# double quantization, the "nf4" quant type and the compute dtype are
# `bnb_4bit_*` settings that only apply to 4-bit loading, so they are not
# passed here.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # llm_int8_enable_fp32_cpu_offload=True
)

# Load model directly
model_name = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# NOTE(review): this makes padding and end-of-sequence share one token id;
# downstream code must not rely on the pad id alone to tell them apart.
tokenizer.pad_token = tokenizer.eos_token
# Truncate from the left so the user query at the end of the prompt survives
# when the instruction + examples exceed the maximum length.
tokenizer.truncation_side = 'left'
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # TODO: verify this actually reduces runtime memory together with 8-bit loading
    device_map='auto',
    quantization_config=bnb_config,
)

# PEFT setup: LoRA adapters on the attention query ("q") and value ("v") projections.
lora_config = LoraConfig(
    r=32,  # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM  # FLAN-T5
)


In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a three-line summary of a model's parameter counts.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, param)``
            pairs, each ``param`` providing ``numel()`` and ``requires_grad``.

    Returns:
        A string with the trainable count, the total count, and the trainable
        share as a percentage with two decimals.
    """
    parameters = [param for _, param in model.named_parameters()]
    all_model_params = sum(param.numel() for param in parameters)
    trainable_model_params = sum(
        param.numel() for param in parameters if param.requires_grad
    )
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
    )

# Parameter summary for the base model as loaded above.
print(print_number_of_trainable_model_parameters(original_model))

In [None]:
# define prompt format
def create_prompt(input):
    """Build the few-shot routing prompt for a single search query.

    The prompt consists of a fixed instruction block with worked examples,
    followed by the query wrapped in curly quotes and an open ``Output:`` slot
    for the model to complete.

    Args:
        input: The raw search query text.

    Returns:
        The complete prompt string, ending with a newline.
    """
    start_prompt = 'Input:'
    end_prompt = ', Output:'
    instruction = f"""Instruction: Given a search query, then route to different backend components based on the search intent.
1. If the search is about unit conversion, return API function UnitConvert(SourceUnit, TargetUnit, SourceValue).
2. If the search is about calculation, return API function Calculate(Equation).
3. If the search is about other search intent, return API function Search(). 
* For unit conversion: common unit conversion in length, mass, time, area, speed, temperature, volume should be covered. And it should be consistent for the same unit throughout. E.g. it should always be “foot”, it cannot be “feet” or “ft” in API calls.
* For calculation: common operation such as +, -, *, /, pow, log, ln, exp, tan(h), sin(h), cos(h), factorial should be covered. And it should be consistent for the same operation throughout. E.g. it should always be “ * ”, it cannot be “x” or “X” in API calls.
Handle input queries in different language styles. Cover common unit conversion and calculation operations.

Examples:
{start_prompt} “ft to cm” {end_prompt} “UnitConvert(SourceUnit:foot, TargetUnit:centimeter,
SourceValue:1)”
{start_prompt} “how many ounces in 5.8 kilograms” {end_prompt} “UnitConvert(SourceUnit:kilogram,
TargetUnit:ounce, SourceValue:5.8)”
{start_prompt} “two to the power of 10” {end_prompt} “Calculate(2^10)”
{start_prompt} “2001-1989” {end_prompt} “Calculate(2001-1989)”
{start_prompt} “what is chatgpt” {end_prompt} “Search()”
{start_prompt} “primary year 1 maths calculation checklist” {end_prompt} “Search()”
{start_prompt} “what are different length units” {end_prompt} “Search()”
{start_prompt} “Natural logarithm of -3/18” {end_prompt} “Calculate(ln(-3/18))”

"""
    # The query must be interpolated with an f-string; a plain string literal
    # here would put the text "{input}" into every prompt instead of the query.
    prompt = instruction + start_prompt + f'“{input}”' + end_prompt + '\n'
    return prompt

In [None]:

def model_generate(original_model, inputs):
    """Greedily decode up to 100 new tokens for an already tokenized prompt.

    Args:
        original_model: Model exposing ``generation_config``, ``parameters()``
            and ``generate(...)``.
        inputs: Tokenizer output exposing ``input_ids`` and ``attention_mask``
            tensors.

    Returns:
        Whatever ``original_model.generate`` returns for these inputs.
    """
    import copy

    # Work on a copy so repeated calls never mutate the model's own shared
    # generation settings.
    generation_config = copy.deepcopy(original_model.generation_config)
    generation_config.max_new_tokens = 100
    # Deterministic decoding, stated explicitly: temperature / top_p only take
    # effect when sampling is enabled, so they are not set here.
    generation_config.do_sample = False
    generation_config.num_return_sequences = 1
    generation_config.pad_token_id = tokenizer.eos_token_id
    generation_config.eos_token_id = tokenizer.eos_token_id

    # Tokenizer output lives on the CPU; place it on the device holding the
    # model's first parameters before generating.
    device = next(original_model.parameters()).device
    return original_model.generate(
        input_ids=inputs.input_ids.to(device),
        attention_mask=inputs.attention_mask.to(device),
        generation_config=generation_config,
    )

In [None]:
def tokenize_function(example):
    """Tokenize a batch of query/API-call pairs for seq2seq training.

    Args:
        example: Batch mapping with an ``"input"`` list of queries and an
            ``"output"`` list of target API-call strings.

    Returns:
        The same mapping with ``input_ids``, ``attention_mask`` and ``labels``
        added.
    """
    prompts = [create_prompt(query) for query in example["input"]]
    encoded_prompts = tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
    encoded_targets = tokenizer(example["output"], padding="max_length", truncation=True, return_tensors="pt")

    example['input_ids'] = encoded_prompts.input_ids
    # Keep the mask so the encoder does not attend to padding positions.
    example['attention_mask'] = encoded_prompts.attention_mask
    # Exclude padded label positions from the loss (-100 is the ignore index).
    # The attention mask is used rather than the pad token id because pad and
    # EOS share an id in this notebook, and the real EOS must stay in the labels.
    example['labels'] = encoded_targets.input_ids.masked_fill(
        encoded_targets.attention_mask == 0, -100
    )

    return example

# Shuffle, then tokenize in batches (batched=True hands tokenize_function lists of rows).
# NOTE(review): assumes q2f_datasets holds several splits so that map() covers all of
# them; later cells use 'train' and 'test' — confirm. shuffle() is called without a
# seed, so row order differs between runs.
tokenized_datasets = q2f_datasets.shuffle().map(tokenize_function, batched=True)
# Drop the raw text columns now that their tokenized form has been added.
tokenized_datasets = tokenized_datasets.remove_columns(['input', 'output'])
# tokenized_datasets = tokenized_datasets.remove_columns(['Search', 'input', 'output'])
# Bare expression so the notebook displays the resulting dataset object.
tokenized_datasets

In [None]:
# Wrap the base model with the LoRA configuration defined earlier.
# NOTE(review): get_peft_model may modify original_model in place — confirm before
# reusing original_model as an untouched baseline.
peft_model = get_peft_model(original_model,
                            lora_config)
# Same parameter summary as before, now for the adapter-wrapped model.
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
# Timestamped output directory so each run writes to its own folder.
output_dir = f'./peft-query-function-training-{str(int(time.time()))}'

# NOTE(review): both num_train_epochs and max_steps are set. Per the Hugging Face
# TrainingArguments documentation a positive max_steps takes precedence, so
# training presumably stops after 500 steps — confirm this is the intended budget.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=40,
    logging_steps=30,
    max_steps=500
)

# Only a training split is supplied; no evaluation dataset is passed to the Trainer.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Run the fine-tuning loop configured in the previous cell.
peft_trainer.train()

# Fixed local path; the reload cell below reads the model and tokenizer back from here.
peft_model_path="./peft-query-function-checkpoint-local"

# Save the trained model held by the trainer and the tokenizer side by side.
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
# Reload the saved adapter on top of the base model for inference only.
peft_model = PeftModel.from_pretrained(original_model,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)
# Switch to eval mode: after training the modules are still in train mode, and
# active dropout would make generation non-deterministic.
peft_model.eval()
# torch_dtype is a model-loading option and has no meaning for a tokenizer,
# so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
def extractOutputString(input_string):
    """Normalise a generated API-call string.

    All whitespace is removed, then at most one quote character (curly double,
    straight single or straight double) is stripped from the start and at most
    one from the end.

    Args:
        input_string: Raw decoded model output.

    Returns:
        The compacted string without its outer quote characters.
    """
    quote_chars = ('“', '”', "'", '"')
    compact = "".join(input_string.split())
    # Drop a single leading quote, if present.
    start = 1 if compact.startswith(quote_chars) else 0
    trimmed = compact[start:]
    # Drop a single trailing quote, checked after the leading one is gone.
    if trimmed.endswith(quote_chars):
        trimmed = trimmed[:-1]
    return trimmed

In [None]:
# Indices into the test split used for a quick qualitative check.
example_indices = [9, 40, 50]

# Horizontal rule for readability (99 dashes, the same width as before).
dash_line = '-' * 99

# `query` / `expected` are used instead of `input` / `output` so the notebook
# does not shadow the builtin `input` at module level.
for i, index in enumerate(example_indices):
    query = q2f_datasets['test'][index]['input']
    expected = q2f_datasets['test'][index]['output']

    encoded = tokenizer(create_prompt(query), return_tensors='pt')
    # Place the prompt tensors on the same device as the model before generating.
    encoded = encoded.to(next(peft_model.parameters()).device)
    generated = tokenizer.decode(
        model_generate(peft_model, encoded)[0],
        skip_special_tokens=True
    )

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)
    print(f'INPUT:\n{query}')
    print(dash_line)
    print(f'BASELINE OUTPUT:\n{expected}')
    print(dash_line)
    print(f'MODEL GENERATION - OUTPUT:\n{generated}\n')

In [None]:

# Evaluate on a 20-example slice of the test split (swap in the commented lines
# below to run on the whole split).
inputs = q2f_datasets['test'][50:70]['input']
outputs = q2f_datasets['test'][50:70]['output']
# inputs = q2f_datasets['test']['input']
# outputs = q2f_datasets['test']['output']

API_outputs = []

for query in inputs:
    # Each query must be turned into a prompt and tokenized individually;
    # model_generate expects tokenizer output, not the raw list of strings.
    encoded = tokenizer(create_prompt(query), return_tensors='pt')
    API_output = tokenizer.decode(
        model_generate(peft_model, encoded)[0],
        skip_special_tokens=True
    )
    # Normalise whitespace and outer quotes in the prediction.
    API_outputs.append(extractOutputString(API_output))

# Side-by-side table of query, reference and prediction.
zipped_summaries = list(zip(inputs, outputs, API_outputs))

df = pd.DataFrame(zipped_summaries, columns = ['inputs', 'outputs', 'API_outputs'])
df

In [None]:
# Load the ROUGE metric through the evaluate library.
rouge = evaluate.load('rouge')

# Score predictions against references; the reference list is sliced to the number
# of predictions so both lists have the same length.
# NOTE(review): predictions were passed through extractOutputString, the
# references were not — confirm this asymmetry is intended.
API_model_results = rouge.compute(
    predictions=API_outputs,
    references=outputs[0:len(API_outputs)],
    use_aggregator=True,
    use_stemmer=True,
)

print('API MODEL ROUGE SCORES:')
print(API_model_results)

In [None]:
# Load the BLEU metric through the evaluate library.
bleu = evaluate.load('bleu')

# Reuses the name API_model_results, replacing the ROUGE results from the cell above.
# Unlike the ROUGE cell, the references are passed without slicing to
# len(API_outputs).
API_model_results = bleu.compute(
    predictions=API_outputs,
    references=outputs,
)

print('API MODEL BLEU SCORES:')
print(API_model_results)