# Install and downloads

In [8]:
!rm -r /content/SearchQuery2FuncCall
!git clone https://github.com/XiaoLIUau/SearchQuery2FuncCall.git
!pip install -r /content/SearchQuery2FuncCall/requirements.txt --quiet

Cloning into 'SearchQuery2FuncCall'...
remote: Enumerating objects: 127, done.[K
remote: Counting objects: 100% (127/127), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 127 (delta 77), reused 88 (delta 40), pack-reused 0[K
Receiving objects: 100% (127/127), 84.67 KiB | 337.00 KiB/s, done.
Resolving deltas: 100% (77/77), done.
[0m

In [2]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1

%pip install --upgrade accelerate\
             --upgrade huggingface_hub

%pip install \
    bitsandbytes>0.37.2 \
    transformers==4.28.1 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 \
    trl==0.4.7 --quiet


Collecting pip
  Downloading pip-23.3-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3
Collecting torch==1.13.1
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchdata==0.5.1
  Downloading torchdata-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==1.13.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux

# Setup

In [2]:
# load libraries
import os
import time
import pandas as pd
import torch
import evaluate
import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig, TrainingArguments, Trainer
from peft import PeftModel, LoraConfig, get_peft_model, TaskType
from SearchQuery2FuncCall.setup_dataset import text2json, load_n_process_data
from datasets import DatasetDict, Dataset
from trl import SFTTrainer

In [3]:
### Load datasets
text2json('/content/SearchQuery2FuncCall/Dataset.txt')
# q2f_datasets = load_n_process_data('/content/non_search_examples.json')
q2f_datasets = load_n_process_data('/content/q2f_dataset.json')
print(q2f_datasets)


# # Load Huggingface API key
# Here we use Huggingface models.
# Note: Please upload a text file that contains your Huggingface API key to the /content folder.
# >Name the file ***'api_key_huggingface.txt'***

### Access Huggingface for loading model
""" # Get model api key """
def load_api_key_from_file(file_path):
  """Read an API key from a text file.

  Args:
      file_path: path of the text file holding the key.

  Returns:
      The file's full contents with leading/trailing whitespace removed.
  """
  with open(file_path, 'r') as key_file:
      return key_file.read().strip()
# Setting a new environment variable
os.environ["HUGGINGFACE_TOKEN"] = load_api_key_from_file('/content/api_key_huggingface.txt')
!huggingface-cli login --token $HUGGINGFACE_TOKEN


### Function to print the number of trainable model parameters
def print_number_of_trainable_model_parameters(model):
  """Summarise how many of a model's parameters are trainable.

  Despite the name, nothing is printed: the summary is returned as a string.

  Args:
      model: object exposing ``named_parameters()`` that yields
          ``(name, param)`` pairs, where each ``param`` has ``numel()`` and
          ``requires_grad``.

  Returns:
      A three-line string with the trainable count, the total count and the
      trainable percentage (two decimals).
  """
  # Collect (element count, trainable flag) once per parameter.
  sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
  all_model_params = sum(count for count, _ in sizes)
  trainable_model_params = sum(count for count, trainable in sizes if trainable)
  return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


### Setup for Model Generation
# define prompt format
def create_prompt(input):
  """Build the inference prompt for one search query.

  The prompt is the fixed routing instruction, followed by the query wrapped
  in curly quotes after ``Input:``, and ends with ``, Output:`` so the model
  continues with the API call.

  Args:
      input: the raw search query text.

  Returns:
      The full prompt string.
  """
  instruction_lines = (
      'Instruction: Given a search query, then route to different backend components based on the search intent.',
      '1. If the search is about unit conversion, return API function UnitConvert(SourceUnit, TargetUnit, SourceValue).',
      '2. If the search is about calculation, return API function Calculate(Equation).',
      '3. If the search is about other search intent, return API function Search().',
      '* For unit conversion: common unit conversion in length, mass, time, area, speed, temperature, volume should be covered. And it should be consistent for the same unit throughout. E.g. it should always be “foot”, it cannot be “feet” or “ft” in API calls.',
      '* For calculation: common operation such as +, -, *, /, pow, log, ln, exp, tan(h), sin(h), cos(h), factorial should be covered. And it should be consistent for the same operation throughout. E.g. it should always be “ * ”, it cannot be “x” or “X” in API calls.',
      'Handle input queries in different language styles. Cover common unit conversion and calculation operations.',
      '',  # blank line separating the instruction from the query
      '',  # makes the instruction end with a newline
  )
  instruction = '\n'.join(instruction_lines)
  return f'{instruction}Input:“{input}”, Output:'

# Generate response in tokens with given model and tokenized input
def model_generate(input,model,tokenizer):
  """Run ``model.generate`` on an already-tokenized prompt.

  Args:
      input: tokenizer output exposing ``input_ids`` and ``attention_mask``.
      model: model exposing ``generation_config`` and ``generate``.
      tokenizer: tokenizer whose ``eos_token_id`` is used for both padding
          and end-of-sequence.

  Returns:
      Whatever ``model.generate`` returns.
  """
  # This is the model's own config object (not a copy), so the overrides
  # below remain set on the model after the call.
  config = model.generation_config
  overrides = {
      'max_new_tokens': 100,
      # Near-zero temperature, presumably to make decoding effectively
      # deterministic -- NOTE(review): only matters if sampling is enabled.
      'temperature': 0.00000000000001,
      'top_p': 0.9,
      'num_return_sequences': 1,
      'pad_token_id': tokenizer.eos_token_id,
      'eos_token_id': tokenizer.eos_token_id,
  }
  for option, value in overrides.items():
      setattr(config, option, value)
  return model.generate(
      input_ids=input.input_ids,
      attention_mask=input.attention_mask,
      generation_config=config,
  )

# Generate Text with selected model and input text
def generated_text(input,model,tokenizer):
  """Generate and decode the model's text for one search query.

  Args:
      input: the raw search query.
      model: model passed through to ``model_generate``.
      tokenizer: callable tokenizer that also provides ``decode``.

  Returns:
      The first generated sequence decoded with special tokens skipped.
  """
  encoded = tokenizer(create_prompt(input), return_tensors='pt')
  sequences = model_generate(encoded,model,tokenizer)
  # Only the first returned sequence is decoded.
  return tokenizer.decode(sequences[0], skip_special_tokens=True)

# Post generation string processing
def extractOutputString(input_string,output_string):
  """Pull the predicted API call for ``input_string`` out of decoded model text.

  Searches ``output_string`` for ``Input: “<input_string>” , Output: <answer>``
  and keeps ``<answer>`` up to the first ``[/`` or the end of that line. When
  the pattern is not found, the whole ``output_string`` is cleaned instead.

  Args:
      input_string: the raw query that was placed in the prompt.
      output_string: decoded text returned by the model.

  Returns:
      The answer with all whitespace removed and every wrapping quote,
      bracket or period character stripped from both ends.
  """
  import re
  # Use regular expressions to find the matching output for the input query
  output_match = re.search(rf'Input:\s*“{re.escape(input_string)}”\s*,\s*Output:\s*(.*?)(\[\/|$)', output_string, flags=re.MULTILINE)

  if output_match:
      output_string = output_match.group(1)

  # Remove whitespace first: a trailing space (e.g. before "[/INST]") would
  # otherwise hide the closing quote from the stripping step below.
  output_string = "".join(output_string.split())

  # Strip wrapper characters from both ends, repeatedly, so nested wrappers
  # such as ["..."] are removed symmetrically.
  prefixes = ['“', '”', "'", '"', '[', '.', ']']
  return output_string.strip(''.join(prefixes))


Saved 340 examples to 'q2f_dataset.json'.
Separated 87 Search() examples to 'search_examples.json'.
Separated 253 non-Search() examples to 'non_search_examples.json'.
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-7728f343beb69205/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-7728f343beb69205/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 210
    })
    valid: Dataset({
        features: ['input', 'output'],
        num_rows: 50
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 80
    })
})
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Setup for testing model output and evaluating the dataset using ROUGE and BLEU scores

In [4]:
def generate_samples(example_indices,dataset,model,tokenizer):
    """Print reference and generated outputs for selected test examples.

    Args:
        example_indices: indices into ``dataset['test']``.
        dataset: mapping with a ``'test'`` split whose rows have ``'input'``
            and ``'output'`` fields.
        model: model forwarded to ``generated_text``.
        tokenizer: tokenizer forwarded to ``generated_text``.

    Returns:
        None. Everything is written to stdout.
    """
    dash_line = '-' * 99  # same separator width as joining 100 empty strings

    for number, index in enumerate(example_indices, start=1):
        row = dataset['test'][index]
        input = row['input']
        output = row['output']

        raw_text = generated_text(input,model,tokenizer)
        generated = extractOutputString(input,raw_text)

        print(dash_line)
        print('Example ', number)
        print(dash_line)
        print(f'INPUT:\n{input}')
        print(dash_line)
        print(f'BASELINE OUTPUT:\n{output}')
        print(dash_line)
        print(f'PEFT MODEL GENERATION - OUTPUT:\n{generated}\n')


def generate_dataset(start_index,dataset,model,tokenizer):
    """Generate outputs for the test split and tabulate them.

    Args:
        start_index: first row of ``dataset['test']`` to process; rows from
            there to the end of the split are used.
        dataset: mapping with a ``'test'`` split that supports ``len`` and
            slicing into a dict of columns (``'input'`` and ``'output'``).
        model: model forwarded to ``generated_text``.
        tokenizer: tokenizer forwarded to ``generated_text``.

    Returns:
        A DataFrame with columns ``Inputs``, ``Outputs`` and
        ``Outputs_generated``.
    """
    test_split = dataset['test']
    # The stop index may run past the end of the split; slicing clamps it.
    stop_index = start_index + len(test_split)
    batch = test_split[start_index:stop_index]
    inputs = batch['input']
    outputs = batch['output']

    outputs_gen = []
    for position, query in enumerate(inputs):
        # Lightweight progress marker every ten examples.
        if position % 10 == 0:
            print(position)
        raw_text = generated_text(query,model,tokenizer)
        outputs_gen.append(extractOutputString(query,raw_text))

    return pd.DataFrame(
        list(zip(inputs, outputs, outputs_gen)),
        columns = ['Inputs', 'Outputs', 'Outputs_generated'],
    )


def evaluate_generations(outputs_ref,outputs_gen):
    """Print ROUGE and BLEU scores of generated outputs against references.

    Args:
        outputs_ref: reference output strings.
        outputs_gen: generated output strings, aligned with ``outputs_ref``.

    Returns:
        None. Both score dictionaries are printed to stdout.
    """
    # ROUGE, aggregated over all examples, with stemming enabled.
    rouge_scores = evaluate.load('rouge').compute(
        predictions=outputs_gen,
        references=outputs_ref,
        use_aggregator=True,
        use_stemmer=True,
    )
    print('MODEL ROUGE SCORES:')
    print(rouge_scores)

    # BLEU with the metric's default settings.
    bleu_scores = evaluate.load('bleu').compute(
        predictions=outputs_gen,
        references=outputs_ref,
    )
    print('MODEL BLEU SCORES:')
    print(bleu_scores)

# Setup for datasets for training

In [12]:
### Prepare for training dataset

def create_prompt_training(input,output):
    """Build one supervised training example as a single text string.

    The text is ``<s>[INST] `` + the routing instruction + the quoted query
    after ``Input:`` + the quoted target after ``, Output:`` + ``[/INST]``.

    Args:
        input: the raw search query.
        output: the target API call for that query.

    Returns:
        The full training text.
    """
    instruction = """Instruction: Given a search query, then route to different backend components based on the search intent.
1. If the search is about unit conversion, return API function UnitConvert(SourceUnit, TargetUnit, SourceValue).
2. If the search is about calculation, return API function Calculate(Equation).
3. If the search is about other search intent, return API function Search().
* For unit conversion: common unit conversion in length, mass, time, area, speed, temperature, volume should be covered. And it should be consistent for the same unit throughout. E.g. it should always be “foot”, it cannot be “feet” or “ft” in API calls.
* For calculation: common operation such as +, -, *, /, pow, log, ln, exp, tan(h), sin(h), cos(h), factorial should be covered. And it should be consistent for the same operation throughout. E.g. it should always be “ * ”, it cannot be “x” or “X” in API calls.
Handle input queries in different language styles. Cover common unit conversion and calculation operations.

"""
    # The target sits inside the [INST] ... [/INST] span, so a trained model's
    # answer is followed by "[/INST]".
    return f'<s>[INST] {instruction}Input:“{input}”, Output:“{output}”[/INST]'

# Examples:
# {input_prompt}“ft to cm”{output_prompt}“UnitConvert(SourceUnit:foot, TargetUnit:centimeter,
# SourceValue:1)”
# {input_prompt}“how many ounces in 5.8 kilograms”{output_prompt}“UnitConvert(SourceUnit:kilogram,
# TargetUnit:ounce, SourceValue:5.8)”
# {input_prompt}“two to the power of 10”{output_prompt}“Calculate(2^10)”
# {input_prompt}“2001-1989” {output_prompt}“Calculate(2001-1989)”
# {input_prompt}“what is chatgpt”{output_prompt}“Search()”
# {input_prompt}“primary year 1 maths calculation checklist”{output_prompt}“Search()”
# {input_prompt}“what are different length units”{output_prompt}“Search()”
# {input_prompt}“Natural logarithm of -3/18”{output_prompt}“Calculate(ln(-3/18))”
# {input_prompt}“what is tan of 3/4”{output_prompt}“Calculate(tan(3/4))”


## Add text variable to datasets
def create_text_datasets(example):
    """Map one dataset row to its training text.

    Args:
        example: row with ``'input'`` and ``'output'`` fields.

    Returns:
        A dict with the single key ``"text"`` holding the training prompt.
    """
    return {"text": create_prompt_training(example['input'], example['output'])}
def process_dataset_dict(dataset_dict, processing_function):
    """Apply ``processing_function`` to every split via ``map``.

    Args:
        dataset_dict: mapping of split name to a dataset exposing ``map``.
        processing_function: per-example function handed to ``map``.

    Returns:
        A new ``DatasetDict`` with the same split names and mapped splits.
    """
    return DatasetDict({
        split_name: split.map(processing_function)
        for split_name, split in dataset_dict.items()
    })
# Concatenate "input" and "output" using the custom function
q2f_datasets = process_dataset_dict(q2f_datasets, create_text_datasets)
print(q2f_datasets)


# Iterate over your training set and calculate the length of each sequence.
# For example, you can use the following code:
def get_max_seq_length(dataset):
    """Return the longest training text length plus a margin of 100.

    The length is the character count of each row's ``'text'`` field
    (``len`` of the string), not a token count.

    Args:
        dataset: mapping with a ``'train'`` split whose rows have ``'text'``.

    Returns:
        The maximum ``len(text)`` over the training rows (0 when there are
        none) plus 100.
    """
    longest = max((len(row['text']) for row in dataset['train']), default=0)
    return longest+100
max_seq_length = get_max_seq_length(q2f_datasets)


### Prepare model for training
# Freeze all parameters
def freeze_all_parameters(model):
    """Disable gradients on every parameter and print the resulting counts.

    Args:
        model: object exposing ``parameters()``; modified in place.

    Returns:
        None.
    """
    for weight in model.parameters():
        weight.requires_grad = False
    print('\n### After freeze all parameters: ###')
    summary = print_number_of_trainable_model_parameters(model)
    print(summary)



Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 210
    })
    valid: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 50
    })
    test: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 80
    })
})


### Load Base Model and setup model Configs

In [7]:
# Load model function
def load_model(model_name):
    """Load a causal language model with 8-bit bitsandbytes quantization.

    Args:
        model_name: Hugging Face model id or local path.

    Returns:
        The model from ``AutoModelForCausalLM.from_pretrained`` with
        ``device_map='auto'`` and the 8-bit quantization config applied.
    """
    # Only options BitsAndBytesConfig defines are passed. The former
    # `bnb_8bit_use_double_quant`, `bnb_8bit_quant_type` and
    # `bnb_8bit_compute_dtype` kwargs are not options it defines: double
    # quantization, NF4 and the compute dtype exist only as `bnb_4bit_*`
    # settings for 4-bit loading, so they had no effect on this 8-bit load.
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # Allow modules that are offloaded to CPU to stay in fp32.
        llm_int8_enable_fp32_cpu_offload=True,
    )
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map='auto',
        quantization_config=bnb_config,
    )
# Load model directly
model_name = "atwine/llama-2-7b-chat-fully-quantized-q4-06092023"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"
# Original model
original_model = load_model(model_name)

# Check trainable parameters
print('### Before freeze all parameters: ###')
print(print_number_of_trainable_model_parameters(original_model))
# Freeze model parameters
freeze_all_parameters(original_model)

(…)92023/resolve/main/tokenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)-q4-06092023/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)023/resolve/main/special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

(…)zed-q4-06092023/resolve/main/config.json:   0%|          | 0.00/991 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/7.01G [00:00<?, ?B/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at atwine/llama-2-7b-chat-fully-quantized-q4-06092023 and are newly initialized: ['model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.lay

(…)2023/resolve/main/generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

# Try with Base Model

In [14]:
example_idxs = [78]#[5, 9, 24, 78]#[49]#
generate_samples(example_idxs,q2f_datasets,original_model,tokenizer)



---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT:
Convert 3.5 ounces to grams
---------------------------------------------------------------------------------------------------
BASELINE OUTPUT:
UnitConvert(SourceUnit:ounce,TargetUnit:gram,SourceValue:3.5)
---------------------------------------------------------------------------------------------------
PEFT MODEL GENERATION - OUTPUT:
3.5oz=Xgrams



## PEFT training

In [15]:
# PEFT Setup
# LoRA adapter settings: rank-16 adapters on the four attention projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],#, "ffn_kernel"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# The base model is handed to SFTTrainer unchanged; the LoRA config is passed
# to the trainer through `peft_config` below instead of calling get_peft_model here.
peft_model = original_model
# peft_model = get_peft_model(original_model, lora_config)
# print(print_number_of_trainable_model_parameters(peft_model))

# Timestamped directory so each run gets its own training output folder.
output_dir = f'./peft-query-function-training-{str(int(time.time()))}'
# NOTE(review): both num_train_epochs and max_steps are set. A positive
# max_steps presumably takes precedence over num_train_epochs -- confirm
# against the TrainingArguments docs. The training log saved with this
# notebook lists steps 10-40, which does not match max_steps=10, so the
# saved output may come from a different configuration.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=5,
    logging_steps=10,
    max_steps=10
)

# Set supervised fine-tuning parameters
# Trains on the "text" column of the train split; `valid` is the eval split.
peft_trainer = SFTTrainer(
    model=peft_model,
    train_dataset=q2f_datasets['train'],
    eval_dataset=q2f_datasets['valid'],
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=peft_training_args,
    packing=False,
)

## PEFT training
peft_trainer.train()

## Save PEFT Model
# The trained model and the tokenizer are written to the same local folder.
# NOTE(review): `local_files_only` is normally a loading option; whether
# save_pretrained uses it at all is not visible here -- confirm or drop.
peft_model_path="./peft-query-function-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path,
                                   local_files_only=True
                                   )
tokenizer.save_pretrained(peft_model_path,
                          local_files_only=True
                          )

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.7616
20,0.0711
30,0.0597
40,0.05


('./peft-query-function-checkpoint-local/tokenizer_config.json',
 './peft-query-function-checkpoint-local/special_tokens_map.json',
 './peft-query-function-checkpoint-local/tokenizer.json')

In [16]:
!ls -al ./peft-query-function-checkpoint-local/adapter_model.bin

-rw-r--r-- 1 root root 67201357 Oct 16 07:05 ./peft-query-function-checkpoint-local/adapter_model.bin


In [9]:
!zip -r /content/SearchQuery2FuncCall/peft-query-function-checkpoint-local.zip /content/SearchQuery2FuncCall/peft-query-function-checkpoint-local
from google.colab import files
files.download("/content/SearchQuery2FuncCall/peft-query-function-checkpoint-local.zip")

  adding: content/SearchQuery2FuncCall/peft-query-function-checkpoint-local/ (stored 0%)
  adding: content/SearchQuery2FuncCall/peft-query-function-checkpoint-local/.ipynb_checkpoints/ (stored 0%)
  adding: content/SearchQuery2FuncCall/peft-query-function-checkpoint-local/adapter_model.bin (deflated 8%)
  adding: content/SearchQuery2FuncCall/peft-query-function-checkpoint-local/adapter_config.json (deflated 39%)
  adding: content/SearchQuery2FuncCall/peft-query-function-checkpoint-local/tokenizer_config.json (deflated 66%)
  adding: content/SearchQuery2FuncCall/peft-query-function-checkpoint-local/special_tokens_map.json (deflated 73%)
  adding: content/SearchQuery2FuncCall/peft-query-function-checkpoint-local/tokenizer.json (deflated 74%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Load PEFT model for inference
Requires restarting the runtime because of memory issues

In [None]:
# import os
# os.kill(os.getpid(), 9)

del peft_model
del tokenizer
del peft_trainer
del original_model

import gc
n = gc.collect()
torch.cuda.empty_cache()

In [4]:
# Setups
from SearchQuery2FuncCall.load_datasets_n_login_HF import q2f_datasets
from SearchQuery2FuncCall.utilities_main import *

In [2]:
### Load PEFT model from previous saved folder
# Note: This only works if the Colab session has not been disconnected since the previous training session.
# Restarting the runtime is fine.
model_name = "atwine/llama-2-7b-chat-fully-quantized-q4-06092023"
peft_model_base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
peft_model_path="./peft-query-function-checkpoint-local"#"/content/SearchQuery2FuncCall/peft-query-function-checkpoint-local"#
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False
                                      )
# peft_model = peft_model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(peft_model_path, torch_dtype=torch.bfloat16)
print(print_number_of_trainable_model_parameters(peft_model))

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at atwine/llama-2-7b-chat-fully-quantized-q4-06092023 and are newly initialized: ['model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layer

trainable model parameters: 0
all model parameters: 6755192832
percentage of trainable model parameters: 0.00%


# Generate with selected examples

In [5]:
example_indices = [5, 9, 24, 78]#[49]#
generate_samples(example_indices,q2f_datasets,peft_model,tokenizer)



---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT:
Convert 3.5 ounces to grams
---------------------------------------------------------------------------------------------------
BASELINE OUTPUT:
UnitConvert(SourceUnit:ounce,TargetUnit:gram,SourceValue:3.5)
---------------------------------------------------------------------------------------------------
PEFT MODEL GENERATION - OUTPUT:
UnitConvert(SourceUnit:ounce,TargetUnit:gram,SourceValue:3.5)

---------------------------------------------------------------------------------------------------
Example  2
---------------------------------------------------------------------------------------------------
INPUT:
Convert 1000 grams to kilograms
---------------------------------------------------------------------------------------------------
BASELINE OUTPUT:
UnitConvert(S

# Generate outputs for given test dataset
Here we are using the test dataset

In [None]:
index_s=0 # Start index for example in test dataset
df=generate_dataset(index_s,q2f_datasets,peft_model,tokenizer)
df

0
<re.Match object; span=(1000, 1048), match='Input:“how to bake a pizza”, Output:“Search()”[/'>
<re.Match object; span=(1000, 1052), match='Input:“Best movies of all time”, Output:“Search()>
<re.Match object; span=(1000, 1061), match='Input:“natural logarithm of 10”, Output:“Calculat>
<re.Match object; span=(1000, 1055), match='Input:“top 10 travel destinations”, Output:“Searc>
<re.Match object; span=(1000, 1118), match='Input:“Convert 2 acres to square meters”, Output:>
<re.Match object; span=(1000, 1058), match='Input:“logarithm of 1000”, Output:“Calculate(log(>
<re.Match object; span=(1000, 1058), match='Input:“interesting facts about space”, Output:“Se>
<re.Match object; span=(1000, 1116), match='Input:“cubic feet to cubic meters”, Output:“UnitC>
<re.Match object; span=(1000, 1059), match='Input:“cosine of 45 degrees”, Output:“Calculate(c>
<re.Match object; span=(1000, 1056), match='Input:“inventions by Thomas Edison”, Output:“Sear>
10
<re.Match object; span=(1000, 1071), match='I

Unnamed: 0,inputs,outputs,API_outputs_peft
0,how to bake a pizza,Search(),Search()
1,Best movies of all time,Search(),Search()
2,natural logarithm of 10,Calculate(ln(10)),Calculate(ln(10))
3,top 10 travel destinations,Search(),Search()
4,Convert 2 acres to square meters,"UnitConvert(SourceUnit:acre,TargetUnit:squarem...","UnitConvert(SourceUnit:acre,TargetUnit:squarem..."
...,...,...,...
75,Evaluate the expression 10 + 3,Calculate(10+3),Calculate(10+3)
76,Recipe for chocolate cake,Search(),Search()
77,square root of 100,Calculate(sqrt(100)),Calculate(sqrt(100))
78,15 divided by 3,Calculate(15/3),Calculate(15/3)


# Evaluate using ROUGE and BLEU scores

In [None]:
# `Series.values` is a property (a NumPy array), so calling `.values()` raises
# TypeError; pass plain Python lists of strings to the metric helpers instead.
evaluate_generations(df['Outputs'].tolist(),df['Outputs_generated'].tolist())

API MODEL ROUGE SCORES:
{'rouge1': 0.991875, 'rouge2': 0.7375, 'rougeL': 0.991875, 'rougeLsum': 0.991875}
API MODEL BLEU SCORES:
{'bleu': 0.9727731466373349, 'precisions': [0.9926793557833089, 0.9867330016583747, 0.9789674952198852, 0.9729119638826185], 'brevity_penalty': 0.9898034391264364, 'length_ratio': 0.9898550724637681, 'translation_length': 683, 'reference_length': 690}


## Evaluation Results

MODEL ROUGE SCORES:
{'rouge1': 0.991875, 'rouge2': 0.7375, 'rougeL': 0.991875, 'rougeLsum': 0.991875}

MODEL BLEU SCORES:
{'bleu': 0.9727731466373349, 'precisions': [0.9926793557833089, 0.9867330016583747, 0.9789674952198852, 0.9729119638826185], 'brevity_penalty': 0.9898034391264364, 'length_ratio': 0.9898550724637681, 'translation_length': 683, 'reference_length': 690}