In [None]:
! rm -r /content/SearchQuery2FuncCall
!git clone https://github.com/XiaoLIUau/SearchQuery2FuncCall.git

In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1

%pip install --upgrade accelerate\
             --upgrade huggingface_hub

%pip install \
    bitsandbytes>=0.39.0 \
    transformers==4.28.1 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1


In [None]:
import os
import pandas as pd
import torch
import evaluate

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Load and process dataset

In [None]:
from SearchQuery2FuncCall.setup_dataset import text2json, load_n_process_data

# Paths used by the dataset helpers imported from the cloned repository.
DATASET_TXT_PATH = '/content/SearchQuery2FuncCall/Dataset.txt'
# NOTE(review): presumably this JSON file is what text2json writes — confirm against setup_dataset.py
DATASET_JSON_PATH = '/content/q2f_dataset.json'

text2json(DATASET_TXT_PATH)

# Alternative dataset kept for reference:
# q2f_datasets = load_n_process_data('/content/non_search_examples.json')
q2f_datasets = load_n_process_data(DATASET_JSON_PATH)
q2f_datasets

# Load Huggingface API key
Here we use Huggingface models

Note: Please upload a text file that contains your Hugging Face API token to the `/content/` folder (the next cell reads it from there).

>Name the file ***'api_key_huggingface.txt'***


In [None]:

# from huggingface_hub import login
# login()

# Get model api key
def load_api_key_from_file(file_path):
    """Read the text file at ``file_path`` and return its contents stripped of
    leading and trailing whitespace."""
    with open(file_path, 'r') as key_file:
        return key_file.read().strip()

# Store the token read from the uploaded key file in an environment variable
# so that the shell command below can reference it as $HUGGINGFACE_TOKEN.
os.environ["HUGGINGFACE_TOKEN"] = load_api_key_from_file('/content/api_key_huggingface.txt')

# Log in to the Hugging Face Hub through the CLI with that token.
# NOTE(review): the token is passed on the command line; make sure this cell's
# output is cleared before sharing the notebook.
!huggingface-cli login --token $HUGGINGFACE_TOKEN

# Load model

In [None]:
# Load the model directly with 8-bit bitsandbytes quantization.
# Only `load_in_8bit` is passed: `bnb_8bit_use_double_quant`,
# `bnb_8bit_quant_type` and `bnb_8bit_compute_dtype` are not parameters that
# BitsAndBytesConfig defines (its double-quant / quant-type / compute-dtype
# options are the `bnb_4bit_*` ones, used together with `load_in_4bit`).
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # llm_int8_enable_fp32_cpu_offload=True
)
model_name="atwine/llama-2-7b-chat-fully-quantized-q4-06092023"
# A tokenizer holds no weights, so no `torch_dtype` is passed to it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    cache_dir="/tmp/model_cache/",
    # NOTE(review): this looks like a placeholder path; point it at a real
    # writable folder if weight offloading is actually needed.
    offload_folder="/path/to/offload_folder",
)

# Print model trainable parameters

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a short report on how many of ``model``'s parameters are trainable.

    Despite its name, this function does not print anything; it returns the
    report so the caller can print or log it.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` flag.

    Returns:
        str: Three lines giving the trainable parameter count, the total
        parameter count, and the trainable percentage with two decimals.
        A model without any parameters reports ``0.00%``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count
    # Guard the division: a parameter-less model would otherwise raise
    # ZeroDivisionError.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage:.2f}%"

# Show the trainable / total parameter report for the loaded model.
print(print_number_of_trainable_model_parameters(original_model))

# Define prompt template

In [None]:
# define prompt format
def create_prompt(input):
    """Wrap a search query in the few-shot routing prompt.

    The returned text is the fixed instruction block (rules plus worked
    examples) followed by the query in curly quotes, tagged with ``Input:``
    and ending in ``, Output:``.

    Args:
        input: The search query; it is formatted into the prompt as text.

    Returns:
        str: The full prompt.
    """
    # Tags that mark the query and the expected answer in every example.
    # Alternative tags tried earlier: '<User Query>:' and ', <API Call>: '.
    query_tag = 'Input:'
    answer_tag = ', Output:'
    instruction = f"""Instruction: Given a search query, then route to different backend components based on the search intent.
1. If the search is about unit conversion, return API function UnitConvert(SourceUnit, TargetUnit, SourceValue).
2. If the search is about calculation, return API function Calculate(Equation).
3. If the search is about other search intent, return API function Search().
* For unit conversion: common unit conversion in length, mass, time, area, speed, temperature, volume should be covered. And it should be consistent for the same unit throughout. E.g. it should always be “foot”, it cannot be “feet” or “ft” in API calls.
* For calculation: common operation such as +, -, *, /, pow, log, ln, exp, tan(h), sin(h), cos(h), factorial should be covered. And it should be consistent for the same operation throughout. E.g. it should always be “ * ”, it cannot be “x” or “X” in API calls.
Handle input queries in different language styles. Cover common unit conversion and calculation operations.

Examples:
{query_tag}“ft to cm”{answer_tag}“UnitConvert(SourceUnit:foot, TargetUnit:centimeter,
SourceValue:1)”
{query_tag}“how many ounces in 5.8 kilograms”{answer_tag}“UnitConvert(SourceUnit:kilogram,
TargetUnit:ounce, SourceValue:5.8)”
{query_tag}“two to the power of 10”{answer_tag}“Calculate(2^10)”
{query_tag}“2001-1989” {answer_tag}“Calculate(2001-1989)”
{query_tag}“what is chatgpt”{answer_tag}“Search()”
{query_tag}“primary year 1 maths calculation checklist”{answer_tag}“Search()”
{query_tag}“what are different length units”{answer_tag}“Search()”
{query_tag}“Natural logarithm of -3/18”{answer_tag}“Calculate(ln(-3/18))”

Only return output of the the given input.

"""
    # Append the actual query in the same Input/Output shape as the examples.
    return f'{instruction}{query_tag}“{input}”{answer_tag}'

# Generate response in tokens from loaded model
Both the `input_ids` passed in and the generations returned are token IDs, not text.

In [None]:
def model_generate(original_model, inputs):
    """Generate output token ids for an already tokenized prompt.

    Relies on the notebook-level ``tokenizer`` for the EOS token id.

    Args:
        original_model: Model exposing ``generation_config`` and ``generate``.
        inputs: Tokenizer output exposing ``input_ids`` and ``attention_mask``.

    Returns:
        Whatever ``original_model.generate`` returns for the prompt.
    """
    import copy

    # Work on a copy so the model's own default generation settings are not
    # rewritten as a side effect of every call.
    generation_config = copy.deepcopy(original_model.generation_config)
    generation_config.max_new_tokens = 30
    # NOTE(review): temperature / top_p only take effect in sampling mode;
    # whether sampling is enabled depends on the config shipped with the
    # model. Confirm that near-deterministic decoding is what actually runs.
    generation_config.temperature = 0.00000000000001
    generation_config.top_p = 0.9
    generation_config.num_return_sequences = 1
    # The EOS token doubles as the padding token.
    generation_config.pad_token_id = tokenizer.eos_token_id
    generation_config.eos_token_id = tokenizer.eos_token_id

    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    # Place the prompt tensors on the model's device when it reports a concrete
    # one, instead of handing over tensors that may live on another device.
    # A 'meta' device is skipped because tensors cannot be moved onto it.
    device = getattr(original_model, 'device', None)
    if device is not None and getattr(device, 'type', None) != 'meta':
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    return original_model.generate(
            input_ids = input_ids,
            attention_mask = attention_mask,
            generation_config = generation_config,
            )

# Generate response text for a given input query

In [None]:
def generated_text(input):
    """Build the prompt for ``input``, run the model on it and return the
    decoded text of the first generated sequence (special tokens skipped)."""
    encoded_prompt = tokenizer(create_prompt(input), return_tensors='pt')
    output_sequences = model_generate(original_model, encoded_prompt)
    first_sequence = output_sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Post-process the generation output from the LLM


In [None]:
def extractOutputString(input_string,output_string):
    """Pull the answer for ``input_string`` out of the text in ``output_string``.

    The text is searched for ``Input: “<input_string>” , Output: “<answer>”``
    (whitespace around the separators is optional). The first such occurrence
    is used and the answer ends at the first closing curly quote. If nothing
    matches, the whole ``output_string`` is used instead.

    Args:
        input_string: The query exactly as it appears after ``Input:``.
        output_string: Text to search.

    Returns:
        str: The answer with one leading and one trailing quotation mark
        removed (if present) and all whitespace stripped out.
    """
    import re
    # The capture stops at the first closing curly quote. Excluding only the
    # ASCII double quote let the greedy match run on to the last ” in the text
    # and swallow any further "Input/Output" pairs the model generated.
    output_match = re.search(
        rf'Input:\s*“{re.escape(input_string)}”\s*,\s*Output:\s*“([^”]+)”',
        output_string,
    )
    if output_match:
        output_string = output_match.group(1)
    # Remove one surrounding quotation mark on each side, if present.
    quote_chars = ('“', '”', "'", '"')
    if output_string.startswith(quote_chars):
        output_string = output_string[1:]
    if output_string.endswith(quote_chars):
        output_string = output_string[:-1]
    # Remove all whitespace from the output.
    return "".join(output_string.split())

# Generate with selected examples

In [None]:
# Indices of the test examples to display.
example_indices = [9, 40, 50]

# Separator line made of 99 dashes.
dash_line = '-' * 99

# enumerate supplies the running example number; the loop previously printed
# an undefined name `i`. The loop variables are also named so that they no
# longer shadow the builtin `input`.
for example_number, index in enumerate(example_indices, start=1):
    example = q2f_datasets['test'][index]
    query = example['input']
    expected_output = example['output']

    generated = extractOutputString(query, generated_text(query))

    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print(f'INPUT:\n{query}')
    print(dash_line)
    print(f'BASELINE OUTPUT:\n{expected_output}')
    print(dash_line)
    print(f'MODEL GENERATION - OUTPUT:\n{generated}\n')

# Generate outputs for given test dataset
Here we are using the test dataset

In [None]:
# Range of test examples to run; adjust index_s / index_e to evaluate a subset.
index_s=0
index_e=index_s+len(q2f_datasets['test'])
test_slice = q2f_datasets['test'][index_s:index_e]
inputs = test_slice['input']
outputs = test_slice['output']

# Generate and post-process one API call per query. The loop variable is
# named `query` so the builtin `input` is not shadowed at notebook scope.
API_outputs = [
    extractOutputString(query, generated_text(query))
    for query in inputs
]

zipped_summaries = list(zip(inputs, outputs, API_outputs))

df = pd.DataFrame(zipped_summaries, columns = ['inputs', 'outputs', 'API_outputs'])
df

# Evaluate using ROUGE and BLEU scores

In [None]:
# ROUGE: compare the generated API calls with the reference outputs,
# truncating the references to the number of predictions.
rouge = evaluate.load('rouge')
API_model_results = rouge.compute(
    predictions=API_outputs,
    references=outputs[:len(API_outputs)],
    use_aggregator=True,
    use_stemmer=True,
)
print('API MODEL ROUGE SCORES:', API_model_results, sep='\n')

# BLEU: same predictions against the full reference list.
# Note that API_model_results is rebound here, replacing the ROUGE scores.
bleu = evaluate.load('bleu')
API_model_results = bleu.compute(predictions=API_outputs, references=outputs)
print('API MODEL BLEU SCORES:', API_model_results, sep='\n')

## Results

* Test examples for index 50:70


### API MODEL ROUGE SCORES:

>{'rouge1': 0.9371428571428572, 'rouge2': 0.5419047619047619, 'rougeL': 0.9399999999999998, 'rougeLsum': 0.9371428571428572}

### API MODEL BLEU SCORES:

>{'bleu': 0.867918734298719, 'precisions': [0.9074074074074074, 0.8802816901408451, 0.8524590163934426, 0.8333333333333334], 'brevity_penalty': 1.0, 'length_ratio': 1.0657894736842106, 'translation_length': 162, 'reference_length': 152}