Function-calling capabilities in large language models open up new possibilities for automation and engineering. In this notebook I fine-tune `google/gemma-1.1-2b-it` for function calling.

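See the original papers for the methods related to this notebook: [LoRA](https://arxiv.org/abs/2106.09685), [QLoRA](https://arxiv.org/abs/2305.14314), and reinforcement learning from human feedback (RLHF) [1](https://arxiv.org/pdf/2203.02155.pdf), [2](https://arxiv.org/pdf/2009.01325.pdf). Note that the code below performs supervised fine-tuning with LoRA adapters; the 4-bit quantization config needed for QLoRA is included but commented out.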

----------------------------------------------------
Before running, add your Hugging Face API token as `HF_TOKEN` and your Weights & Biases (W&B) API key as `WB` in the Google Colab secrets.

Make sure the Colab runtime type is set to GPU (not CPU).



# Model and Data

In [None]:
!pip install -qU transformers datasets peft trl flash-attn einops wandb
!pip install -qU accelerate bitsandbytes # Do not need it when using just cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB

In [None]:
import os
from google.colab import userdata

import wandb

#os.environ["LANGCHAIN_API_KEY"] = userdata.get('LANGCHAIN_API_KEY')

# Hugging Face token, read from the Colab secret named HF_TOKEN and exposed through the
# environment variable of the same name.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

# W&B key, read from the Colab secret named WB.
# It is also exported as WANDB_API_KEY because the TrainingArguments cell decides whether
# to report to W&B by checking os.environ["WANDB_API_KEY"]; the recorded login output only
# shows the key being appended to ~/.netrc, so without this export that check may never
# see the key.
wandb_api_key = userdata.get('WB')
os.environ["WANDB_API_KEY"] = wandb_api_key

# Kept as the last expression so the cell still displays the login result.
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    GenerationConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
    Trainer,
    DataCollatorForLanguageModeling
)

from datasets import load_dataset, DatasetDict
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

#from huggingface_hub import notebook_login # notebook_login()
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
model_id = 'google/gemma-1.1-2b-it'

# Optional 4-bit quantization (QLoRA-style). Disabled: the model is loaded in plain
# bfloat16 below. To enable it, uncomment this config and the `quantization_config` line.
# bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16
# )

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    #quantization_config=bnb_config,
    #attn_implementation="flash_attention_2",
    device_map="auto",           # Automatically distribute the model across available devices
    # bfloat16 matches the dtype recorded in the model config. float16 has more mantissa
    # bits (finer precision) but a much narrower exponent range than bfloat16.
    torch_dtype=torch.bfloat16,
)
#model.config.use_cache = False

# Maximum sequence length (in tokens); also passed to the trainer as `max_seq_length`.
max_length = 2099
print("max_length", max_length)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side='right',
    # `model_max_length` is the tokenizer argument that records the length limit;
    # the previous `max_length=` keyword is not a length-limit argument of this loader.
    model_max_length=max_length,
    add_eos_token=True,
)

In [None]:
# Debugging helpers, kept disabled: print the number of parameter tensors and the inner
# `model` module, then the type and size of every parameter.
# print( len( list( model.parameters() ) ) , model.__dict__['_modules']['model'] )
# for param in model.parameters():
#      print(type(param), param.size())

# Bare expression as the last line of the cell, so the notebook displays the model config.
model.config

GemmaConfig {
  "_name_or_path": "google/gemma-1.1-2b-it",
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.1",
  "use_cache": true,
  "vocab_size": 256000
}

In [None]:
# PEFT integration guide: https://huggingface.co/docs/transformers/main/en/peft
# Saving/loading adapters and checkpoints:
# https://discuss.huggingface.co/t/correct-way-to-save-load-adapters-and-checkpoints-in-peft/77836/8

# Attention projections and MLP projections that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
    # Only layers 16 and 17 are adapted (the model config reports 18 hidden layers);
    # add more layers here if your GPU resources allow.
    layers_to_transform=[16, 17],
)

# Wrap the base model with a named adapter and report how many parameters are trainable.
model = get_peft_model(model, lora_config, adapter_name="function_calling")
model.print_trainable_parameters()


# Alternative ways of attaching the adapter, kept for reference:
#model.add_adapter(lora_config, adapter_name="function_calling")

#### NOTE: we do this from TrainingArguments to control everything from one place
# model.add_adapter('function_calling', peft_config=lora_config)
# model.train_adapter('function_calling', peft_config=lora_config)
# model.gradient_checkpointing_enable()

#### To load a saved adapter and keep it trainable
# from peft import PeftModel
# model = PeftModel.from_pretrained(model, "some_name", is_trainable=True,)

trainable params: 1,089,536 || all params: 2,507,261,952 || trainable%: 0.0435


In [None]:
data_id = "NickyNicky/function-calling_chatml_gemma_v1"

def prepare_dataset(dataset_name=data_id, split="train", train_size=1000,
                    test_start=2000, test_size=100, seed=1024):
    """Load a dataset split and carve disjoint train/test subsets out of one shuffle.

    The default sizes reproduce the quick-demo slice: rows 0..999 of the shuffled data
    for training and rows 2000..2099 for testing. For the full data use, for example,
    train_size=98000, test_start=98000, test_size=2187.

    Args:
        dataset_name: Hub id of the dataset passed to `load_dataset`.
        split: Split to load (slices such as "train[:1%]" also work).
        train_size: Number of shuffled rows used for training (taken from index 0).
        test_start: Index in the shuffled data where the test subset begins.
        test_size: Number of shuffled rows used for testing.
        seed: Seed for the shuffle, so the subsets are reproducible.

    Returns:
        DatasetDict with the keys 'train' and 'test'.

    Raises:
        ValueError: If the test range overlaps the training range, or the loaded
            split has too few rows for the requested subsets.
    """
    if test_start < train_size:
        raise ValueError(
            f"test_start ({test_start}) must be >= train_size ({train_size}) "
            "so that the train and test subsets do not overlap"
        )

    raw_dataset = load_dataset(dataset_name, split=split)

    rows_needed = test_start + test_size
    if len(raw_dataset) < rows_needed:
        raise ValueError(
            f"split has {len(raw_dataset)} rows but {rows_needed} are required"
        )

    # Shuffle once and slice both subsets from the same permutation. The former
    # identity `map` pass is dropped: it returned every batch unchanged.
    shuffled = raw_dataset.shuffle(seed=seed)

    #TRAIN-TEST-SPLIT
    return DatasetDict({
        'train': shuffled.select(range(train_size)),
        'test': shuffled.select(range(test_start, test_start + test_size)),
    })

# Reuse the dataset id defined at the top of this cell instead of repeating the literal,
# so the two names cannot drift apart.
dataset_name = data_id

# Build the train/test subsets and expose them under the names the trainer cell uses.
dataset = prepare_dataset(dataset_name, split="train")
train_dataset, test_dataset = dataset['train'], dataset['test']

In [None]:
from datetime import datetime

# Timestamp that makes every run name unique.
run_timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

training_arguments = TrainingArguments(
    # ---- run bookkeeping -------------------------------------------------
    run_name=f"function_calling_{run_timestamp}",
    # NOTE(review): when WANDB_API_KEY is not set this passes None - confirm how the
    # installed transformers version interprets report_to=None.
    report_to="wandb" if os.environ.get("WANDB_API_KEY") else None,
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    # max_steps = 1000,
    # push_to_hub=True,

    # ---- batching --------------------------------------------------------
    per_device_train_batch_size=1,   # alt: 4
    per_device_eval_batch_size=1,    # alt: 4
    group_by_length=True,
    # gradient_accumulation_steps = 2,
    # gradient_checkpointing=True,

    # ---- optimisation ----------------------------------------------------
    optim="paged_adamw_8bit",        # alt: "paged_adamw_32bit"
    learning_rate=2e-4,
    #lr_scheduler_type='linear',     # default
    weight_decay=0.01,
    max_grad_norm=1,                 # alt: 0.3
    warmup_ratio=0.1,                # alt: 0.03
    bf16=True,                       # alt: fp16=True
    num_train_epochs=1,

    # ---- checkpointing and evaluation --------------------------------------
    # Version 1 (no checkpoints):   save_strategy="no"
    # Version 2 (step based):       save_strategy="steps", eval_strategy="steps",
    #                               save_steps=100, eval_steps=100,
    #                               load_best_model_at_end=True
    # Version 3 (epoch based) is the one in use:
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,              # how many checkpoints to keep
)


## About lr_scheduler_type:
# "linear" or "cosine" are good starting points for many tasks.
# "cosine_with_restarts" can be beneficial for longer training runs.
# "polynomial" can be useful when you want the learning rate to decrease faster initially.
# "constant" or "constant_with_warmup" might be suitable for fine-tuning pre-trained models.
# "inverse_sqrt" is less common but can be used for specific tasks.

In [None]:
# from transformers import TrainerCallback #, TrainerControl, TrainerState,


# class SaveBestModelCallback(TrainerCallback):
#     def __init__(self, metric_name="accuracy"):
#         super().__init__()
#         self.metric_name = metric_name
#         self.best_metric = float("-inf")  # Initialize with negative infinity

#     def on_evaluate(self, eval_result, **kwargs):
#         current_metric = eval_result[self.metric_name]
#         if current_metric > self.best_metric:
#             self.best_metric = current_metric
#             trainer = kwargs.get("trainer")
#             if trainer is not None:
#                 trainer.model.save_pretrained(trainer.args.output_dir)
#                 print(f"Saved new best model with {self.metric_name}: {current_metric:.4f}")

# callback = SaveBestModelCallback(metric_name="eval_accuracy")


# Collator for causal language modelling (mlm=False, i.e. no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Supervised fine-tuning trainer wired to the model, datasets and settings defined above.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=lora_config,
    #callbacks=[callback],       # if in TrainingArguments you used: save_strategy="no"
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="Text",   # presumably the dataset column holding the text; alt: "text"
    max_seq_length=max_length,
    packing=True,
    data_collator=data_collator,
)

In [None]:
# # This casts the layer norms to torch.bfloat16, which you need if you use quantization for the rest of the model.
# for name, module in trainer.model.named_modules():
#     if "norm" in name:
#         module = module.to(torch.bfloat16)

# Train the model

In [None]:
import os
# Sets the PyTorch CUDA allocator option `expandable_segments:True`.
# NOTE(review): the model was already loaded with device_map="auto" in an earlier cell, so
# CUDA memory has probably been allocated before this point; the allocator reads this
# variable when it initialises, so setting it here may have no effect. Confirm, and
# consider setting it before the first CUDA use.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Run the fine-tuning loop configured by `training_arguments`.
trainer.train()

In [None]:
# Save the adapter weights.
# The adapter name is passed through the `selected_adapters` keyword: the second positional
# parameter of PeftModel.save_pretrained is `safe_serialization`, so passing the name
# positionally set that flag instead of selecting the adapter.
model.save_pretrained(
    "./results/function_calling_adapter",
    selected_adapters=["function_calling"],
)

# # Merge the saved adapter with the model
# model.load_adapter("./results/function_calling_adapter")
# # If you used quantization you need to load unquantized fresh model first because you cannot merge adapter with quantized model
# # model = AutoModelForCausalLM.from_pretrained(model_id)
# model.merge_adapter("function_calling")

# Merge the adapter into the base weights directly; `model` is rebound to the merged model.
model = model.merge_and_unload()

# Save the merged model and the tokenizer to the paths the next cell reloads from.
model_path = "./fine-tuned-gemma/model"
tokenizer_path = "./fine-tuned-gemma/tokenizer"
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

In [None]:
# Reload the merged model with the same placement and precision used for the original
# load (device_map="auto", bfloat16), instead of the loader's defaults.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Generate

In [None]:
# model.config.use_cache = True
# model.eval()

In [None]:
class ListOfTokensStoppingCriteria(StoppingCriteria):
    """
    Stopping criterion that fires when the sequence ends with one of the stop strings.

    Each stop string is encoded to token ids once, at construction time. On every call
    the tail of the first sequence in `input_ids` is compared against each encoded
    stop sequence.
    """
    def __init__(self, tokenizer, stop_tokens):
        self.tokenizer = tokenizer
        # One list of token ids per stop string, encoded without special tokens so it
        # can be compared directly with the end of the generated ids.
        self.stop_token_ids_list = []
        for stop_token in stop_tokens:
            encoded = tokenizer.encode(stop_token, add_special_tokens=False)
            self.stop_token_ids_list.append(encoded)

    def __call__(self, input_ids, scores, **kwargs):
        # Only the first row of the batch is inspected.
        sequence = input_ids[0]
        return any(
            len(sequence) >= len(stop_ids)
            and sequence[-len(stop_ids):].tolist() == stop_ids
            for stop_ids in self.stop_token_ids_list
        )

# Strings that end generation: the end-of-turn marker.
stop_tokens = ["<end_of_turn>"]

# Custom criterion built from the tokenizer and the stop strings.
stopping_criteria = ListOfTokensStoppingCriteria(tokenizer, stop_tokens)

# Container passed to `generate`, holding the custom criterion as its only entry.
stopping_criteria_list = StoppingCriteriaList()
stopping_criteria_list.append(stopping_criteria)

In [None]:

    # {
    #     "name": "convert_currency",
    #     "description": "Convert amount from one currency to another",
    #     "parameters": {
    #         "type": "object",
    #         "properties": {
    #             "amount": {
    #                 "type": "number",
    #                 "description": "The amount to convert"
    #             },
    #             "from_currency": {
    #                 "type": "string",
    #                 "description": "The currency to convert from"
    #             },
    #             "to_currency": {
    #                 "type": "string",
    #                 "description": "The currency to convert to"
    #             }
    #         },
    #         "required": [
    #             "amount",
    #             "from_currency",
    #             "to_currency"
    #         ]
    #     }
    # },

# Tool specifications offered to the model, as a JSON array kept in a plain string.
# Two functions are described: `calculate_bmi` and `check_word_count`. The string is
# inserted verbatim between the <tool> tags of the prompt built below.
function="""[
    {
        "name": "calculate_bmi",
        "description": "Calculate the Body Mass Index (BMI)",
        "parameters": {
            "type": "object",
            "properties": {
                "weight": {
                    "type": "number",
                    "description": "The weight in kilograms"
                },
                "height": {
                    "type": "number",
                    "description": "The height in meters"
                }
            },
            "required": [
                "weight",
                "height"
            ]
        }
    },
    {
        "name": "check_word_count",
        "description": "Check the word count of a given text",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "The text to count the words"
                }
            },
            "required": [
                "text"
            ]
        }
    }
]"""


# User request used for this demo.
question="Hi, I would like to calculate my BMI. I weigh 70 kilograms and my height is 1.75 meters."

# Full prompt: a system turn (tool list plus the expected <function_call> / <observation>
# formats), the user turn, and an opened `function_call` turn for the model to complete.
# Doubled braces ({{ }}) are literal braces in the f-string; {function} and {question}
# are substituted. The prompt already starts with <bos>.
input_text = f"""<bos><start_of_turn>system
You are a helpful assistant with access to the following functions.
Use them if required:
<tool>
{function}
</tool>

To use these functions respond with:
<function_call> {{"name": "function_name", "arguments": {{"arg_1": "value_1", "arg_2": "value_2", ...}}}} </function_call>

Contains properties essential for the model to respond according to the tasks:
<observation> {{"arg_1": "value_1", "arg_2": "value_2", "arg_3": "value_3", ...}} </observation>

Edge cases you must handle:
 - If there are no functions that match the user request, you will respond politely that you cannot help.
<end_of_turn>
<start_of_turn>user
{question}<end_of_turn>
<start_of_turn>function_call
"""

max_new_tokens=1500

# Sampling setup: low temperature with a small top-k; repetition_penalty=1. applies no penalty.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.20,
    # top_p=0.55,
    top_k=3, #50,
    repetition_penalty=1.,
    max_new_tokens=max_new_tokens,
)

# The prompt string already contains <bos>, so the tokenizer must not add special tokens.
inputs = tokenizer.encode(input_text, add_special_tokens=False, return_tensors="pt")
inputs = inputs.to(model.device)

# Generate until the stopping criteria fire or `max_new_tokens` is reached.
outputs = model.generate(
    input_ids=inputs,
    generation_config=generation_config,
    stopping_criteria=stopping_criteria_list,
)

# Decode prompt plus completion, keeping special tokens so the turn markers stay visible.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
print(generated_text)

<bos><start_of_turn>system
You are a helpful assistant with access to the following functions.
Use them if required:
<tool>
[
{
    "name": "calculate_bmi",
    "description": "Calculate the Body Mass Index (BMI)",
    "parameters": {
        "type": "object",
        "properties": {
            "weight": {
                "type": "number",
                "description": "The weight in kilograms"
            },
            "height": {
                "type": "number",
                "description": "The height in meters"
            }
        },
        "required": [
            "weight",
            "height"
        ]
    }
},
{
    "name": "check_word_count",
    "description": "Check the word count of a given text",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "The text to count the words"
            }
        },
        "required": [
            "text"
        ]
    }
}
