<a href="https://colab.research.google.com/github/RiteshDKgpian/X-tra-Telegram/blob/master/Fine_Tuned_LLM_for_Function_Calling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Taken from unsloth example
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # RTX 30xx, RTX 40xx, A100, H100, L40
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes pydantic jinja2
else:
    # V100, Tesla T4, RTX 20xx
    !pip install --no-deps xformers trl peft accelerate bitsandbytes pydantic jinja2
pass

In [2]:
import json
from jinja2 import Template
from typing import List, Dict

# Jinja2 source for the Llama-3 style function-calling prompt.
# Expected render variables: `messages` (list of role/content dicts),
# `tools` (pre-serialised tool schemas) and `tool_call` (bool).
lama3_template = (
    # System header: emitted only when the first message is a system turn;
    # the tool schemas are placed right after the system content.
    "{% if messages[0]['role'] == 'system' %}"
    "<|start_header_id|>system<|end_header_id|>\n\n"
    "{{ messages[0]['content'] }}\n"
    "{{ tools }}\n"
    "{% endif %}"
    # Conversation turns, one header per recognised role.
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "{{ message['content'] }}\n"
    # A 'tool' turn is rendered as an assistant turn carrying a function call.
    "{% elif message['role'] == 'tool' %}"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
    "<functioncall> {{ message['content'] }}<|eot_id|>\n"
    # A 'tool_response' turn is rendered under an assistant header, without <|eot_id|>.
    "{% elif message['role'] == 'tool_response' %}"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
    "{{ message['content'] }}\n"
    "{% elif message['role'] == 'assistant' %}"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
    "{{ message['content'] }}<|eot_id|>\n"
    "{% endif %}"
    "{% endfor %}"
    # Optional generation prompt: open an assistant turn that starts a function call.
    "{% if tool_call %}"
    "<|start_header_id|>assistant<|end_header_id|>\n\n<functioncall> "
    "{% endif %}"
)

def render(messages: List[Dict[str, str]], tools: List[Dict[str, str]], tool_call=False) -> str:
    """Render a conversation and its tool schemas into a single prompt string.

    Args:
        messages: Conversation turns, each a dict with 'role' and 'content'
            keys. If the list is empty or does not start with a 'system'
            turn, an empty system turn is prepended to a *copy* of the list;
            the caller's list is never modified.
        tools: Tool schemas. Each one is serialised with
            ``json.dumps(..., indent=4)`` and the results are joined with
            newlines before being passed to the template.
        tool_call: Passed to the template as ``tool_call``; when truthy the
            template appends an open assistant header followed by
            ``<functioncall> ``.

    Returns:
        The prompt produced by rendering ``lama3_template``.
    """
    # Ensure there is a system message at the beginning. Build a new list
    # rather than calling insert() so the caller's conversation is not mutated,
    # and so an empty conversation does not raise IndexError.
    if not messages or messages[0]['role'] != 'system':
        messages = [{'role': 'system', 'content': ''}, *messages]

    # One pretty-printed JSON document per tool, separated by newlines.
    tools_json = '\n'.join(json.dumps(tool, indent=4) for tool in tools)

    # Create a Jinja template instance and render it with the prepared values.
    template = Template(lama3_template)
    return template.render(messages=messages, tools=tools_json, tool_call=tool_call)

def get_func_call(text: str, prompt: str) -> str:
    """Extract the generated text that follows ``prompt`` inside ``text``.

    Returns the segment of ``text`` found after the first occurrence of
    ``prompt`` (and before any later occurrence of it), cut off at the first
    ``<|eot_id|>`` marker if one is present.

    Raises:
        IndexError: if ``prompt`` does not occur in ``text``.
    """
    # Index 1 is the piece immediately after the first occurrence of the prompt.
    after_prompt = text.split(prompt)[1]
    # Keep only what precedes the end-of-turn marker (everything if it is absent).
    call_text, _marker, _rest = after_prompt.partition("<|eot_id|>")
    return call_text


In [3]:
from typing import List, Dict

def test_render_arithmetic_prompt():
    """Build the sample arithmetic prompt and return the rendered string.

    The conversation holds a user question, a prior ``mul`` function call and
    its result; the ``add`` and ``mul`` tool schemas are supplied and the
    prompt is rendered with ``tool_call=True``.
    """

    def two_operand_tool(name, summary, verb):
        # Schema for a tool taking operands "a" (number) and "b" (integer).
        # Key order is deliberate: it determines the serialised JSON layout.
        operands = {
            "a": {
                "type": "number",
                "description": f"The first number to {verb}"
            },
            "b": {
                "type": "integer",
                "description": f"The second number to {verb}"
            },
        }
        return {
            "name": name,
            "description": summary,
            "parameters": {
                "type": "object",
                "properties": operands,
                "required": ["a", "b"],
            },
        }

    conversation = [
        {'role': 'user', 'content': 'Calculate the 3 * 12 + 3?'},
        {'role': 'tool', 'content': '{"name": "mul", "arguments": \'{"a": 3, "b": 12}\'} <|eot_id|>'},
        {'role': 'tool_response', 'content': '{"result": 36}'},
    ]

    available_tools = [
        two_operand_tool("add", "Calculate the sum of two numbers", "add"),
        two_operand_tool("mul", "Calculate the product of two numbers", "multiply"),
    ]

    return render(conversation, available_tools, tool_call=True)

def test_rednder_email_prompt():
    """Build the sample "send an e-mail" prompt and return the rendered string.

    Uses an explicit system turn, one user request and a single ``send_email``
    tool schema, rendered with ``tool_call=True``.
    """
    # Argument schemas for the send_email tool.
    recipient_field = {
        "type": "string",
        "description": "The email address of the recipient"
    }
    message_field = {
        "type": "string",
        "description": "The message to send"
    }

    send_email_tool = {
        "name": "send_email",
        "description": "Send an email for the given recipient and message",
        "parameters": {
            "type": "object",
            "properties": {
                "recipient": recipient_field,
                "message": message_field,
            },
            "required": ["recipient", "message"],
        },
    }

    conversation = [
        {"role": "system", "content": "You are a helpful assistant with access to the following functions. Use them if required -"},
        {'role': 'user', 'content': "Hi, send an email to tom@kidocode.com and ask him to join our weekend party?"},
    ]

    return render(conversation, [send_email_tool], tool_call=True)

# Try out the functions to see how we format the prompt
# print(test_render_arithmetic_prompt())
# print(test_rednder_email_prompt())


In [4]:
from unsloth import FastLanguageModel
from pydantic import BaseModel
import torch
# Maximum sequence length requested from the loader (8192 tokens).
max_seq_length = 4096 * 2
# dtype is left unset — presumably lets Unsloth pick a dtype for the GPU; TODO confirm.
dtype = None

# Load the Llama-3 function-calling LoRA adapter and its tokenizer, with 4-bit loading enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unclecode/llama3-function-call-lora-adapter-240424",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = True,
)
# Prepare the loaded model for generation (Unsloth inference mode, per the API name).
FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


adapter_config.json:   0%|          | 0.00/741 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
# Build the arithmetic example prompt. (The e-mail prompt that used to be
# assigned first was overwritten immediately and never used in this cell.)
prompt = test_render_arithmetic_prompt()

# Tokenize as a batch of one and move the tensors to the GPU.
inputs = tokenizer(
[
    prompt
], return_tensors = "pt").to("cuda")

# Generate a continuation; pad_token_id is set explicitly to the tokenizer's EOS id.
outputs = model.generate(**inputs, max_new_tokens = 256, pad_token_id = tokenizer.eos_token_id)
# Decode without stripping special tokens: get_func_call relies on "<|eot_id|>".
response = tokenizer.batch_decode(outputs)

print(response[0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>


{
    "name": "add",
    "description": "Calculate the sum of two numbers",
    "parameters": {
        "type": "object",
        "properties": {
            "a": {
                "type": "number",
                "description": "The first number to add"
            },
            "b": {
                "type": "integer",
                "description": "The second number to add"
            }
        },
        "required": [
            "a",
            "b"
        ]
    }
}
{
    "name": "mul",
    "description": "Calculate the product of two numbers",
    "parameters": {
        "type": "object",
        "properties": {
            "a": {
                "type": "number",
                "description": "The first number to multiply"
            },
            "b": {
                "type": "integer",
                "description": "The second number to multiply"
            }
        },
        "required": [
            "

In [6]:
# Show only the text generated after the prompt, cut at the first "<|eot_id|>".
print(get_func_call(response[0], prompt))

 {"name": "add", "arguments": '{"a": 36, "b": 3}'} 


In [7]:
from transformers import TextStreamer
# Stream the decoded text to stdout while the model generates.
text_streamer = TextStreamer(tokenizer)
# pad_token_id is passed explicitly so generate() does not have to fall back
# to the EOS id and emit the "Setting `pad_token_id` to `eos_token_id`" warning.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>


{
    "name": "add",
    "description": "Calculate the sum of two numbers",
    "parameters": {
        "type": "object",
        "properties": {
            "a": {
                "type": "number",
                "description": "The first number to add"
            },
            "b": {
                "type": "integer",
                "description": "The second number to add"
            }
        },
        "required": [
            "a",
            "b"
        ]
    }
}
{
    "name": "mul",
    "description": "Calculate the product of two numbers",
    "parameters": {
        "type": "object",
        "properties": {
            "a": {
                "type": "number",
                "description": "The first number to multiply"
            },
            "b": {
                "type": "integer",
                "description": "The second number to multiply"
            }
        },
        "required": [
            "

In [8]:
from unsloth import FastLanguageModel
from pydantic import BaseModel
import torch
# Maximum sequence length requested from the loader (8192 tokens). The loader's
# own log below reports that the base model handles at most 2048 and that RoPE
# scaling is applied to reach this length.
max_seq_length = 4096 * 2
# dtype is left unset — presumably lets Unsloth pick a dtype for the GPU; TODO confirm.
dtype = None

# Load the TinyLlama function-calling LoRA adapter and its tokenizer, without 4-bit loading.
tiny_llama_model, tiny_llama_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unclecode/tinyllama-function-call-lora-adapter-250424",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = False,
)
# Prepare the loaded model for generation (Unsloth inference mode, per the API name).
FastLanguageModel.for_inference(tiny_llama_model)

adapter_config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

Unsloth: unsloth/tinyllama-chat can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 4.0, it can be magically be extended to 8192!


==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/101M [00:00<?, ?B/s]

Unsloth 2024.6 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


In [9]:
# Render the e-mail example prompt and run it through the TinyLlama adapter.
prompt = test_rednder_email_prompt()

# Batch of one prompt, returned as PyTorch tensors and moved to the GPU.
inputs = tiny_llama_tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = tiny_llama_model.generate(
    **inputs,
    max_new_tokens=128,
    pad_token_id=tiny_llama_tokenizer.eos_token_id,
)
# Special tokens are kept in the decoded text.
response = tiny_llama_tokenizer.batch_decode(outputs)

print(response[0])

<s> <|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following functions. Use them if required -
{
    "name": "send_email",
    "description": "Send an email for the given recipient and message",
    "parameters": {
        "type": "object",
        "properties": {
            "recipient": {
                "type": "string",
                "description": "The email address of the recipient"
            },
            "message": {
                "type": "string",
                "description": "The message to send"
            }
        },
        "required": [
            "recipient",
            "message"
        ]
    }
}
<|start_header_id|>user<|end_header_id|>

Hi, send an email to tom@kidocode.com and ask him to join our weekend party?
<|start_header_id|>assistant<|end_header_id|>

<functioncall> 
{
 "name": "send_email",
 "arguments": {
 "recipient": "tom@kido.com",
 "message": "Hi, I would like to invite you to our weekend party. Plea

In [10]:
# Show only the text generated after the prompt, cut at the first "<|eot_id|>" if present.
print(get_func_call(response[0], prompt))


{
 "name": "send_email",
 "arguments": {
 "recipient": "tom@kido.com",
 "message": "Hi, I would like to invite you to our weekend party. Please let me know if you're available."
 }
} 


In [11]:
from transformers import TextStreamer
# Stream the decoded text to stdout while the TinyLlama model generates.
text_streamer = TextStreamer(tiny_llama_tokenizer)
# The return value is discarded; the streamer prints the output as it is produced.
_ = tiny_llama_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)


<s> <|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following functions. Use them if required -
{
    "name": "send_email",
    "description": "Send an email for the given recipient and message",
    "parameters": {
        "type": "object",
        "properties": {
            "recipient": {
                "type": "string",
                "description": "The email address of the recipient"
            },
            "message": {
                "type": "string",
                "description": "The message to send"
            }
        },
        "required": [
            "recipient",
            "message"
        ]
    }
}
<|start_header_id|>user<|end_header_id|>

Hi, send an email to tom@kidocode.com and ask him to join our weekend party?
<|start_header_id|>assistant<|end_header_id|>

<functioncall> 
{
"name": "send_email",
"arguments": {
"recipient": "tom@kido.com",
"message": "Hi, I would like to invite you to our weekend party. Please l