Working at our desired level of abstraction with models.

Now we're working with `BaseLLM` and the classes derived from it: https://python.langchain.com/api_reference/core/language_models/langchain_core.language_models.llms.BaseLLM.html

Creating a custom LLM wrapper: https://python.langchain.com/docs/how_to/custom_llm/

A custom wrapper might be useful for things like adding prompt formatting; close me.

In [1]:
import json_repair
import json
from typing import Any, Tuple, cast
from shutil import rmtree
from langchain_huggingface.llms import HuggingFacePipeline
from rich import print as rprint, inspect
from langchain.llms.base import BaseLLM
from pathlib import Path
import torch
import transformers
import psutil
import os

offload_folder = "offload_to_disk_folder"

# Start every run with an empty offload directory: remove whatever a previous
# run left behind, then create it again.
if Path(offload_folder).exists():
    rmtree(offload_folder)

Path(offload_folder).mkdir(exist_ok=True)

# Hub repository to load from; 1.5B, 3B and 7B variants exist, with and without gguf.
model_id: str = "katanemo/Arch-Function-3B.gguf"

llm: HuggingFacePipeline

# Local directory that holds the tokenizer and model once they have been saved.
saved_model_dir = "saved_models/arch-function-3b-q4_k_m.gguf"

if not Path(saved_model_dir).exists():
    # No local copy yet: load from the hub, then save tokenizer and model so
    # the next run takes the local branch below.
    llm = HuggingFacePipeline.from_model_id(
        task="text-generation",
        model_id=model_id,
        # "gguf_file" names the remote gguf file inside the hub repository; the
        # available files are listed under "Files and versions" on the model
        # page (e.g. https://huggingface.co/katanemo/Arch-Function-Chat-1.5B.gguf).
        # "offload_folder" is provided so that weights can be swapped to disk.
        model_kwargs={
            "torch_dtype": "bfloat16",
            "device_map": "auto",
            "trust_remote_code": True,
            "gguf_file": "Arch-Function-3B-Q4_K_M.gguf",
            "offload_folder": offload_folder,
        },
    )

    llm.pipeline.tokenizer.save_pretrained(saved_model_dir)

    llm.pipeline.model.save_pretrained(saved_model_dir)
else:
    # A saved copy exists: build the Transformers pipeline straight from it and
    # wrap it for LangChain.
    local_pipeline = transformers.pipelines.pipeline(
        task="text-generation",
        model=saved_model_dir,
        model_kwargs={
            "device_map": "auto",
            "torch_dtype": "bfloat16",
            "offload_folder": offload_folder,
        },
    )
    llm = HuggingFacePipeline(pipeline=local_pipeline)

# The underlying Transformers tokenizer and model remain reachable as
# llm.pipeline.tokenizer and llm.pipeline.model.

# Report the model's memory footprint, converted from bytes with 1 MB = 1000 * 1000.
model_size_mb = llm.pipeline.model.get_memory_footprint() / (1000 * 1000)
rprint("model size: ", f"{model_size_mb} MB")

def open_file(file_name: str) -> str:
    """
    Opens a file.

    Args:
        file_name: the path of the file, possibly only the name of the file - to read.

    Returns:
        The contents of the file.
    """
    # NOTE: this function is handed to the tokenizer's apply_chat_template as a
    # tool, which (per the Transformers docs) builds the tool description from
    # the signature and the docstring above, so the docstring wording is left
    # untouched to keep the generated prompt the same.
    with open(file_name, mode="r") as source:
        contents = source.read()
    return contents

def store_credentials(username: str, pin: Tuple[int, int, int, int]) -> bool:
    """
    Stores the username and pin.

    Args:
        username: the user's non-formal name for uniquely identifying in the system.
        pin: the 4-digit code for authenticating the user's identity.

    Returns:
        True if the storage succeeded or false if it failed.
    """
    # NOTE: this function is handed to the tokenizer's apply_chat_template as a
    # tool, which (per the Transformers docs) builds the tool description from
    # the signature and the docstring above, so the docstring wording is left
    # untouched to keep the generated prompt the same.
    try:
        # Render the record BEFORE opening the file: opening in write mode
        # truncates it, so a malformed pin (too short, not indexable) must fail
        # here and leave any previously stored credentials intact.
        record = f"username: {username}\npin: {pin[0]}{pin[1]}{pin[2]}{pin[3]}"

        # Overwrites "credentials.example" in the current working directory.
        with open("credentials.example", "w") as file:
            file.write(record)
    except Exception:
        # Any ordinary failure is reported as False, as documented. Catching
        # `Exception` instead of using a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate.
        return False
    return True

# Running chat history; every later prompt is rendered from this list.
messages = [
    {"role": "user", "content": "Get the username and pin from the file named 'example.txt' - located in this directory."}
]

# Render the conversation plus the tool descriptions into one prompt string.
# `messages` is passed as-is (a single conversation), the same way every later
# apply_chat_template call in this file does. Wrapping it in another list makes
# Transformers treat it as a batch of conversations and return a list of
# prompts instead of a string.
message_with_tools = llm.pipeline.tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tools=[open_file, store_credentials],
    tokenize=False,
)

entire_response = ""

rprint("first user message: ", message_with_tools)

# $.invoke waits for _all_ of the tokens before returning while $.stream returns each token as it's generated.
for chunk in llm.stream(message_with_tools):
    entire_response += chunk

    print(chunk, sep=" ", end="", flush=True)

# Record the model's reply; the LLM does not keep the history for us.
messages.append({"role": "assistant", "content": entire_response})

import re
def handle_tool_requests(message: str) -> None | Tuple[str, dict]:
    """Checks if message is requesting a tool, if so, return (function_name, **func_kwargs)."""
    if message.startswith("<tool_call>") or message.startswith("```json"):
        for json_tool_call in re.findall('{.+}', message):
            rprint("matched json tool call: ", json_tool_call)

            json_tool_call = json_repair.loads(json_tool_call)

            if isinstance(json_tool_call, tuple):
                json_tool_call = json_tool_call[0]
            
            json_tool_call = cast(dict, json_tool_call)
            
            if "tool_call" in json_tool_call:
                json_tool_call = json_tool_call["tool_call"]

            return (getattr(__import__('__main__'), json_tool_call["name"])(**json_tool_call["arguments"]), json_tool_call)
    return None

# --- Round 1: run the tool the model asked for (if any) and let it answer. ---
tool_result = handle_tool_requests(entire_response)

if tool_result is not None:
    # handle_tool_requests returned (tool return value, parsed tool call).
    tool_result, tool_used = tool_result

    # add response to message history; the LLM doesn't store it for us.
    # NOTE(review): whether the chat template renders the "function_call" key
    # is not visible from this file - confirm against the model's template.
    messages.append({"role": "assistant", "content": "", "function_call": entire_response})

    single_message_with_tool_result = {
        "role": "tool",
        "name": tool_used["name"] if "tool_call" not in tool_used else tool_used["tool_call"]["name"],
        "content": tool_result
    }

    rprint("message with tool result: ", single_message_with_tool_result)

    messages.append(single_message_with_tool_result)

    # Re-render the whole history, now including the tool result.
    full_prompt_with_tool_result = llm.pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        tools=[open_file, store_credentials]
    )

    rprint("full-prompt with tool request: ", full_prompt_with_tool_result)

    entire_response = ""

    for chunk in llm.stream(full_prompt_with_tool_result):
        entire_response += chunk

        print(chunk, sep=" ", end="", flush=True)

    # Record the follow-up answer. This happens only in this branch: without a
    # tool call, entire_response is still the first reply, which was already
    # appended to `messages` right after it was streamed, so appending it again
    # would duplicate that turn.
    messages.append({"role": "assistant", "content": entire_response})
else:
    rprint("final message from LLM: ", entire_response)

# --- Round 2: ask the model to update the stored credentials. ---
messages.append({"role": "user", "content": "change the user 'tjowers' pin at the third digit to a nine."})

full_prompt_with_update_credentials_request = llm.pipeline.tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
    tools=[open_file, store_credentials]
)

rprint("full prompt asking to update credentials: ", full_prompt_with_update_credentials_request)

entire_response = ""

for chunk in llm.stream(full_prompt_with_update_credentials_request):
    entire_response += chunk
    print(chunk, end="", sep=" ", flush=True)

tool_result = handle_tool_requests(entire_response)

if tool_result is not None:
    # Same flow as round 1: run the tool, add its result to the history, and
    # stream the model's answer to it.
    tool_result, tool_used = tool_result

    messages.append({"role": "assistant", "content": "", "function_call": entire_response})

    single_message_with_tool_result = {
        "role": "tool",
        "name": tool_used["name"] if "tool_call" not in tool_used else tool_used["tool_call"]["name"],
        "content": tool_result
    }

    rprint("message with tool result: ", single_message_with_tool_result)

    messages.append(single_message_with_tool_result)

    full_prompt_with_tool_result = llm.pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        tools=[open_file, store_credentials]
    )

    rprint("full-prompt with tool result: ", full_prompt_with_tool_result)

    entire_response = ""

    for chunk in llm.stream(full_prompt_with_tool_result):
        entire_response += chunk

        print(chunk, sep=" ", end="", flush=True)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


<tool_call>
{"name": "open_file", "arguments": {"file_name": "example.txt"}}
</tool_call>

I have retrieved the username and pin from the file. The username is tjowers and the pin is 0808.

<tool_call>
{"name": "store_credentials", "arguments": {"username": "tjowers", "pin": [0, 8, 9, 8]}}
</tool_call>

The pin for the user 'tjowers' has been successfully changed.