# Tinygrad TinyLlama-1.1B-Chat-v1.0

This example was copied and modified from:
- https://github.com/tinygrad/tinygrad/blob/66d0b14a205c33e8a6ae4e9a765f6351b34c2923/examples/llama.py
- https://github.com/tinygrad/tinygrad/blob/66d0b14a205c33e8a6ae4e9a765f6351b34c2923/extra/models/llama.py
- https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0

In [1]:
# %pip install -q sentencepiece

In [2]:
from datetime import datetime
import json
from pathlib import Path
import os
from os import getenv

from tqdm import tqdm
import mlflow
import openai
import pandas as pd
# from sentencepiece import SentencePieceProcessor
from tinygrad import Device, Tensor, nn, Variable, GlobalCounters
from tinygrad.helpers import Context, Timing, Profiling, DEBUG, JIT, getenv, colored
from tinygrad.nn.state import get_state_dict, safe_load, torch_load, load_state_dict, get_parameters, safe_save
from transformers import AutoTokenizer, AutoConfig, DataCollatorWithPadding, pipeline, Trainer, TrainingArguments

# from notebooks.Research.lib.transformers.src.transformers.models.llama.modeling_tinygrad_llama import TinygradLlamaForCausalLM
# from notebooks.Research.tinygrad.train_llama import function_calling_template
from timestep.config import settings

from src.transformers.models.llama.modeling_tinygrad_llama import TinygradLlamaForCausalLM

# Use tinygrad's default device; it is handed to the model constructor and to the
# input tensors built in later cells.
device = Device.DEFAULT
print(f'Using Tinygrad and {device} device.')

# Show the resolved application settings as this cell's output.
# NOTE(review): the dump is stored in the saved notebook output (local paths,
# endpoints) — consider clearing the output before sharing.
settings.model_dump()

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Using Tinygrad and CUDA device.


{'app_dir': '/home/mjschock/.config/timestep',
 'bearerinfo_func': 'timestep.api.decode_token',
 'default_hf_repo_id': 'Mozilla/TinyLlama-1.1B-Chat-v1.0-llamafile',
 'default_llamafile_host': '0.0.0.0',
 'default_llamafile_port': 8080,
 'default_model_filename': 'TinyLlama-1.1B-Chat-v1.0.F16.llamafile',
 'default_multimodal_model_projector_filename': None,
 'openai_api_key': SecretStr('**********'),
 'openai_base_url': 'http://localhost:8000/api/openai/v1',
 'openai_org_id': 'organization_id',
 'openai_project_id': 'project_id',
 'poetry_repositories_testpypi_url': 'https://test.pypi.org/legacy/',
 'poetry_virtualenvs_in_project': True,
 'poetry_virtualenvs_prefer_active_python': True,
 'prefect_api_url': 'http://127.0.0.1:4200/api',
 'prefect_logging_level': 'INFO',
 'prefect_logging_log_prints': True,
 'pyenv_version': '3.10.14',
 'verbose': True}

In [3]:
# Hugging Face repo id of the checkpoint; used below to load the config and the
# tokenizer and to build the local path of the weights file.
hf_repo_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [4]:
# Load the model configuration for the repo and display it as the cell output.
# Disabled variant that overrides the BOS/EOS token ids explicitly:
# config = AutoConfig.from_pretrained(hf_repo_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
config = AutoConfig.from_pretrained(hf_repo_id)
config

LlamaConfig {
  "_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [5]:
# The weights are read from a local safetensors file under the app directory:
#   <settings.app_dir>/models/<hf_repo_id>/model.safetensors
weights_relpath = f"models/{hf_repo_id}/model.safetensors"
model_path: Path = Path(settings.app_dir) / weights_relpath

# TODO: TinygradLlamaForCausalLM.from_pretrained(hf_repo_id) throws a PyTorch error,
# so the model is constructed directly from the config and the weights path instead.
model = TinygradLlamaForCausalLM(config=config, device=device, model_path=model_path)

ram used:  2.20 GB, freqs_cis                                         : 100%|█| 


loaded weights in 2251.91 ms, 2.20 GB loaded at 0.98 GB/s


In [6]:
# TODO: make custom tokenizer that converts sentencepiece to tiktoken so it runs in Termux
# Overrides currently left disabled: bos_token=bos_token,
# clean_up_tokenization_spaces=True, eos_token=eos_token
tokenizer = AutoTokenizer.from_pretrained(hf_repo_id)

# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# The tokenizer and the model config must agree on the BOS and EOS token ids.
for token_attr in ("bos_token_id", "eos_token_id"):
    tokenizer_id = getattr(tokenizer, token_attr)
    config_id = getattr(model.config, token_attr)
    assert tokenizer_id == config_id, f"{tokenizer_id} != {config_id}"

In [8]:
# Load the drone training conversations: one JSON document per line.
# Blank lines are skipped so they cannot crash json.loads, and the encoding is
# pinned so the read does not depend on the platform's default.
with open("../../data/drone_training.jsonl", encoding="utf-8") as f:
    conversations = [json.loads(line) for line in f if line.strip()]

# Fail with a clear message rather than an IndexError when the file has no records.
if not conversations:
    raise ValueError("no conversations found in ../../data/drone_training.jsonl")

# The first conversation is the running example for the rest of the notebook;
# its "tools" entry is used when rendering the prompt.
conversation = conversations[0]

conversation["messages"]

[{'role': 'system',
  'content': 'You are an intelligent AI that controls a drone. Given a command or request from the user,\ncall one of your functions to complete the request. If the request cannot be completed by your available functions, call the reject_request function.\nIf the request is ambiguous or unclear, reject the request.'},
 {'role': 'user',
  'content': "Let's get the drone in the air, how high should it go?"},
 {'role': 'assistant',
  'tool_calls': [{'id': 'call_id',
    'type': 'function',
    'function': {'name': 'takeoff_drone', 'arguments': '{"altitude": 100}'}}]}]

In [9]:
# template = """{%- for message in messages %}
# {%- if message['role'] == 'assistant' %}
# {{- '<|assistant|>' + '\n' }}
# {%- if message['content'] %}
# {{- message['content'] + eos_token + '\n' }}
# {%- elif message['tool_calls'] %}
# {{- 'tool_calls:' + '\n' }}
# {%- for tool_call in message['tool_calls'] %}
# {{- tool_call['function']['name'] + ': ' + tool_call['function']['arguments'] }}
# {%- endfor %}
# {{- eos_token + '\n' }}
# {%- endif %}
# {%- elif message['role'] == 'system' %}
# {{- '<|system|>' + '\n' }}
# {{- message['content'] }}
# {%- if tools %}
# {{- '\n\nYou are aware of the following tools:\n' }}
# {%- for tool in tools %}
# {{- tool.function.name }}: {{ tool.function.parameters | tojson }}
# {%- endfor %}
# {{- '\nYou can suggest tool calls by responding in the following format:\n' }}
# {{- 'tool_calls:' + '\n' }}
# {{- 'tool_name: {\"arg1\": \"value1\", \"arg2\": \"value2\"}' }}
# {{- eos_token + '\n' }}
# {%- endif %}
# {%- elif message['role'] == 'user' %}
# {{- '<|user|>' + '\n' }}
# {{- message['content'] + eos_token + '\n' }}
# {%- endif %}
# {%- if loop.last and add_generation_prompt %}
# {{- '<|assistant|>' + '\n' }}
# {%- endif %}
# {%- endfor %}"""

# open("template.jinja", "w").write(template)
# Round-trip the tokenizer's chat template through template.jinja. The context
# managers close (and therefore flush) the file before it is read back, and the
# encoding is pinned so the write and the read always agree.
# Alternatives that could be written instead of the tokenizer's own template:
# `template` (commented out above), `function_calling_template`,
# `llama_3_1_8b_instruct_chat_template`.
with open("template.jinja", "w", encoding="utf-8") as template_file:
    template_file.write(tokenizer.chat_template)

with open("template.jinja", encoding="utf-8") as template_file:
    tokenizer.chat_template = template_file.read()

In [10]:
def generate_prompt(add_generation_prompt=False, messages=[], tool_calls=True, tools=[]):
    """Render chat messages into prompt text with the tokenizer's chat template.

    Args:
        add_generation_prompt: Forwarded to ``apply_chat_template``.
        messages: The conversation to render; forwarded as ``conversation``.
        tool_calls: Forwarded to ``apply_chat_template`` as an extra keyword.
        tools: Tool definitions forwarded to ``apply_chat_template``.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns when called with
        ``tokenize=False`` and ``return_tensors=False``.
    """
    template_kwargs = dict(
        conversation=messages,
        tools=tools,
        tool_calls=tool_calls,
        add_generation_prompt=add_generation_prompt,
        # Tokenization is always disabled here.
        tokenize=False,
        return_tensors=False,
    )
    rendered = tokenizer.apply_chat_template(**template_kwargs)
    return rendered

# Render only the first two messages (the system and user turns) and leave an
# open assistant turn for the model to complete.
prompt = generate_prompt(
    messages=conversation["messages"][:2],
    tools=conversation["tools"],
    add_generation_prompt=True,
)

print(prompt)

<|system|>
You are an intelligent AI that controls a drone. Given a command or request from the user,
call one of your functions to complete the request. If the request cannot be completed by your available functions, call the reject_request function.
If the request is ambiguous or unclear, reject the request.</s>
<|user|>
Let's get the drone in the air, how high should it go?</s>
<|assistant|>



In [11]:
def predict(prompt: str, max_new_tokens: int = 256, temperature: float = 0.0, do_sample: bool = True) -> str:
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        prompt: Fully rendered prompt text. It is tokenized with
            ``add_special_tokens=False``, so it must already contain any
            special tokens it needs.
        max_new_tokens: Forwarded to ``model.generate`` (default 256).
        temperature: Forwarded to ``model.generate`` (default 0.0).
        do_sample: Forwarded to ``model.generate`` (default True).

    Returns:
        The decoded text of the tokens that follow the prompt. Special tokens
        are not stripped from the decoded text.
    """
    encoded = tokenizer(prompt, return_tensors="np", add_special_tokens=False)
    # Wrap every tokenizer output in a tinygrad Tensor on the target device.
    model_inputs = {name: Tensor(array, device=device) for name, array in encoded.items()}

    outputs = model.generate(
        **model_inputs,
        do_sample=do_sample,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )

    # Drop the first prompt_length tokens so only the continuation is decoded
    # (assumes outputs[0] starts with the prompt tokens — TODO confirm).
    prompt_length = model_inputs["input_ids"].size(1)
    # Pass skip_special_tokens=True to decode() to strip markers such as </s>.
    return tokenizer.decode(outputs[0][prompt_length:])

# Time one end-to-end generation for the example prompt, then show the completion.
start = datetime.now()
prediction = predict(prompt)
elapsed = datetime.now() - start

print(f"Elapsed: {elapsed}")

print(prediction)

Warmup: 0:00:01.548814
Elapsed: 0:00:32.776468
Elapsed: 0:00:34.327884
The drone should be able to fly at a height of at least 10 meters (33 feet) above the ground. This is the minimum height required for safe and effective flight. If the drone is flying too low or too high, it may cause damage to the aircraft or the surrounding environment.</s>
