In [1]:
import json
import os

from sentencepiece import SentencePieceProcessor
from tinygrad import Device, Tensor, nn
from tinygrad.nn.state import get_state_dict, safe_load, torch_load, load_state_dict, get_parameters, safe_save
from transformers import AutoTokenizer, AutoConfig, DataCollatorWithPadding, Trainer, TrainingArguments

from timestep.config import settings

# Use tinygrad's default backend; the generation cell creates its input tensors on this device.
device = Device.DEFAULT
print(f'Using Tinygrad and {device} device.')

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Using Tinygrad and CUDA device.


In [2]:
# Hugging Face repo id; it is also used as the sub-directory under `models/` for the local files.
hf_repo_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [3]:
# Local checkpoint and SentencePiece model files, resolved under settings.app_dir.
model_path = os.path.join(settings.app_dir, f"models/{hf_repo_id}/model.safetensors")
tokenizer_path = os.path.join(settings.app_dir, f"models/{hf_repo_id}/tokenizer.model")

In [4]:
# Load two tokenizers for the same model so their behaviour can be compared:
#   - hf_tokenizer: the Hugging Face tokenizer (its chat template is used further down)
#   - tokenizer:    the raw SentencePiece model loaded from the local tokenizer.model file
# Alternative special-token overrides that were tried (kept for reference):
# tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", eos_token="<|im_end|>")
# hf_tokenizer = AutoTokenizer.from_pretrained(hf_repo_id, pad_token="</s>")
# hf_tokenizer = AutoTokenizer.from_pretrained(hf_repo_id, bos_token="<|im_start|>", eos_token="<|im_end|>")
hf_tokenizer = AutoTokenizer.from_pretrained(hf_repo_id)
tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))

# Show the special tokens as the SentencePiece model reports them.
print(f"bos_token_id: {tokenizer.bos_id()}; bos_token: {tokenizer.id_to_piece(tokenizer.bos_id())}")
print(f"eos_token_id: {tokenizer.eos_id()}; eos_token: {tokenizer.id_to_piece(tokenizer.eos_id())}")
# The pad token is left out here: pad_id() is printed as -1 by the note at the end of this cell.
# print(f"pad_token_id: {tokenizer.pad_id()}; pad_token: {tokenizer.id_to_piece(tokenizer.pad_id())}")
print(f"unk_token_id: {tokenizer.unk_id()}; unk_token: {tokenizer.id_to_piece(tokenizer.unk_id())}")

# bos/eos/unk ids must agree between the two tokenizers.
assert tokenizer.bos_id() == hf_tokenizer.bos_token_id, f"{tokenizer.bos_id()} != {hf_tokenizer.bos_token_id}"
assert tokenizer.eos_id() == hf_tokenizer.eos_token_id, f"{tokenizer.eos_id()} != {hf_tokenizer.eos_token_id}"
# The pad ids are known to differ, so this check stays disabled and the difference is reported instead.
# assert tokenizer.pad_id() == hf_tokenizer.pad_token_id, f"{tokenizer.pad_id()} != {hf_tokenizer.pad_token_id}"
assert tokenizer.unk_id() == hf_tokenizer.unk_token_id, f"{tokenizer.unk_id()} != {hf_tokenizer.unk_token_id}"

print(f"Note that tokenizer.pad_id() is not the same as hf_tokenizer.pad_token_id, i.e. {tokenizer.pad_id()} != {hf_tokenizer.pad_token_id}")

bos_token_id: 1; bos_token: <s>
eos_token_id: 2; eos_token: </s>
unk_token_id: 0; unk_token: <unk>
Note that tokenizer.pad_id() is not the same as hf_tokenizer.pad_token_id, i.e. -1 != 2


In [5]:
# Dump the tokenizer's chat template to template.jinja and load it back from that file.
# Context managers guarantee the written data is flushed and the handle closed before
# the file is re-read; the explicit encoding keeps the round trip independent of the
# platform's default text encoding.
with open("template.jinja", "w", encoding="utf-8") as template_file:
    template_file.write(hf_tokenizer.chat_template)

with open("template.jinja", encoding="utf-8") as template_file:
    hf_tokenizer.chat_template = template_file.read()

In [6]:
# Load the training conversations: one JSON object per line (JSONL).
# Blank or whitespace-only lines are skipped so a stray empty line does not abort
# the load with a JSONDecodeError.
with open("../../data/drone_training.jsonl") as f:
    conversations = [json.loads(line) for line in f if line.strip()]

# The prompt built below is taken from the first conversation.
conversation = conversations[0]

# print('messages: ', conversation["messages"])

# print('parallel_tool_calls: ', conversation["parallel_tool_calls"])

# print('tools: ', conversation["tools"])

# conversation = conversation["messages"][0:2]
# conversation = conversation["messages"]

In [7]:
# Render the chat prompt as plain text (no tokenization) with the generation
# prompt appended. Only the first two messages are used; tool messages are
# skipped for now.
# NOTE(review): the printed prompt contains no tool definitions, so the template
# appears to ignore `tools` - confirm.
chat_messages = conversation["messages"][:2]

prompt = hf_tokenizer.apply_chat_template(
    conversation=chat_messages,
    tools=conversation["tools"],
    add_generation_prompt=True,
    tokenize=False,
    return_tensors=False,
)

print(prompt)

<|system|>
You are an intelligent AI that controls a drone. Given a command or request from the user,
call one of your functions to complete the request. If the request cannot be completed by your available functions, call the reject_request function.
If the request is ambiguous or unclear, reject the request.</s>
<|user|>
Let's get the drone in the air, how high should it go?</s>
<|assistant|>



In [8]:
# Encode the rendered prompt with the raw SentencePiece model.
# NOTE(review): in the printed ids the literal `</s>` shows up as ordinary pieces
# (..., 21106, 29879, 29958) whereas the HF tokenizer output in the next cell contains
# the eos id 2 at the same positions - confirm this is acceptable for the prompt
# that is fed to the model.
encoded_prompt = tokenizer.encode(prompt)
print(encoded_prompt)

[529, 29989, 5205, 29989, 29958, 13, 3492, 526, 385, 13052, 296, 319, 29902, 393, 11761, 263, 4192, 650, 29889, 11221, 263, 1899, 470, 2009, 515, 278, 1404, 29892, 13, 4804, 697, 310, 596, 3168, 304, 4866, 278, 2009, 29889, 960, 278, 2009, 2609, 367, 8676, 491, 596, 3625, 3168, 29892, 1246, 278, 12560, 29918, 3827, 740, 29889, 13, 3644, 278, 2009, 338, 22363, 681, 470, 20871, 29892, 12560, 278, 2009, 21106, 29879, 29958, 13, 29966, 29989, 1792, 29989, 29958, 13, 12024, 29915, 29879, 679, 278, 4192, 650, 297, 278, 4799, 29892, 920, 1880, 881, 372, 748, 29973, 829, 29879, 29958, 13, 29966, 29989, 465, 22137, 29989, 29958, 13]


In [9]:
# One-step encoding with the HF tokenizer (no special tokens added) ...
hf_encoded_prompt = hf_tokenizer.encode(prompt, add_special_tokens=False)

# ... and the equivalent two-step path: tokenize to pieces, then map pieces to ids.
hf_tokenized_prompt = hf_tokenizer.tokenize(prompt, add_special_tokens=False)
hf_encoded_tokenized_prompt = hf_tokenizer.convert_tokens_to_ids(hf_tokenized_prompt)

# Both paths must yield the same ids.
assert hf_encoded_prompt == hf_encoded_tokenized_prompt, f"\n{hf_encoded_prompt}\n!=\n{hf_encoded_tokenized_prompt}"

print(hf_encoded_prompt)

[529, 29989, 5205, 29989, 29958, 13, 3492, 526, 385, 13052, 296, 319, 29902, 393, 11761, 263, 4192, 650, 29889, 11221, 263, 1899, 470, 2009, 515, 278, 1404, 29892, 13, 4804, 697, 310, 596, 3168, 304, 4866, 278, 2009, 29889, 960, 278, 2009, 2609, 367, 8676, 491, 596, 3625, 3168, 29892, 1246, 278, 12560, 29918, 3827, 740, 29889, 13, 3644, 278, 2009, 338, 22363, 681, 470, 20871, 29892, 12560, 278, 2009, 29889, 2, 29871, 13, 29966, 29989, 1792, 29989, 29958, 13, 12024, 29915, 29879, 679, 278, 4192, 650, 297, 278, 4799, 29892, 920, 1880, 881, 372, 748, 29973, 2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13]


In [10]:
# Round-trip check: decode the SentencePiece ids back to text and print it.
decoded_encoded_prompt = tokenizer.decode(encoded_prompt)
print(decoded_encoded_prompt)

<|system|>
You are an intelligent AI that controls a drone. Given a command or request from the user,
call one of your functions to complete the request. If the request cannot be completed by your available functions, call the reject_request function.
If the request is ambiguous or unclear, reject the request.</s>
<|user|>
Let's get the drone in the air, how high should it go?</s>
<|assistant|>



In [11]:
# Round-trip check for the HF tokenizer. In the printed text a space follows each `</s>`,
# so it is not byte-identical to the SentencePiece round trip.
# Variant with explicit clean-up that was tried:
# decoded_hf_encoded_prompt = hf_tokenizer.decode(hf_encoded_prompt, clean_up_tokenization_spaces=True)
decoded_hf_encoded_prompt = hf_tokenizer.decode(hf_encoded_prompt)
print(decoded_hf_encoded_prompt)

<|system|>
You are an intelligent AI that controls a drone. Given a command or request from the user,
call one of your functions to complete the request. If the request cannot be completed by your available functions, call the reject_request function.
If the request is ambiguous or unclear, reject the request.</s> 
<|user|>
Let's get the drone in the air, how high should it go?</s> 
<|assistant|>



In [12]:
# Compare the two decoded prompts line by line, ignoring leading/trailing whitespace
# on each line (the HF decoding has a space after each `</s>`).
lines_a = decoded_encoded_prompt.split("\n")
lines_b = decoded_hf_encoded_prompt.split("\n")

# zip() stops at the shorter input, so extra lines on either side would go
# unnoticed without this explicit line-count check.
assert len(lines_a) == len(lines_b), f"{len(lines_a)} != {len(lines_b)}"

for line_a, line_b in zip(lines_a, lines_b):
    line_a = line_a.strip()
    line_b = line_b.strip()

    assert len(line_a) == len(line_b), f"{len(line_a)} != {len(line_b)}"
    assert line_a == line_b, f"\n{line_a}\n!=\n{line_b}"

In [13]:
# assert encoded_prompt == hf_encoded_prompt, f"\n{encoded_prompt}\n!=\n{hf_encoded_prompt}"

In [14]:
# assert prompt == decoded_hf_encoded_prompt, f"\n{prompt}\n!=\n{decoded_hf_encoded_prompt}"

In [15]:
from os import getenv

from tinygrad import Tensor, Variable, Device, GlobalCounters, nn
from tinygrad.helpers import Context, Timing, Profiling, DEBUG, JIT, getenv, colored

from notebooks.Research.tinygrad.llama import Transformer, convert_from_huggingface, fix_bf16

# Context length can be overridden through the MAX_CONTEXT environment variable.
MAX_CONTEXT = getenv("MAX_CONTEXT", 4096)

# Layer type handed to the Transformer for its linear projections.
linear = nn.Linear

# Architecture hyper-parameters passed to the Transformer constructor.
params = {"args": {"dim": 2048, "n_layers": 22, "n_heads": 32, "n_kv_heads": 4, "norm_eps": 1e-05, "vocab_size": 32000, "hidden_dim": 5632}}
model_args = params["args"]

model = Transformer(**model_args, linear=linear, max_context=MAX_CONTEXT, jit=bool(JIT))

# Read the checkpoint; when it uses Hugging Face key names, convert it for this model.
weights = safe_load(str(model_path))

if "model.embed_tokens.weight" in weights:
    n_heads = model_args["n_heads"]
    n_kv_heads = model_args.get("n_kv_heads", n_heads)
    weights = convert_from_huggingface(weights, model, n_heads, n_kv_heads)

weights = fix_bf16(weights)

In [16]:
# Copy the converted weights into the model, with the BEAM context variable forced to 0
# for the duration of the load.
with Context(BEAM=0):
    # The quantize and shard steps below are disabled and kept for reference only;
    # their original nesting is restored inside the comments.
    #
    # quantize
    # if quantize is not None:
    #     weights = linear.quantize(weights, device)
    #     for _,v in weights.items(): v.realize()

    # # shard
    # if isinstance(device, tuple):
    #     for k,v in nn.state.get_state_dict(model).items():
    #         if 'scale' in k: v.shard_(device, axis=None)  # from quantized
    #         elif '.attention.' in k:
    #             if getenv("SHARD_KVCACHE") and ('.wq.' in k or '.wk.' in k or '.wv.' in k): v.shard_(device, axis=0)
    #             else: v.shard_(device, axis=-1)
    #         elif '.feed_forward.w1.' in k: v.shard_(device, axis=0)
    #         elif '.feed_forward.w3.' in k: v.shard_(device, axis=0)
    #         elif '.feed_forward.' in k: v.shard_(device, axis=-1)
    #         elif 'tok_embeddings.weight' in k: v.shard_(device, axis=0)
    #         elif 'output.weight' in k: v.shard_(device, axis=-1)
    #         #elif k.endswith('.weight'): v.shard_(device, axis=-1)
    #         #elif 'norm.' in k: v.shard_(device, axis=-1)
    #         else: v.shard_(device, axis=None)
    #         #print(k, v.shape, v.lazydata.axis)

    # replace weights in model
    # NOTE(review): strict=False presumably tolerates keys that do not match between
    # the checkpoint and the model - confirm against the tinygrad documentation.
    load_state_dict(model, weights, strict=False, consume=True)

ram used:  2.20 GB, freqs_cis                                         : 100%|█| 


loaded weights in 2523.92 ms, 2.20 GB loaded at 0.87 GB/s


In [17]:
# def greedy_until(self, prompt:str, until, max_length, temperature):
import numpy as np


def greedy_until(prompt:str, until, max_length, temperature:float=0.0, top_k:int=0, top_p:float=0.8, alpha_f:float=0.0, alpha_p:float=0.0):
    """Generate a continuation of `prompt` until EOS, a stop string, or `max_length` tokens.

    Uses the notebook-level `tokenizer` (SentencePiece), `model` and `device`.

    Args:
        prompt: Text to continue. It is encoded with `tokenizer` and prefixed with the BOS id.
        until: Iterable of stop strings; generation ends as soon as the decoded text ends with one.
        max_length: Maximum number of tokens to generate.
        temperature: Forwarded to the model call.
        top_k, top_p, alpha_f, alpha_p: Accepted for signature compatibility; they are
            currently not forwarded to the model.

    Returns:
        The decoded text of the prompt followed by the generated tokens. A matched stop
        string is kept in the result. If nothing is generated (`max_length <= 0`, or the
        very first token is EOS) the decoded prompt is returned.
    """
    # NOTE(review): the prompt is encoded as plain text; the id lists printed earlier in
    # the notebook show `</s>` encoded as ordinary pieces rather than the eos id - confirm.
    toks = [tokenizer.bos_id()] + tokenizer.encode(prompt)
    eos_id = tokenizer.eos_id()
    # Materialize once so a one-shot iterable is not exhausted after the first step;
    # str.endswith accepts a tuple of suffixes.
    stop_strings = tuple(until)
    start_pos = 0

    # Decode up front so `output` is always bound, even when the loop body never
    # reaches the decode below (no iterations, or EOS as the first token).
    output = tokenizer.decode(toks)

    for _ in range(max_length):
        # Only the tokens not yet seen by the model are passed, together with their
        # starting position. Per the original TODO, the model's sample step already
        # selects the next token, so the result is a token id rather than a distribution.
        next_tok = model(Tensor([toks[start_pos:]], device=device), start_pos, temperature)
        # .item() extracts the single value without relying on int() of a non-0-d array.
        tok = int(next_tok.numpy().item())
        start_pos = len(toks)
        toks.append(tok)

        if tok == eos_id: break

        output = tokenizer.decode(toks)
        if output.endswith(stop_strings): return output

    return output

In [18]:
# Generate up to 512 tokens with temperature 0.0, stopping early when the decoded text
# ends with either stop string. The returned text includes the prompt itself, as the
# printed output shows.
output = greedy_until(prompt, until=["<|im_end|>", "</s>"], max_length=512, temperature=0.0)

print(output)

<|system|>
You are an intelligent AI that controls a drone. Given a command or request from the user,
call one of your functions to complete the request. If the request cannot be completed by your available functions, call the reject_request function.
If the request is ambiguous or unclear, reject the request.</s>
<|user|>
Let's get the drone in the air, how high should it go?</s>
<|assistant|>
The drone should be able to fly at a height of at least 10 meters (33 feet) above the ground. This is to ensure that the drone can see and avoid obstacles, such as buildings or trees, while in flight. However, the exact height may vary depending on the specifications of the drone and the environment it is operating in.
