In [1]:
!git clone https://github.com/ShaneKoNaung/llama2-train-from-scratch.git


Cloning into 'llama2-train-from-scratch'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 16 (delta 2), reused 13 (delta 2), pack-reused 0[K
Receiving objects: 100% (16/16), 655.72 KiB | 23.42 MiB/s, done.
Resolving deltas: 100% (2/2), done.
/content/llama2-train-from-scratch


In [1]:
%cd llama2-train-from-scratch

/content/llama2-train-from-scratch


In [2]:
import os

## Download dataset

Download `input.txt` from [this link](https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt) and place it in the `data/tinyshakespeare` directory.

In [3]:
# Root directory for cached datasets, and the sub-directory that holds the
# tiny-shakespeare files (input.txt and, later, the tokenized data.bin).
DATA_CACHE_DIR = "data"
data_dir = os.path.join(DATA_CACHE_DIR, 'tinyshakespeare')

In [4]:
# Create the dataset directories up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

In [5]:
def prepare():
    """Load the tiny-shakespeare text and print a preview of its first lines.

    Reads ``input.txt`` from ``DATA_CACHE_DIR/tinyshakespeare``, strips every
    line, drops the blank ones and prints the first ten that remain.
    Nothing is returned; the function only prints.

    Raises:
        FileNotFoundError: if ``input.txt`` is not present in the directory.
    """
    filename = os.path.join(DATA_CACHE_DIR, 'tinyshakespeare', 'input.txt')
    # Explicit encoding: the default used by open() depends on the platform locale.
    with open(filename, "r", encoding="utf-8") as f:
        data = [stripped for stripped in (line.strip() for line in f) if stripped]
    print(f"Example story : \n {data[:10]}")

In [6]:
prepare()

Example story : 
 ['First Citizen:', 'Before we proceed any further, hear me speak.', 'All:', 'Speak, speak.', 'First Citizen:', 'You are all resolved rather to die than to famish?', 'All:', 'Resolved. resolved.', 'First Citizen:', 'First, you know Caius Marcius is chief enemy to the people.']


## Tokenization

In [7]:
from typing import List

from sentencepiece import SentencePieceProcessor

In [8]:
# Default path of the SentencePiece model file; Tokenizer() falls back to it
# when no explicit path is passed.
TOKENIZER_MODEL = "tokenizer.model"

In [9]:
class Tokenizer:
    """Thin wrapper around a SentencePiece model with optional BOS/EOS framing."""

    def __init__(self, tokenizer_model=None):
        """Load the SentencePiece model and cache its vocabulary metadata.

        Args:
            tokenizer_model: path to the model file; any falsy value selects
                the module-level ``TOKENIZER_MODEL`` default.
        """
        model_path = tokenizer_model or TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        processor = SentencePieceProcessor(model_file=model_path)
        self.sp_model = processor
        self.model_path = model_path

        # Vocabulary size and special-token ids, as reported by the model itself.
        self.n_words: int = processor.vocab_size()
        self.bos_id: int = processor.bos_id()
        self.eos_id: int = processor.eos_id()
        self.pad_id: int = processor.pad_id()
        assert processor.vocab_size() == processor.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Tokenize ``s``; prepend the BOS id if ``bos``, append the EOS id if ``eos``."""
        assert type(s) is str
        ids = self.sp_model.encode(s)
        ids = [self.bos_id] + ids if bos else ids
        ids = ids + [self.eos_id] if eos else ids
        return ids

    def decode(self, t: List[int]) -> str:
        """Turn a list of token ids back into text."""
        return self.sp_model.decode(t)


In [10]:
# Smoke test: encode with BOS/EOS markers (passed positionally as 1, 1), then decode back.
# NOTE(review): `t` stays in the global namespace and is reused by the torch generation cell.
t = Tokenizer()
print(f"Encode : {t.encode('hello world', 1, 1)}")
print(f"Decode : {t.decode(t.encode('hello world',1, 1))}")

Encode : [1, 22172, 3186, 2]
Decode : hello world


### Tokenize the dataset using llama2 tokenizer

In [11]:
import numpy as np


In [12]:
def pretokenize():
    """Tokenize tiny-shakespeare line by line and save the ids as ``data.bin``.

    Every non-empty line of ``input.txt`` is encoded with BOS and EOS markers.
    All ids are concatenated and written as raw ``uint16`` values to
    ``DATA_CACHE_DIR/tinyshakespeare/data.bin``, and a one-line summary with
    the average sequence length is printed.

    Raises:
        ValueError: if the tokenizer vocabulary does not fit in ``uint16``.
        FileNotFoundError: if ``input.txt`` is not present.
    """
    data_dir = os.path.join(DATA_CACHE_DIR, "tinyshakespeare")
    enc = Tokenizer()

    # Ids are stored as uint16, so every id in the vocabulary must be representable.
    uint16_capacity = np.iinfo(np.uint16).max + 1
    if enc.n_words > uint16_capacity:
        raise ValueError(
            f"vocab size {enc.n_words} does not fit in uint16 (max {uint16_capacity} ids)"
        )

    filename = os.path.join(data_dir, "input.txt")
    all_tokens = []
    # Explicit encoding: the default used by open() depends on the platform locale.
    with open(filename, "r", encoding="utf-8") as f:
        for text in f:
            text = text.strip()
            if text:
                all_tokens.extend(enc.encode(text, bos=True, eos=True))
    all_tokens = np.array(all_tokens, dtype=np.uint16)

    tokenized_filename = os.path.join(data_dir, "data.bin")

    with open(tokenized_filename, "wb") as f:
        f.write(all_tokens.tobytes())

    # One BOS is prepended per encoded line, so counting BOS ids counts sequences.
    # Use the tokenizer's own BOS id instead of assuming it is 1.
    num_sequences = int((all_tokens == enc.bos_id).sum())
    # Guard the empty-corpus case, where there are no sequences to average over.
    avg_seq_len = all_tokens.size / num_sequences if num_sequences else 0.0
    print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.4f}")


In [13]:
pretokenize()

Saved data/tinyshakespeare/data.bin, average seqlen: 11.9347


## Batch Iteration

In [14]:
import random
import glob

import torch
import torch.distributed as dist

from functools import partial

In [15]:
class PretokDataset(torch.utils.data.IterableDataset):
    """Endless stream of (input, target) token windows from a pretokenized ``.bin`` file.

    The first ``*.bin`` file (in sorted order) under
    ``DATA_CACHE_DIR/tinyshakespeare`` is memory-mapped as ``uint16`` and cut
    into windows of ``max_seq_len`` tokens that are visited in shuffled order,
    forever.

    NOTE(review): ``split`` and ``vocab_size`` are stored but never read, so
    every split iterates over the same file.
    """

    def __init__(self, split, max_seq_len, vocab_size):
        super().__init__()
        self.split = split
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size

    def __iter__(self):
        """Yield ``(x, y)`` int64 tensors of length ``max_seq_len``; ``y`` is ``x`` shifted by one token."""
        # Derive the seed from the DataLoader worker id and the distributed rank
        # so that each worker/rank gets its own shuffle order.
        worker_info = torch.utils.data.get_worker_info()
        worker_id = worker_info.id if worker_info else 0
        rank = dist.get_rank() if dist.is_initialized() else 0
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")

        bin_dir = os.path.join(DATA_CACHE_DIR, "tinyshakespeare")
        bin_files = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
        # Check the list *before* indexing it, so a missing file produces this
        # message instead of a bare IndexError.
        assert len(bin_files) > 0, f"No bin files found in {bin_dir}"
        filename = bin_files[0]
        while True:

            m = np.memmap(filename, dtype=np.uint16, mode="r")
            num_batches = len(m) // self.max_seq_len
            # Drop the last window: each chunk reads one extra token for the
            # shifted target, which must not run past the end of the file.
            num_batches -= 1
            assert num_batches > 0, "this file is way too small? investigatte."
            ixs = list(range(num_batches))
            rng.shuffle(ixs)
            for ix in ixs:
                start = ix * self.max_seq_len
                end = start + self.max_seq_len + 1
                chunk = torch.from_numpy((m[start: end]).astype(np.int64))
                x = chunk[:-1]
                y = chunk[1:]
                yield x,y

In [16]:
class Task:
    """Namespace for the batch-iteration helper used by the sanity check and training."""

    @staticmethod
    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
        """Yield ``(x, y)`` batches from a ``PretokDataset``, moved to ``device``.

        Args:
            batch_size: number of sequences per batch.
            device: target device for both tensors.
            num_workers: forwarded to the ``DataLoader``.
            **dataset_kwargs: forwarded unchanged to ``PretokDataset``.
        """
        loader = torch.utils.data.DataLoader(
            PretokDataset(**dataset_kwargs),
            batch_size=batch_size,
            pin_memory=True,
            num_workers=num_workers,
        )
        for inputs, targets in loader:
            # non_blocking=True requests an asynchronous host-to-device copy.
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            yield inputs, targets

In [17]:
# Small settings for a quick CPU sanity check of the batch iterator.
# These names are overwritten by the training configuration cell further down.
batch_size = 8
max_seq_len = 128
vocab_size = 32000
# NOTE(review): capitalised "Train" differs from the "train"/"val" names used later;
# PretokDataset stores `split` but never reads it, so the value has no effect here.
split="Train"
device = "cpu"

In [18]:
# Pre-bind every argument so that iter_batches() can be called with no arguments.
iter_batches = partial(
    Task.iter_batches,
    batch_size= batch_size,
    split=split,
    max_seq_len =max_seq_len,
    vocab_size = vocab_size,
    device = device,
    num_workers=0,)

In [19]:
batch_iter = iter_batches()

In [20]:
# Pull one batch and show the top-left 4x4 corner of inputs and targets;
# each target row is the matching input row shifted by one token.
X, y = next(batch_iter)
print(X[:4, :4])
print(y[:4, :4])
# Last expression: displayed as the cell output.
X.shape, y.shape

Created a PretokDataset with rng seed 42
tensor([[    1,  1152,  3362,  6669],
        [29901,   437,   366,  1023],
        [ 1244, 12021, 29915, 29881],
        [ 3573, 29915, 29879,  3158]])
tensor([[ 1152,  3362,  6669,   322],
        [  437,   366,  1023,  1073],
        [12021, 29915, 29881,   592],
        [29915, 29879,  3158,  6860]])


(torch.Size([8, 128]), torch.Size([8, 128]))

## Training

In [21]:
!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link



In [22]:
import math
import time

from datetime import datetime
from contextlib import nullcontext

from model import Transformer, ModelArgs
from export import model_export

In [23]:
# -----------------------------------------------------------------------------
# Training configuration. Every value below is a module-level global that the
# later cells read directly.
# I/O
out_dir = "out"
eval_interval = 2000  # evaluate (and maybe checkpoint) every this many iterations
log_interval = 1  # print a training log line every this many iterations
eval_iters = 10  # number of batches averaged per split in estimate_loss
eval_only = False  # if True, script exits right after the first eval
always_save_checkpoint = False  # if True, always save a checkpoint after each eval
init_from = "scratch"  # 'scratch' or 'resume'
# wandb logging
wandb_log = False  # disabled by default
wandb_project = "llamac"
wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# data
batch_size = 64  # if gradient_accumulation_steps > 1, this is the micro-batch size
max_seq_len = 256
# NOTE(review): vocab_source is not referenced by any cell visible in this notebook.
vocab_source = "llama2" # llama2|custom; use Llama 2 vocab from Meta, or custom trained
vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
# model
dim = 288
n_layers = 6
n_heads = 6
n_kv_heads = 6
multiple_of = 32
dropout = 0.0
# adamw optimizer
gradient_accumulation_steps = 4  # used to simulate larger batch sizes
learning_rate = 5e-4  # max learning rate
max_iters = 10000  # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True  # whether to decay the learning rate
warmup_iters = 1000  # how many steps to warm up for
# system
device = "cuda"  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
dtype = "float16"  # float32|bfloat16|float16
# NOTE(review): this name shadows Python's builtin compile() for the rest of the notebook.
compile = True  # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------

In [24]:
# fixing some hyperparams to sensible defaults
lr_decay_iters = max_iters  # should be ~= max_iters per Chinchilla
min_lr = 0.0  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

In [25]:
# Single-process run: this process is the master, uses no seed offset,
# and the world size is 1.
master_process = True
seed_offset = 0
ddp_world_size = 1

In [26]:
# Tokens consumed by one optimizer step: micro-batches per step x processes
# x sequences per micro-batch x tokens per sequence.
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len

if master_process:
    print(f"tokens per iteration will be: {tokens_per_iter:,}")
    print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch_size * {max_seq_len} max seq len")

# Make sure the checkpoint/export directory exists before training starts.
if master_process:
    os.makedirs(out_dir, exist_ok=True)

tokens per iteration will be: 65,536
breaks down as: 4 grad accum steps * 1 processes * 64 batch_size * 256 max seq len


In [27]:
# Seed torch's RNG so runs are repeatable.
torch.manual_seed(1337 + seed_offset)
# Allow TF32 in CUDA matmuls and in cuDNN.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Reduce e.g. "cuda:0" to the device family; anything without "cuda" is treated as CPU.
device_type = "cuda" if "cuda" in device else "cpu"

# Map the configured dtype name to the torch dtype object (KeyError on an unknown name).
ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]

# Context manager wrapped around every forward pass: a no-op on CPU,
# autocast with the configured dtype otherwise.
ctx = (
    nullcontext()
    if device_type == "cpu"
    else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
)

In [28]:
# task-specific setup
# Rebinds iter_batches with the training settings. `split` is deliberately left
# unbound here: callers pass it per call (iter_batches(split="train")).
iter_batches = partial(
    Task.iter_batches,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
    vocab_size=vocab_size,
    device=device,
    num_workers=0,
)

In [29]:
# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
iter_num = 0
best_val_loss = 1e9

# Keyword arguments for ModelArgs, collected in a dict so they can also be
# stored inside checkpoints.
model_args = dict(
    dim=dim,
    n_layers=n_layers,
    n_heads=n_heads,
    n_kv_heads=n_kv_heads,
    vocab_size=vocab_size,
    multiple_of=multiple_of,
    max_seq_len=max_seq_len,
    dropout=dropout,
)

if init_from == "scratch":

    print("Initializing a new model from scratch")
    gptconf = ModelArgs(**model_args)
    model = Transformer(gptconf)

elif init_from == "resume":
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, "ckpt.pt")
    # NOTE(review): torch.load unpickles the file unless weights_only is set;
    # only load checkpoints from a trusted source.
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint["model_args"]
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from command line
    for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = ModelArgs(**model_args)
    model = Transformer(gptconf)
    state_dict = checkpoint["model"]
    # Strip the "_orig_mod." key prefix. The training loop saves the state dict of
    # the torch.compile wrapper, whose inner module is named `_orig_mod`, so every
    # key carries that prefix and must be renamed to load into a plain Transformer.
    unwanted_prefix = "_orig_mod."
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint["iter_num"]
    best_val_loss = checkpoint["best_val_loss"]
# NOTE(review): any other init_from value leaves `model` undefined, so this line raises NameError.
model.to(device)



Initializing a new model from scratch


Transformer(
  (tok_embeddings): Embedding(32000, 288)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-5): 6 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=288, out_features=288, bias=False)
        (wk): Linear(in_features=288, out_features=288, bias=False)
        (wv): Linear(in_features=288, out_features=288, bias=False)
        (wo): Linear(in_features=288, out_features=288, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=288, out_features=768, bias=False)
        (w2): Linear(in_features=768, out_features=288, bias=False)
        (w3): Linear(in_features=288, out_features=768, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=288, ou

In [30]:
# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))

In [31]:
# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
# `checkpoint` only exists on the resume path; the short-circuiting `and`
# keeps the scratch path from touching it.
if init_from == "resume" and "optimizer" in checkpoint:
    optimizer.load_state_dict(checkpoint["optimizer"])
checkpoint = None  # free up memory

num decayed parameter tensors: 43, with 15,187,968 parameters
num non-decayed parameter tensors: 13, with 3,744 parameters
using fused AdamW: True


In [32]:
# compile the model
# `compile` here is the boolean config flag, not Python's builtin of the same name.
if compile:
    print("compiling the model... (takes a ~minute)")
    # Keep a handle on the uncompiled module; `model` becomes the compiled wrapper.
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0


compiling the model... (takes a ~minute)


In [33]:
# helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``eval_iters`` batches for "train" and "val".

    The model is put in eval mode for the measurement and back in train mode
    before returning. A fresh batch iterator is created for each split.

    Returns:
        dict mapping split name to a scalar CPU tensor holding the mean loss.
    """
    model.eval()
    mean_losses = {}
    for split_name in ("train", "val"):
        batches = iter_batches(split=split_name)
        per_batch = torch.zeros(eval_iters)  # keep on CPU
        for i in range(eval_iters):
            inputs, targets = next(batches)
            with ctx:
                # The forward pass stores its loss on the model; the logits are not needed.
                model(inputs, targets)
                batch_loss = raw_model.last_loss
            per_batch[i] = batch_loss.item()
        mean_losses[split_name] = per_batch.mean()
    model.train()
    return mean_losses

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    """Return the learning rate for iteration ``it``.

    The schedule is read from the module-level settings ``warmup_iters``,
    ``lr_decay_iters``, ``learning_rate`` and ``min_lr``:

    1. linear warmup from 0 to ``learning_rate`` over ``warmup_iters`` steps,
    2. cosine decay from ``learning_rate`` to ``min_lr`` until ``lr_decay_iters``,
    3. ``min_lr`` from ``lr_decay_iters`` onwards.
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) once the decay window is over, return min learning rate.
    #    `>=` (rather than `>`) gives the same value at it == lr_decay_iters, where
    #    the cosine coefficient is exactly 0, and avoids a 0/0 division below when
    #    lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)



In [34]:
# training loop
#
# Each pass of the `while` loop is one optimizer step: set the learning rate,
# periodically evaluate and checkpoint, accumulate gradients over
# `gradient_accumulation_steps` micro-batches, clip, step, then log.

train_batch_iter = iter_batches(split="train")
X, Y = next(train_batch_iter) # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
# NOTE(review): when `compile` is True, `model` was already rebound to the
# torch.compile wrapper, so raw_model is that wrapper, not the bare Transformer.
raw_model = model

running_mfu = -1.0  # -1.0 means "no estimate yet"

while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    # Periodic evaluation, optional wandb logging, and checkpointing.
    # NOTE(review): PretokDataset ignores `split`, so the "val" loss is measured
    # on the same file as the "train" loss and is not a held-out estimate.
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            # NOTE(review): `wandb` is not imported in any visible cell; with
            # wandb_log=True the resulting NameError is caught and printed below.
            try:
                wandb.log(
                    {
                        "iter": iter_num,
                        "tokens": iter_num * tokens_per_iter,
                        "loss/train": losses["train"],
                        "loss/val": losses["val"],
                        "lr": lr,
                        "mfu": running_mfu * 100,  # convert to percentage
                    }, step = iter_num
                )
            except Exception as e:
                print(f"logging to wandb failed: {e}")
        if losses["val"] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses["val"]
            # Nothing is written at iteration 0, before any training has happened.
            if iter_num > 0:
                checkpoint = {
                    "model": raw_model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "model_args": model_args,
                    "iter_num": iter_num,
                    "best_val_loss": best_val_loss,

                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
                model_export(raw_model, os.path.join(out_dir, "model.bin"), version=0)
    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):

        with ctx:
            logits = model(X, Y)
            loss = raw_model.last_loss
            # Scale so that the accumulated gradients average over the micro-batches.
            loss = loss / gradient_accumulation_steps
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = next(train_batch_iter)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        # Unscale first so the clipping threshold applies to the true gradient values.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
        # (`loss` is the value from the last micro-step only, not an average over all of them)
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5:  # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            # Exponential moving average; the first estimate is taken as-is.
            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
        print(
            f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
        )
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5013 | loss 0.0119 | lr 2.922944e-04 | 769.86ms | mfu 2.63%
5014 | loss 0.0127 | lr 2.922084e-04 | 770.45ms | mfu 2.63%
5015 | loss 0.0116 | lr 2.921223e-04 | 767.09ms | mfu 2.63%
5016 | loss 0.0095 | lr 2.920363e-04 | 771.75ms | mfu 2.63%
5017 | loss 0.0090 | lr 2.919503e-04 | 770.52ms | mfu 2.63%
5018 | loss 0.0119 | lr 2.918643e-04 | 772.00ms | mfu 2.63%
5019 | loss 0.0086 | lr 2.917782e-04 | 770.92ms | mfu 2.63%
5020 | loss 0.0116 | lr 2.916922e-04 | 768.38ms | mfu 2.63%
5021 | loss 0.0124 | lr 2.916061e-04 | 769.55ms | mfu 2.63%
5022 | loss 0.0096 | lr 2.915201e-04 | 770.37ms | mfu 2.63%
5023 | loss 0.0089 | lr 2.914340e-04 | 770.87ms | mfu 2.63%
5024 | loss 0.0102 | lr 2.913480e-04 | 771.88ms | mfu 2.63%
5025 | loss 0.0101 | lr 2.912619e-04 | 774.41ms | mfu 2.63%
5026 | loss 0.0098 | lr 2.911758e-04 | 775.71ms | mfu 2.63%
5027 | loss 0.0117 | lr 2.910898e-04 | 771.91ms | mfu 2.63%
5028 | loss 0.0087 | lr 2.910037e-0

In [41]:
!cp -v out/ckpt.pt /content/drive/MyDrive/out/

'out/ckpt.pt' -> '/content/drive/MyDrive/out/ckpt.pt'


## Generation

### Load model to torch

In [23]:
# Rebuild the model from the saved checkpoint for generation.
# NOTE(review): this cell relies on `out_dir`, `device`, `model_args` and `compile`
# from the training configuration cells, which must have been run first.
ckpt_path = os.path.join(out_dir, "ckpt.pt")
# NOTE(review): torch.load unpickles the file unless weights_only is set;
# only load checkpoints from a trusted source.
checkpoint = torch.load(ckpt_path, map_location=device)
checkpoint_model_args = checkpoint["model_args"]
# Take the architecture settings from the checkpoint so the rebuilt model matches
# the saved weights; other attributes (e.g. dropout) keep their configured values.
for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
    model_args[k] = checkpoint_model_args[k]
# create the model
gptconf = ModelArgs(**model_args)
model = Transformer(gptconf)
state_dict = checkpoint["model"]

# Strip the "_orig_mod." key prefix. The training loop saves the state dict of
# the torch.compile wrapper, whose inner module is named `_orig_mod`, so every
# key carries that prefix and must be renamed to load into a plain Transformer.
unwanted_prefix = "_orig_mod."
for k, v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
model.load_state_dict(state_dict)


# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0

# Switch to inference mode and move the weights to the target device.
model.eval()
model.to(device)

compiling the model... (takes a ~minute)


OptimizedModule(
  (_orig_mod): Transformer(
    (tok_embeddings): Embedding(32000, 288)
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): Attention(
          (wq): Linear(in_features=288, out_features=288, bias=False)
          (wk): Linear(in_features=288, out_features=288, bias=False)
          (wv): Linear(in_features=288, out_features=288, bias=False)
          (wo): Linear(in_features=288, out_features=288, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (feed_forward): FeedForward(
          (w1): Linear(in_features=288, out_features=768, bias=False)
          (w2): Linear(in_features=768, out_features=288, bias=False)
          (w3): Linear(in_features=288, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (attention_norm): RMSNorm()
        (ffn_norm): RMSNor

### Generate text in llama2.c bin format

#### Convert to llama2.c format

In [24]:
from export import model_export

In [25]:
model_export(model, os.path.join(out_dir, "model.bin"), version=0)

wrote out/model.bin


In [26]:
for i in range(5):
    !./run out/model.bin
    time.sleep(1)

O'er the vast world to seek a single man,
</s>

achieved tok/s: 75.581395
First Murderer:
</s>

achieved tok/s: 76.923077
God give him joy!
</s>

achieved tok/s: 76.923077
Rest thy unrest on England's lawful earth,
</s>

achieved tok/s: 76.923077
Partake to every one. I, an old turtle,
</s>

achieved tok/s: 76.530612


### Generate text using torch 

In [27]:
# Sampling settings for generation with the torch model.
start = ""  # prompt text; empty means the prompt is just the BOS token
num_samples = 1 # number of samples to draw
max_new_tokens = 100 # number of tokens generated in each sample
temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 300  # passed to model.generate; presumably limits sampling to the 300 most likely tokens — TODO confirm in model.py


# `t` is the Tokenizer instance created in the tokenization section.
start_ids = t.encode(start, bos=True, eos=False)
# [None, ...] adds a leading batch dimension of size 1.
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
with torch.no_grad():
    with ctx:
        # Every sample starts from the same prompt tensor `x`.
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(t.decode(y[0].tolist()))
            print('---------------')

Be heap'd like mine and that thy skill be more To blazon it, then sweeten with thy breath This neighbour air, and let rich music's tongue Unfold the imagined happiness that both Receive in either by this dear encounter. JULIET: Conceit, more rich in matter than in words, Brags of his substance, not of ornament: They are but
---------------


### Generate text in GGUF format using llama.cpp

#### Convert from llama2c format to GGML format

In [28]:
!./convert-llama2c-to-ggml --copy-vocab-from-model ggml-vocab-llama.gguf --llama2c-model out/model.bin --llama2c-output-model out/llama2c-ggml.bin

[malloc_weights:AK] Allocating [32000] x [288] = [9216000] float space for w->token_embedding_table
[malloc_weights:AK] Allocating [6] x [288] = [1728] float space for w->rms_att_weight
[malloc_weights:AK] Allocating [6] x [288] = [1728] float space for w->rms_ffn_weight
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wq
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wk
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wv
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wo
[malloc_weights:AK] Allocating [6] x [768] x [288] = [1327104] float space for w->w1
[malloc_weights:AK] Allocating [6] x [288] x [768] = [1327104] float space for w->w2
[malloc_weights:AK] Allocating [6] x [768] x [288] = [1327104] float space for w->w3
[malloc_weights:AK] Allocating [288] float space for w->rms_final_weight
print_params: n_vocab: 32000
print_params: n_ctx:   128
print_pa

In [31]:
!./main -m out/llama2c-ggml.bin -n 128 --top-k 300

Log start
main: build = 2295 (87c91c07)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1709418507
llama_model_loader: loaded meta data with 18 key-value pairs and 57 tensors from out/llama2c-ggml.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv   1:                      tokenizer.ggml.scores arr[f32,32000]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv   2:                  tokenizer.ggml.token_type arr[i32,32000]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv   3:                       tokenizer.ggml.model str              = llama
llama_model_loader: - kv   4:                       general.architecture str              = llama
llama_model_loader: - kv   5:     

#### Convert from torch checkpoint to GGUF format

In [32]:
!python convert.py out/ --ctx 4096

Loading model file out/ckpt.pt
params = Params(n_vocab=32000, n_embd=288, n_layer=6, n_ctx=4096, n_ff=768, n_head=2, n_head_kv=2, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=None, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=None, path_model=PosixPath('out'))
Found vocab files: {'tokenizer.model': PosixPath('tokenizer.model'), 'vocab.json': None, 'tokenizer.json': None}
Loading vocab file 'tokenizer.model', type 'spm'
Vocab info: <SentencePieceVocab with 32000 base tokens and 0 added tokens>
Special vocab info: <SpecialVocab with 0 merges, special tokens unset, add special tokens unset>
tok_embeddings.weight                            -> token_embd.weight                        | F32    | [32000, 288]
layers.0.attention.wq.weight                     -> blk.0.attn_q.weight                      | F32    | [288, 288]
layers.0.attention.wk.weight                     -> blk.0.attn_k.weight                      | F32    | 

In [38]:
!./main -m out/ggml-model-f32.gguf -n 128 --top-k 300

Log start
main: build = 2295 (87c91c07)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1709418526
llama_model_loader: loaded meta data with 15 key-value pairs and 57 tensors from out/ggml-model-f32.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 288
llama_model_loader: - kv   4:                          llama.block_count u32              = 6
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 768
llama_model_loader: - kv   6:               