## Download dataset

In [1]:
import os
import requests
import glob
import json

from tqdm import tqdm

In [2]:
DATA_CACHE_DIR = "data"
data_dir = os.path.join(DATA_CACHE_DIR, 'prachathai67k')

In [3]:
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
# os.makedirs(data_dir, exist_ok=True)

In [4]:
def download_file(url: str, fname: str, chunk_size=1024):
    
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get("content-length", 0))
    with open(fname, "wb") as file, tqdm(
            desc=fname,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=chunk_size):
            size = file.write(data)
            bar.update(size)
            
def download():
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    
    data_url = "https://archive.org/download/prachathai67k/data.zip"
    data_filename = os.path.join(DATA_CACHE_DIR, "data.zip")
    if not os.path.exists(data_filename):
        print(f"Downloading {data_url} to {data_filename}...")
        download_file(data_url, data_filename)
    else:
        print(f"{data_filename} already exists, skipping download...")
    
    data_dir = os.path.join(DATA_CACHE_DIR, "prachathai67k")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir, exist_ok=True)
        print(f"Unpacking {data_filename}...")
        os.system(f"unzip {data_filename} -d {data_dir}")
        os.system(f"mv {data_dir}/data/* {data_dir}")
        os.system(f"rm -rf {data_dir}/data/")
    else:
        print(f"{data_dir} already exists, skipping unpacking...")
    
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.jsonl")))
    data = []
    with open(shard_filenames[0], "r") as f:
        for line in f:
            data.append(json.loads(line))
            
    print("Download done.")
    print(f"Number of shards: {len(shard_filenames)}")
    print(f"Example story:\n{data[0]}")

In [72]:
def prepare():
    '''
    combine train.jsonl and valid.jsonl and convert all three jsonl to 
    
    <title> <bodytext> <topic> ...
    '''
    data_dir = os.path.join(DATA_CACHE_DIR, "prachathai67k")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.jsonl")))
    
    txt_filename = os.path.join(data_dir, 'train.txt')
    
    
    train = []
    test = []
    
    if not os.path.exists(txt_filename):

        for shard in shard_filenames:
            if 'train' in shard or 'valid' in shard:
                with open(shard, "r") as f:
                    for line in f:
                        data = json.loads(line)
                        text = [v for k,v in data.items() if k in ['title', 'body_text']]
                        text = [i.replace('\n', ' ') for i in text]
                        text.extend([k for k, v in data.items() if v == 1])
                        text = ' '.join(text)
                        text.replace('\n', ' ')
                        train.append(text)
            else:
                with open(shard, "r") as f:
                    for line in f:
                        data = json.loads(line)
                        text = [v for k,v in data.items() if k in ['title', 'body_text']]
                        text = [i.replace('\n', ' ') for i in text]
                        text.extend([k for k, v in data.items() if v == 1])
                        text = ' '.join(text)
                        text.replace('\n', ' ')
                        test.append(text)

        train_txt_filename = os.path.join(data_dir, 'train.txt')
        test_txt_filename  = os.path.join(data_dir, 'test.txt')
        with open(train_txt_filename, 'w') as f:
            for i in train:
                f.write(i + '\n')

        with open(test_txt_filename, 'w') as f:
            for i in test:
                f.write(i + '\n')
    else:
        train_txt_filename = os.path.join(data_dir, 'train.txt')
        test_txt_filename  = os.path.join(data_dir, 'test.txt')
        print(f"{txt_filename} already exists, skipping preparing...")
    
    train = []
    test = []
    with open(train_txt_filename, 'r') as f:
        for i in f:
            train.append(i.strip())

    with open(test_txt_filename, 'r') as f:
        for i in f:
            test.append(i.strip())

    
    print(f"Number of train samples : {len(train)}")
    print(f"Number of test samples : {len(test)}")
    
    print(f"Train sample : \n\t{train[0]}\n")
    print(f"Test sample : \n\t{test[0]}\n")    


In [68]:
download()

data/data.zip already exists, skipping download...
data/prachathai67k already exists, skipping unpacking...
Download done.
Number of shards: 3
Example story:
{'url': 'https://prachatai.com/print/62490', 'date': '2015-11-17 18:14', 'title': 'แฮคเกอร์ Anonymous ลั่นทำสงครามไซเบอร์ครั้งใหญ่สุดกับกลุ่ม IS', 'body_text': '17 พ.ย. 2558 Blognone [1] รายงานว่า กลุ่มแฮคเกอร์ Anonymous ประกาศสงครามไซเบอร์กับกลุ่มหัวรุนแรงหลังจากกลุ่ม IS ออกมาประกาศว่าเป็นผู้อยู่เบื้องหลังการโจมตีกรุงปารีสในคืนวันศุกร์ที่ผ่านมา\n\n\nภาพในคลิปใน YouTube โฆษกของกลุ่มแฮคเกอร์สวมหน้ากากที่เป็นสัญลักษณ์ของกลุ่มได้ออกมาอ่านแถลงเป็นภาษาฝรั่งเศส มีใจความว่า จากการโจมตีของกลุ่ม IS ในกรุงปารีส กลุ่ม Anonymous ทั่วโลกจะตามล่ากลุ่ม IS เหมือนที่เคยทำตอนที่มีการโจมตีสำนักพิมพ์ Charlie Hebdo และครั้งนี้จะเป็นปฏิบัติการโจมตีครั้งใหญ่ที่สุดของกลุ่ม Anonymous เลย นอกจากนี้กลุ่ม Anonymous ยังแสดงความเสียใจต่อครอบครัวผู้สูญเสียในเหตุการณ์ครั้งนี้\nกลุ่ม Anonymous เคยประกาศสงครามกับกลุ่ม IS หลังจากการโจมตีสำนักพิมพ์ Charlie Hebdo ที่

In [74]:
prepare()

Number of train samples : 61100
Number of test samples : 6789
Train sample : 
	วุฒิสภาจี้หาทางออกเหมืองโปแตช ประชาไท9 ก.พ. 2549 เมื่อวันที่ 8 ก.พ. เวลาประมาณ 14.00 น. คณะกรรมาธิการสิ่งแวดล้อมวุฒิสภาได้จัดเวทีประชุมติดตามความคืบหน้าโครงการเหมืองแร่โปแตช จ. อุดรธานี เนื่องจากเห็นว่ามีการผลักดันโครงการอย่างเร่งด่วนโดยไม่คำนึงถึงการมีส่วนร่วมของประชาชนในพื้นที่ ณ ห้องประชุมกรรมาธิการหมายเลข 306 อาคารรัฐสภาตามข้อเรียกร้องของกลุ่มอนุรักษ์สิ่งแวดล้อมอุดรธานี โดยได้เชิญหน่วยงานที่เกี่ยวข้องได้แก่ กรมอุตสาหกรรมพื้นฐานและการเหมืองแร่ สำนักงานนโยบายและแผนทรัพยากรธรรมชาติสิ่งแวดล้อม กลุ่มอนุรักษ์สิ่งแวดล้อมอุดรธานี เข้ามาชี้แจงให้ข้อมูล      นายสุรพงษ์ เชียงทอง เจ้าหน้าที่กรมอุตสาหกรรมพื้นฐานและการเหมืองแร่ (กพร.) รายงานต่อที่ประชุมว่าขณะนี้บริษัทได้ยื่นขอประธานบัตรทำเหมืองแล้วอยู่ระหว่างการขึ้นรูปแผนที่พื้นที่ทำเหมืองใต้ดิน ในขณะนี้ทางบริษัทต้องทำการรังวัดปักหมดเขตที่ตั้งโรงแยกแร่ หรือเหมืองแร่บนดินและขึ้นรูปแผนที่เพื่อจะได้ติดประกาศในท้องถิ่นก่อนที่อธิบดีกพร.จะรับรอง แต่อย่างไรก็ตามขณะนี้บริษัย

## Tokenization

In [30]:
from typing import List

from sentencepiece import SentencePieceProcessor

In [31]:
TOKENIZER_MODEL = "tokenizer.model"

In [32]:
class Tokenizer:
    def __init__(self, tokenizer_model=None):
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)


In [33]:
t = Tokenizer()
print(f"Encode : {t.encode('ภาพในคลิปใน', 1, 1)}")
print(f"Decode : {t.decode(t.encode('ภาพในคลิปใน',1, 1))}")

Encode : [1, 29871, 31070, 30289, 30727, 227, 188, 134, 30348, 30759, 30496, 30507, 31010, 227, 188, 134, 30348, 2]
Decode : ภาพในคลิปใน


### Tokenize the dataset using llama2 tokenizer

In [12]:
import numpy as np


In [46]:
def pretokenize():
    data_dir = os.path.join(DATA_CACHE_DIR, "prachathai67k")
    enc = Tokenizer()
    
    filenames = sorted(glob.glob(os.path.join(data_dir, "*.txt")))
    
    
    for filename in filenames:
        txt_basename = os.path.basename(filename)
        bin_basename = txt_basename.replace('.txt', '.bin')
        tokenized_filename = os.path.join(data_dir, bin_basename)
        
        if not os.path.exists(tokenized_filename):
        
            all_tokens = []
            with open(filename, "r") as f:
                for text in f:
                    text = text.strip()
                    if text:
                        tokens = enc.encode(text, bos=True, eos=True)
                        all_tokens.extend(tokens)

            all_tokens = np.array(all_tokens, dtype=np.uint16)



            with open(tokenized_filename, "wb") as f:
                f.write(all_tokens.tobytes())

            avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
            print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.4f}")
        else:
            print(f"{tokenized_filename} exists,  pretokenization is already done for {txt_basename}")


In [48]:
pretokenize()

data/prachathai67k/test.bin exists,  pretokenization is already done for test.txt
data/prachathai67k/train.bin exists,  pretokenization is already done for train.txt


## Batch Iteration

In [15]:
import random
import glob

import torch
import torch.distributed as dist

from functools import partial

In [16]:
class PretokDataset(torch.utils.data.IterableDataset):
    def __init__(self, split, max_seq_len, vocab_size):
        super().__init__()
        self.split = split
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
    
    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        worker_id = worker_info.id if worker_info else 0
        rank = dist.get_rank() if dist.is_initialized() else 0
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")
        
        bin_dir = os.path.join(DATA_CACHE_DIR, "prachathai67k")
        filename = os.path.join(bin_dir, "train.bin") if self.split == "train" else os.path.join(bin_dir, "test.bin")
        
        assert len(filename)>0, f"No bin files found in {bin_dir}"
        
        
        while True:
                
            m = np.memmap(filename, dtype=np.uint16, mode="r")
            num_batches = len(m) // self.max_seq_len
            num_batches -= 1
            assert num_batches > 0, "this file is way too small? investigatte."
            ixs = list(range(num_batches))
            rng.shuffle(ixs)
            for ix in ixs:
                start = ix * self.max_seq_len
                end = start + self.max_seq_len + 1
                chunk = torch.from_numpy((m[start: end]).astype(np.int64))
                x = chunk[:-1]
                y = chunk[1:]
                yield x,y

In [19]:
class Task:
    
    @staticmethod
    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
        ds = PretokDataset(**dataset_kwargs)
        dl = torch.utils.data.DataLoader(
            ds, batch_size=batch_size, pin_memory=False, num_workers=num_workers
        )
        for x, y in dl:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            yield x, y

In [20]:
batch_size = 8
max_seq_len = 128
vocab_size = 32000
split="Train"
device = "cpu"

In [21]:
iter_batches = partial(
    Task.iter_batches,
    batch_size= batch_size,
    split=split,
    max_seq_len =max_seq_len,
    vocab_size = vocab_size,
    device = device,
    num_workers=0,)

In [22]:
batch_iter = iter_batches()

In [29]:
X, y = next(batch_iter)
print(X[:4, :4])
print(y[:4, :4])
X.shape, y.shape

tensor([[30297, 30501, 30425, 30289],
        [29953, 29889, 29900, 29900],
        [30401, 30618, 31422, 30351],
        [30398, 30398, 30289, 30348]])
tensor([[30501, 30425, 30289, 30297],
        [29889, 29900, 29900, 29871],
        [30618, 31422, 30351, 30348],
        [30398, 30289, 30348, 30297]])


(torch.Size([8, 128]), torch.Size([8, 128]))

## Training

In [None]:
!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

In [30]:
import math
import time

from datetime import datetime
from contextlib import nullcontext

from model import Transformer, ModelArgs

In [31]:
# -----------------------------------------------------------------------------
# I/O
out_dir = "out" 
eval_interval = 20
log_interval = 1
eval_iters = 10
eval_only = False  # if True, script exits right after the first eval
always_save_checkpoint = False  # if True, always save a checkpoint after each eval
init_from = "scratch"  # 'scratch' or 'resume'
# wandb logging
wandb_log = False  # disabled by default
wandb_project = "llamac"
wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# data
batch_size = 8  # if gradient_accumulation_steps > 1, this is the micro-batch size
max_seq_len = 256
vocab_source = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
# model
dim = 288
n_layers = 6
n_heads = 6
n_kv_heads = 6
multiple_of = 32
dropout = 0.0
# adamw optimizer
gradient_accumulation_steps = 4  # used to simulate larger batch sizes
learning_rate = 5e-4  # max learning rate
max_iters = 100000  # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
decay_lr = True  # whether to decay the learning rate
warmup_iters = 1000  # how many steps to warm up for
# system
device = "cpu"  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
dtype = "bfloat16"  # float32|bfloat16|float16
compile = True  # use PyTorch 2.0 to compile the model to be faster
# -----------------------------------------------------------------------------

In [32]:
# fixing some hyperparams to sensible defaults
lr_decay_iters = max_iters  # should be ~= max_iters per Chinchilla
min_lr = 0.0  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

In [33]:
master_process = True
seed_offset = 0
ddp_world_size = 1

In [34]:
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len

if master_process:
    print(f"tokens per iteration will be: {tokens_per_iter:,}")
    print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch_size * {max_seq_len} max seq len")
    
if master_process:
    os.makedirs(out_dir, exist_ok=True)

tokens per iteration will be: 8,192
breaks down as: 4 grad accum steps * 1 processes * 8 batch_size * 256 max seq len


In [35]:
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = "cuda" if "cuda" in device else "cpu"

ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]

ctx = (
    nullcontext()
    if device_type == "cpu"
    else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
)

In [36]:
# task-specific setup
iter_batches = partial(
    Task.iter_batches,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
    vocab_size=vocab_size,
    device=device,
    num_workers=0,
)

In [37]:
# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
iter_num = 0
best_val_loss = 1e9

model_args = dict(
    dim=dim,
    n_layers=n_layers,
    n_heads=n_heads,
    n_kv_heads=n_kv_heads,
    vocab_size=vocab_size,
    multiple_of=multiple_of,
    max_seq_len=max_seq_len,
    dropout=dropout,
)

if init_from == "scratch":
    
    print("Initializing a new model from scratch")
    gptconf = ModelArgs(**model_args)
    model = Transformer(gptconf)
    
elif init_from == "resume":
    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, "ckpt.pt")
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint["model_args"]
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as desired from command line
    for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = ModelArgs(**model_args)
    model = Transformer(gptconf)
    state_dict = checkpoint["model"]
    # fix the keys of the state dictionary :(
    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
    unwanted_prefix = "_orig_mod."
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint["iter_num"]
    best_val_loss = checkpoint["best_val_loss"]
model.to(device)
    


Initializing a new model from scratch


Transformer(
  (tok_embeddings): Embedding(32000, 288)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-5): 6 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=288, out_features=288, bias=False)
        (wk): Linear(in_features=288, out_features=288, bias=False)
        (wv): Linear(in_features=288, out_features=288, bias=False)
        (wo): Linear(in_features=288, out_features=288, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=288, out_features=768, bias=False)
        (w2): Linear(in_features=768, out_features=288, bias=False)
        (w3): Linear(in_features=288, out_features=768, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=288, ou

In [38]:
from prettytable import PrettyTable

def count_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(model)

+---------------------------------+------------+
|             Modules             | Parameters |
+---------------------------------+------------+
|      tok_embeddings.weight      |  9216000   |
|   layers.0.attention.wq.weight  |   82944    |
|   layers.0.attention.wk.weight  |   82944    |
|   layers.0.attention.wv.weight  |   82944    |
|   layers.0.attention.wo.weight  |   82944    |
| layers.0.feed_forward.w1.weight |   221184   |
| layers.0.feed_forward.w2.weight |   221184   |
| layers.0.feed_forward.w3.weight |   221184   |
|  layers.0.attention_norm.weight |    288     |
|     layers.0.ffn_norm.weight    |    288     |
|   layers.1.attention.wq.weight  |   82944    |
|   layers.1.attention.wk.weight  |   82944    |
|   layers.1.attention.wv.weight  |   82944    |
|   layers.1.attention.wo.weight  |   82944    |
| layers.1.feed_forward.w1.weight |   221184   |
| layers.1.feed_forward.w2.weight |   221184   |
| layers.1.feed_forward.w3.weight |   221184   |
|  layers.1.attentio

15191712

In [39]:
# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))

In [40]:
# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == "resume" and "optimizer" in checkpoint:
    optimizer.load_state_dict(checkpoint["optimizer"])
checkpoint = None  # free up memory

num decayed parameter tensors: 43, with 15,187,968 parameters
num non-decayed parameter tensors: 13, with 3,744 parameters
using fused AdamW: False


In [41]:
# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0


compiling the model... (takes a ~minute)


Process ForkProcess-5:
Process ForkProcess-2:
Process ForkProcess-8:
Process ForkProcess-7:
Process ForkProcess-1:
Process ForkProcess-6:
Process ForkProcess-4:
Process ForkProcess-3:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/shane/mambaforge/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/shane/mambaforge/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/shane/mambaforge/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/shane/mambaforge/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/shane/mambaforge/lib/python3.10/multiprocessing/process.py", line 

In [42]:
# helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        batch_iter = iter_batches(split=split)
        losses = torch.zeros(eval_iters)  # keep on CPU
        for k in range(eval_iters):
            X, Y = next(batch_iter)
            with ctx:
                logits = model(X, Y)
                loss = raw_model.last_loss
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) if it > lr_decay_iters, return min learning rate
    if it > lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)



In [43]:
# training loop

train_batch_iter = iter_batches(split="train")
X, Y = next(train_batch_iter) # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
raw_model = model

running_mfu = -1.0

while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr
        
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            try:
                wandb.log(
                    {
                        "iter": iter_num,
                        "tokens": iter_num * tokens_per_iter,
                        "loss/train": losses["train"],
                        "loss/val": losses["val"],
                        "lr": lr,
                        "mfu": running_mfu * 100,  # convert to percentage
                    }, step = iter_num
                )
            except Exception as e:
                print(f"logging to wandb failed: {e}")
        if losses["val"] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses["val"]
            if iter_num > 0:
                checkpoint = {
                    "model": raw_model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "model_args": model_args,
                    "iter_num": iter_num,
                    "best_val_loss": best_val_loss,
                    
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
                
    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        
        with ctx:
            logits = model(X, Y)
            loss = raw_model.last_loss
            loss = loss / gradient_accumulation_steps
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = next(train_batch_iter)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5:  # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
        print(
            f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
        )
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

Created a PretokDataset with rng seed 42
Created a PretokDataset with rng seed 42
Created a PretokDataset with rng seed 42
step 0: train loss 10.4505, val loss 10.4505
0 | loss 10.4358 | lr 0.000000e+00 | 57994.96ms | mfu -100.00%
1 | loss 10.4468 | lr 5.000000e-07 | 7343.64ms | mfu -100.00%
2 | loss 10.4694 | lr 1.000000e-06 | 7063.61ms | mfu -100.00%
3 | loss 10.4482 | lr 1.500000e-06 | 7257.32ms | mfu -100.00%


KeyboardInterrupt: 

## Generation

### Load model to torch

In [1]:
import os
import time
import random
import glob

import torch
import torch.distributed as dist

from contextlib import nullcontext
from functools import partial

from model import Transformer, ModelArgs
from tokenizer import Tokenizer

In [20]:
out_dir = 'out_thai'
device = 'cpu'
device_type = "cuda" if "cuda" in device else "cpu"

# model
dim = 288
n_layers = 6
n_heads = 6
n_kv_heads = 6
multiple_of = 32
dropout = 0.0


vocab_size = 32000
max_seq_len = 512

model_args = dict(
    dim=dim,
    n_layers=n_layers,
    n_heads=n_heads,
    n_kv_heads=n_kv_heads,
    vocab_size=vocab_size,
    multiple_of=multiple_of,
    max_seq_len=max_seq_len,
    dropout=dropout,
)


ctx = (
    nullcontext()
    if device_type == "cpu"
    else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
)

In [21]:
ckpt_path = os.path.join(out_dir, "ckpt.pt")
checkpoint = torch.load(ckpt_path, map_location=device)
checkpoint_model_args = checkpoint["model_args"]
# force these config attributes to be equal otherwise we can't even resume training
# the rest of the attributes (e.g. dropout) can stay as desired from command line
for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
    model_args[k] = checkpoint_model_args[k]
# create the model
gptconf = ModelArgs(**model_args)
model = Transformer(gptconf)
state_dict = checkpoint["model"]

# fix the keys of the state dictionary :(
# honestly no idea how checkpoints sometimes get this prefix, have to debug more
unwanted_prefix = "_orig_mod."
for k, v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
model.load_state_dict(state_dict)


# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0

model.eval()
model.to(device)

compiling the model... (takes a ~minute)


OptimizedModule(
  (_orig_mod): Transformer(
    (tok_embeddings): Embedding(32000, 288)
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): Attention(
          (wq): Linear(in_features=288, out_features=288, bias=False)
          (wk): Linear(in_features=288, out_features=288, bias=False)
          (wv): Linear(in_features=288, out_features=288, bias=False)
          (wo): Linear(in_features=288, out_features=288, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (feed_forward): FeedForward(
          (w1): Linear(in_features=288, out_features=768, bias=False)
          (w2): Linear(in_features=768, out_features=288, bias=False)
          (w3): Linear(in_features=288, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (attention_norm): RMSNorm()
        (ffn_norm): RMSNor

### Generate text in llama2.c bin format

#### convert to llama2.c format

In [22]:
from export import model_export

In [23]:
model_export(model, os.path.join(out_dir, "model.bin"), version=0)

wrote out_thai/model.bin


In [36]:
for i in range(5):
    !./run out_thai/model.bin
    time.sleep(1)

รูปล้อมกรอบอำนาจพระราชดำรัส หลัง รบ.พ.ร..จำลอง กสทช.ยันยังเปิดแลงด้วยวาจาห้ย้ายศาลทหาร ย้ายศาลรัธรรมนูญออกจากราชการ นายสนธิ ยังได้ชี้แจงนรายละเอียดของรูปล้อเลียนศาลยุติธรรม และทำห้การชุมนุมทางการเมืองไม่เป็นไปโดยสงบ ศาลพิจารณาตัดสินแล้วเ
achieved tok/s: 73.678128
นักข่าวพลเมือง: ประท้วงกลายเป็นส่วนหน่งของการปิวัติสยามประเทศไทยที่นิสิตเสนอไปแล้ว กลุ่มชนชั้นนำของเชียงหม่ได้ยดกุมความับ้อนและลิดรอนกิจกรรมเกี่ยวกับการสร้างสันติสุข มีความิดพลาดอย่างน้อย 7 ปี ก่อนการรัประหาร 4 ครั้งเริ่มจากรับา
achieved tok/s: 73.107798
เยคนงานหวั่น "คนหนุ่มที่หวังอีกวัน- รายงานสานการณ์น้ำท่วม แต่ลูกจ้างหวังเดียวกับแรงงานที่รอการพิพากษา ที่โรงงานของตั้งสิ่งแวดล้อม ที่ประสบความสำเร็จ 1 ปี ต้องเสียเงินจำนวนนี้ต้องเดือดร้อนไม่ต่ำกว่า 3,000 ล้าน ตามแนันต่างด้าว 1 ปี ส่วนคนงานอื่นท
achieved tok/s: 72.360953
พันธมิตร หลายคนหลอกลวงห้คนไทยมาประชุมกันอีกครั้งหน่ง นายสนธิ ลิ้มทองกุล รองโษกพันธมิตร แลงงกรณีม็อบตัวแทนเครือข่ายปิรูป ไม่ประชาธิปัตย์ หรือเครือข่ายเตรียมเอกสารชี้ม็อบ 3 ข้อหาตามประมวลกหมายอาญา มาตรา 112 และปร

### Generate text using torch 

In [41]:
start = ""
num_samples = 1 # number of samples to draw
max_new_tokens = 256 # number of tokens generated in each sample
temperature = 0.7 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 300

t = Tokenizer()

start_ids = t.encode(start, bos=True, eos=False)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(t.decode(y[0].tolist()))
            print('---------------')

พันธมิตรฯ สั่งฟ้าดินแดง รับ 'นท.สุเทพ' ชี้ประเด็นสำคัญ แก้ปัญหา 'ชีวิตคนเสื้อแดง' ถูกจับ 'น้ำเสียพิษ' สั่งฟ้อง นายชัชวาลย์ สอดส่อง รองประธานคณะกรรมการประชาชนเพื่อแผ่นดินเลือกตั้ง (กปปส.) กล่าวถึงกรณีที่ นายสมเกียรติ พงษ์ไพบูลย์ รักษาการ ส.ส.นครรา
---------------


### Generate text in GGUF format using llama.cpp

#### Convert from llama2c format to GGML format

In [27]:
!./convert-llama2c-to-ggml --copy-vocab-from-model ggml-vocab-llama.gguf --llama2c-model out_thai/model.bin --llama2c-output-model out_thai/llama2c-ggml.bin

[malloc_weights:AK] Allocating [32000] x [288] = [9216000] float space for w->token_embedding_table
[malloc_weights:AK] Allocating [6] x [288] = [1728] float space for w->rms_att_weight
[malloc_weights:AK] Allocating [6] x [288] = [1728] float space for w->rms_ffn_weight
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wq
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wk
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wv
[malloc_weights:AK] Allocating [6] x [288] x [288] = [497664] float space for w->wo
[malloc_weights:AK] Allocating [6] x [768] x [288] = [1327104] float space for w->w1
[malloc_weights:AK] Allocating [6] x [288] x [768] = [1327104] float space for w->w2
[malloc_weights:AK] Allocating [6] x [768] x [288] = [1327104] float space for w->w3
[malloc_weights:AK] Allocating [288] float space for w->rms_final_weight
print_params: n_vocab: 32000
print_params: n_ctx:   128
print_pa

In [44]:
!./main -m out_thai/llama2c-ggml.bin -n 256 -t 0.7

Log start
main: build = 2295 (87c91c07)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1709490150
llama_model_loader: loaded meta data with 18 key-value pairs and 57 tensors from out_thai/llama2c-ggml.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv   1:                      tokenizer.ggml.scores arr[f32,32000]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv   2:                  tokenizer.ggml.token_type arr[i32,32000]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv   3:                       tokenizer.ggml.model str              = llama
llama_model_loader: - kv   4:                       general.architecture str              = llama
llama_model_loader: - kv   5:

#### Convert from torch checkpoint to GGUF format

In [30]:
!python convert.py out_thai/ --ctx 4096

Loading model file out_thai/ckpt.pt
params = Params(n_vocab=32000, n_embd=288, n_layer=6, n_ctx=4096, n_ff=768, n_head=2, n_head_kv=2, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=None, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=None, path_model=PosixPath('out_thai'))
Found vocab files: {'tokenizer.model': PosixPath('tokenizer.model'), 'vocab.json': None, 'tokenizer.json': None}
Loading vocab file 'tokenizer.model', type 'spm'
Vocab info: <SentencePieceVocab with 32000 base tokens and 0 added tokens>
Special vocab info: <SpecialVocab with 0 merges, special tokens unset, add special tokens unset>
tok_embeddings.weight                            -> token_embd.weight                        | F32    | [32000, 288]
layers.0.attention.wq.weight                     -> blk.0.attn_q.weight                      | F32    | [288, 288]
layers.0.attention.wk.weight                     -> blk.0.attn_k.weight                      |

In [45]:
!./main -m out_thai/ggml-model-f32.gguf -n 256 --temp 0.7

Log start
main: build = 2295 (87c91c07)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1709490174
llama_model_loader: loaded meta data with 15 key-value pairs and 57 tensors from out_thai/ggml-model-f32.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 288
llama_model_loader: - kv   4:                          llama.block_count u32              = 6
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 768
llama_model_loader: - kv   6:          