In [None]:
import torch

from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

In [None]:
# Thai instruction data: loaded without streaming, so the whole train split
# is materialized as a regular dataset object.
th_dataset = load_dataset(
    "PHNG/chatmed-thaigpt1k-th",
    split='train',
)

# English chat data: streaming=True means rows are fetched lazily while
# iterating instead of being downloaded up front.
en_dataset = load_dataset(
    "ytz20/LMSYS-Chat-GPT-5-Chat-Response",
    split='train',
    streaming=True,
)

In [None]:
# Byte-level BPE tokenizer: an untrained BPE model whose input is first mapped
# to byte-level symbols, and whose decoder reverses that mapping.
tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

# Chat-template control tokens; passed to the trainer so they are reserved
# in the vocabulary.
special_tokens = [
    "<|im_start|>",  # Start of a turn
    "<|im_end|>",    # End of a turn
    "<|pad|>"        # Padding for batching
]

trainer = trainers.BpeTrainer(
    vocab_size=100000,
    special_tokens=special_tokens,
    # Seed the vocabulary with every byte-level symbol. Without this, a byte
    # that never occurs in the training corpus has no token, and because the
    # BPE model has no unk_token it would be silently dropped when encoding
    # unseen text (e.g. other scripts or emoji).
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

def train_tokenizer(text: str) -> None:
    """Train the module-level ``tokenizer`` on ``text`` using the module-level ``trainer``.

    The whole corpus is handed to the trainer as a single-item iterator.
    """
    corpus = [text]
    tokenizer.train_from_iterator(corpus, trainer)


def encode(s: str) -> list[int]:
    """Encode ``s`` with the module-level ``tokenizer`` and return its token ids."""
    encoding = tokenizer.encode(s)
    return encoding.ids


def decode(l: torch.Tensor) -> str:
    """Decode token ids back into text with the module-level ``tokenizer``.

    Args:
        l: Token ids, either as a ``torch.Tensor`` (0-dim or 1-D) or as any
            plain iterable of ints.

    Returns:
        The decoded string.
    """
    # tokenizer.decode expects a plain sequence of Python ints, so unwrap
    # tensors first instead of handing the tensor object to the binding.
    ids = l.tolist() if isinstance(l, torch.Tensor) else list(l)
    # A 0-dim tensor's tolist() yields a bare int; wrap it as a one-id sequence.
    if isinstance(ids, int):
        ids = [ids]
    return tokenizer.decode(ids)


def text2tensor(text: str) -> torch.Tensor:
    """Encode ``text`` via ``encode`` and return the ids as a ``torch.long`` tensor."""
    token_ids = encode(text)
    return torch.tensor(token_ids, dtype=torch.long)

In [None]:
# Render each streamed English row as one chat-formatted record.
# NOTE(review): assumes every row's 'content' holds at least two messages and
# that both belong in the user turn — confirm against the dataset schema.
en_records = []

for data in en_dataset:
    en_records.append(
        f"<|im_start|>user\n"
        f"{data['content'][0]['content']}\n"
        f"{data['content'][1]['content']}\n"
        f"<|im_end|>\n"
        f"<|im_start|>assistant\n"
        f"{data['teacher_response']}\n"
        f"<|im_end|>"
    )

# Join once at the end: growing `text` with += inside the loop re-copies the
# whole accumulated corpus on every iteration. Records are concatenated with
# no separator, exactly as the incremental version produced.
text = ''.join(en_records)


In [None]:
def format_instruction_style(row):
    """Render one instruction-style row as a two-turn chat transcript.

    Args:
        row: Mapping read via ``.get`` for the optional keys ``'instruction'``,
            ``'input'`` and ``'output'``. Missing keys and ``None`` values are
            both treated as empty strings.

    Returns:
        A string of the form
        ``<|im_start|>user\\n{user}<|im_end|>\\n<|im_start|>assistant\\n{response}<|im_end|>``
        where ``{user}`` is the instruction, followed by a blank line and the
        input when the input is non-blank; both sides are stripped.
    """
    # `row.get(key, '')` only covers a missing key; a key that is present with
    # a None value would still reach `.strip()` and raise AttributeError.
    instruction = row.get('instruction') or ''
    context_input = row.get('input') or ''
    response = row.get('output') or ''

    # Combine Instruction + Input for the "User" side
    if context_input.strip():
        user_text = f"{instruction}\n\n{context_input}"
    else:
        user_text = instruction

    return (
        f"<|im_start|>user\n"
        f"{user_text.strip()}"
        f"<|im_end|>\n"
        f"<|im_start|>assistant\n"
        f"{response.strip()}"
        f"<|im_end|>"
    )

# Format every Thai row, then append the newline-joined records to the corpus.
# Note: this extends `text` in place, so re-running this cell without first
# re-running the cell that initializes `text` appends the Thai data again.
training_data = list(map(format_instruction_style, th_dataset))
text = text + '\n'.join(training_data)

In [None]:
# Train the BPE tokenizer on the accumulated `text` corpus (the English
# records followed by the Thai records).
train_tokenizer(text)

In [None]:
# Write the tokenizer to tokenizer.json; the path is relative to the current
# working directory.
tokenizer.save('./tokenizer.json')