## Creating datasets

In [None]:
from datasets import load_dataset
import random

# Short "who are you" dialogues that get interleaved with the knowledge text.
identity_samples = [
    "User: Who are you?\nAssistant: I am an AI Assistant.",
    "User: What is your name?\nAssistant: I am an AI Assistant designed to help with science and technology.",
    "User: Are you a human?\nAssistant: No, I am an AI Assistant.",
    "User: Hi\nAssistant: Hello! I am an AI Assistant. How can I help you?",
    "User: What are you?\nAssistant: I am an AI Assistant.",
    "User: Introduce yourself.\nAssistant: I am an AI Assistant trained to explain complex topics."
]

# The corpus is opened in streaming mode and consumed through a plain iterator.
ds_knowledge = load_dataset(
    "HuggingFaceTB/cosmopedia",
    "web_samples_v2",
    split="train",
    streaming=True
)

knowledge_iter = iter(ds_knowledge)

# Unique marker handed back by next() once the stream has no more records.
_stream_exhausted = object()

with open("phase1.txt", "w", encoding="utf-8") as f:
    for i in range(1500000):
        # Every 100th record (but never the very first) is an identity sample;
        # all other records come from the knowledge stream.
        if i > 0 and i % 100 == 0:
            text = random.choice(identity_samples)
        else:
            record = next(knowledge_iter, _stream_exhausted)
            if record is _stream_exhausted:
                # Stream ran dry before 1.5M records: stop writing.
                break
            text = record['text']

        # One record per write, terminated by the end-of-text marker.
        f.write(text + " <|endoftext|>\n")

In [None]:
from datasets import load_dataset, concatenate_datasets, Dataset
from tqdm import tqdm

# Four identity conversations, repeated 100 times each (400 rows in total).
identity_data = [
    {"messages": [{"role": "user", "content": "Who are you?"}, {"role": "assistant", "content": "I am an AI Assistant."}]},
    {"messages": [{"role": "user", "content": "What is your name?"}, {"role": "assistant", "content": "I am an AI Assistant designed to help with science."}]},
    {"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello! I am an AI Assistant."}]},
    {"messages": [{"role": "user", "content": "Are you human?"}, {"role": "assistant", "content": "No, I am an artificial intelligence."}]},
] * 100

ds_identity = Dataset.from_list(identity_data)

# Stream the chat subset, then materialize at most 20000 rows in memory.
ds_chat = load_dataset("HuggingFaceTB/smoltalk", "everyday-conversations", split="train", streaming=True)
ds_chat = Dataset.from_list(list(ds_chat.take(20000)))

# Mix both sources and shuffle with a fixed seed.
combined = concatenate_datasets([ds_chat, ds_identity]).shuffle(seed=42)

with open("phase2.txt", "w", encoding="utf-8") as f:
    for example in tqdm(combined):
        pieces = []
        for msg in example['messages']:
            role = msg['role']
            # Only user and assistant turns are written; other roles are skipped.
            if role == 'user':
                pieces.append(f"<|user|>\n{msg['content']}\n")
            elif role == 'assistant':
                pieces.append(f"<|assistant|>\n{msg['content']}\n")

        # Close the conversation with the end-of-text marker.
        pieces.append("<|endoftext|>\n")
        f.write("".join(pieces))

## Model Implementation

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    """Single head of causal (masked) self-attention.

    Args:
        head_size: width of the key/query/value projections.
        n_embd: width of the incoming embeddings.
        block_size: longest sequence the causal mask is built for.
    """

    def __init__(self, head_size, n_embd, block_size):
        super().__init__()

        # Bias-free projections from the embedding width to the head width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Causal mask: lower-triangular ones, so a position cannot attend to
        # future tokens. Registered as a buffer so it moves with the module.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); returns (B, T, head_size)."""
        _, seq_len, _ = x.shape

        keys = self.key(x)        # (B, T, head_size)
        queries = self.query(x)   # (B, T, head_size)
        values = self.value(x)    # (B, T, head_size)

        # Scaled dot-product scores: (B, T, head_size) x (B, head_size, T) -> (B, T, T)
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (keys.shape[-1] ** 0.5)

        # Positions where the mask is zero lie in the future: push them to -inf
        # so the softmax assigns them zero weight.
        future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum over the values: (B, T, T) x (B, T, head_size)
        return torch.matmul(weights, values)


class MultiHeadAttention(nn.Module):
    """Runs several attention heads side by side and mixes their outputs.

    Args:
        num_heads: number of parallel ``Head`` modules.
        head_size: output width of each head.
        n_embd: embedding width (input and output of this layer).
        block_size: maximum sequence length, forwarded to every head.
    """

    def __init__(self, num_heads, head_size, n_embd, block_size):
        super().__init__()
        head_modules = [Head(head_size, n_embd, block_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply every head to ``x``, join on the feature axis, project, drop out."""
        per_head = [head(x) for head in self.heads]
        joined = torch.cat(per_head, dim=-1)
        projected = self.proj(joined)
        return self.dropout(projected)


class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, ReLU, project back.

    Args:
        n_embd: embedding width (input and output of the network).
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),   # expand
            nn.ReLU(),                       # non-linearity
            nn.Linear(hidden_dim, n_embd),   # project back
            nn.Dropout(0.1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the output has the same shape as ``x``."""
        return self.net(x)

class Block(nn.Module):
    """Pre-norm transformer block: self-attention followed by a feed-forward net.

    Both sub-layers are wrapped in a residual connection and are fed the
    layer-normalized input.

    Args:
        n_embd: embedding width.
        num_heads: number of attention heads; each head gets
            ``n_embd // num_heads`` features.
        block_size: maximum sequence length.
    """

    def __init__(self, n_embd, num_heads, block_size):
        super().__init__()

        self.sa = MultiHeadAttention(num_heads, n_embd // num_heads, n_embd, block_size)
        self.ffn = FeedForward(n_embd)

        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Sub-layer 1: tokens exchange information through self-attention.
        after_attention = x + self.sa(self.ln1(x))

        # Sub-layer 2: each position is transformed independently.
        return after_attention + self.ffn(self.ln2(after_attention))


class GPTLM(nn.Module):
    """Decoder-only transformer language model with tied input/output embeddings.

    Args:
        vocab_size: number of token ids.
        n_embd: embedding width.
        block_size: maximum context length (also the size of the learned
            position table).
        n_layers: number of transformer blocks.
        n_heads: attention heads per block.
        dropout: dropout probability applied to the summed token + position
            embeddings. NOTE: it is not forwarded to the transformer blocks,
            which are built without a dropout argument.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_layers, n_heads, dropout=0.1):
        super().__init__()
        self.block_size = block_size
        self.n_embd = n_embd

        # embeddings
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # transformer blocks
        self.blocks = nn.ModuleList([
            Block(n_embd=n_embd, num_heads=n_heads, block_size=block_size)
            for _ in range(n_layers)
        ])

        # final norm and linear head
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # Weight tying: the output projection shares the token-embedding matrix.
        self.lm_head.weight = self.token_embedding_table.weight

        self.dropout = nn.Dropout(dropout)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) token indices, with T <= block_size.
            targets: optional (B, T) token indices of the next tokens.

        Returns:
            ``logits`` of shape (B, T, vocab_size) when ``targets`` is None,
            otherwise ``(logits_flat, loss)`` where ``logits_flat`` has shape
            (B*T, vocab_size) and ``loss`` is the mean cross-entropy.
        """
        B, T = idx.shape
        assert T <= self.block_size, "input length T must be <= block_size"

        # token + position embeddings
        tok_emb = self.token_embedding_table(idx)          # (B, T, n_embd)
        pos = torch.arange(T, device=idx.device)           # (T,)
        pos_emb = self.position_embedding_table(pos)       # (T, n_embd)
        x = tok_emb + pos_emb                              # broadcasting -> (B, T, n_embd)
        x = self.dropout(x)

        # transformer blocks
        for block in self.blocks:
            x = block(x)

        # final norm, linear head to vocab
        x = self.ln_f(x)                                   # (B, T, n_embd)
        logits = self.lm_head(x)                           # (B, T, vocab_size)

        if targets is None:
            return logits

        # Flatten to (B*T, C). reshape (not view) so non-contiguous targets,
        # e.g. a transposed tensor, are accepted as well.
        B, T, C = logits.shape
        logits_flat = logits.reshape(B * T, C)
        targets_flat = targets.reshape(B * T)
        loss = F.cross_entropy(logits_flat, targets_flat)
        return logits_flat, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None, stop_token_ids=None):
        """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) prompt token indices.
            max_new_tokens: upper bound on the number of generated tokens.
            temperature: softmax temperature; ``0`` selects greedy (argmax)
                decoding instead of sampling.
            top_k: when set, sample only among the ``top_k`` most likely tokens.
            stop_token_ids: optional collection of token ids that end
                generation. The stop token itself is not appended when it ends
                generation. With a batch, generation ends once every row has
                produced a stop token; rows that finished earlier are padded
                with the stop token they produced.

        Returns:
            (B, T + n) tensor holding the prompt followed by the new tokens.

        The module's train/eval mode is left untouched, so call ``eval()``
        first if dropout should be disabled while generating.
        """
        n_rows = idx.size(0)
        stop_ids = None
        if stop_token_ids:
            stop_ids = torch.tensor(list(stop_token_ids), dtype=torch.long, device=idx.device)
            finished = torch.zeros(n_rows, dtype=torch.bool, device=idx.device)
            pad_token = torch.zeros((n_rows, 1), dtype=torch.long, device=idx.device)

        for _ in range(max_new_tokens):
            # Keep at most the last block_size tokens as context.
            idx_cond = idx[:, -self.block_size:]

            logits = self(idx_cond)
            logits = logits[:, -1, :]

            if temperature == 0:
                # Greedy decoding; dividing the logits by zero would yield NaNs.
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                if temperature != 1.0:
                    logits = logits / temperature

                if top_k is not None:
                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                    min_topk = v[:, -1].unsqueeze(-1)
                    logits = torch.where(logits < min_topk, torch.full_like(logits, -1e10), logits)

                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            if stop_ids is not None:
                # Rows that already stopped keep repeating their stop token.
                next_token = torch.where(finished.unsqueeze(1), pad_token, next_token)
                is_stop = torch.isin(next_token.squeeze(1), stop_ids)
                newly_stopped = is_stop & ~finished
                pad_token = torch.where(newly_stopped.unsqueeze(1), next_token, pad_token)
                finished = finished | is_stop
                if bool(finished.all()):
                    break

            idx = torch.cat((idx, next_token), dim=1)

        return idx

## Load model, dataset, tokenizer

In [2]:
import torch
from tokenizers import Tokenizer

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

tokenizer = Tokenizer.from_file("/kaggle/working/my_tokenizer.json")

# Both token tensors stay on the CPU; get_batch moves each batch to `device`.
train_data = torch.load("/kaggle/working/train_data.pt", map_location='cpu')
val_data = torch.load("/kaggle/working/val_data.pt", map_location='cpu')

# Batch geometry: sequences per batch and tokens per sequence.
batch_size, block_size = 16, 512
vocab_size = tokenizer.get_vocab_size()

def get_batch(split):
    """Sample a random batch of (input, target) windows.

    ``split == "train"`` draws from ``train_data``; any other value draws from
    ``val_data``. Targets are the inputs shifted one token to the right.
    Returns two (batch_size, block_size) tensors moved to ``device``.
    """
    source = val_data if split != "train" else train_data
    # One random window start per batch row.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    labels = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), labels.to(device)

# Architecture hyperparameters, collected in one place.
model_config = dict(
    vocab_size=vocab_size,
    n_embd=768,
    block_size=block_size,
    n_layers=12,
    n_heads=12,
    dropout=0.2,
)
model = GPTLM(**model_config).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=6e-4, weight_decay=0.1)

# Report the parameter count in millions.
n_params = sum(p.numel() for p in model.parameters())
print("Model parameters:", n_params / 1e6, "M")

Device: cuda
Model parameters: 97.709568 M


## Continue training

In [None]:
import os
import torch
from kaggle_secrets import UserSecretsClient
from huggingface_hub import HfApi, upload_file, login, hf_hub_download

user_secrets = UserSecretsClient()
token = user_secrets.get_secret("HF_TOKEN")
login(token=token)

api = HfApi()

# Fall back to the CPU when no GPU is visible instead of failing in torch.load.
device = "cuda" if torch.cuda.is_available() else "cpu"
MAX_ITERS = 100000

LOCAL_CKPT = "/kaggle/working/checkpoint.pth"
LOCAL_LOG = "/kaggle/working/train_log.txt"

HF_REPO = "viraj231/gpt-100m"
HF_CKPT = "checkpoint.pth"
HF_LOG = "train_log.txt"

# Resume from the local checkpoint when present, otherwise fetch it from the Hub.
if os.path.exists(LOCAL_CKPT):
    checkpoint_path = LOCAL_CKPT
else:
    checkpoint_path = hf_hub_download(
        repo_id=HF_REPO,
        repo_type="model",
        filename=HF_CKPT
    )

# Deserialize once and reuse the result both for the local copy and for resuming.
checkpoint = torch.load(checkpoint_path, map_location=device)
if checkpoint_path != LOCAL_CKPT:
    torch.save(checkpoint, LOCAL_CKPT)

# Same for the training log: keep appending to the local copy.
if os.path.exists(LOCAL_LOG):
    log_path = LOCAL_LOG
else:
    log_path = hf_hub_download(
        repo_id=HF_REPO,
        repo_type="model",
        filename=HF_LOG
    )
    with open(log_path, "r") as src, open(LOCAL_LOG, "w") as dst:
        dst.write(src.read())
    log_path = LOCAL_LOG

model.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
# NOTE(review): the checkpoint stores the step that was just completed, so
# resuming from it runs that step (and its eval/save/upload) once more.
start_step = checkpoint["step"]

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=MAX_ITERS,
    eta_min=6e-5,
    last_epoch=start_step - 1
)

# NOTE(review): async_upload comes from the "HF async upload" cell, where it
# uploads with repo_type="dataset" and reads the module-level HF_REPO, while
# this cell downloads with repo_type="model". Confirm the uploaded checkpoint
# ends up where the next session resumes from.

# The context manager closes the log file even if training is interrupted.
with open(LOCAL_LOG, "a") as logfile:
    for step in range(start_step, MAX_ITERS + 1):
        model.train()
        xb, yb = get_batch("train")
        logits, loss = model(xb, yb)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        if step % 200 == 0:
            # Validation loss on a single random batch, with dropout disabled.
            model.eval()
            with torch.no_grad():
                vx, vy = get_batch("val")
                v_logits, v_loss = model(vx, vy)

            msg = f"Step {step}: Train Loss {loss.item():.4f}, Val Loss {v_loss.item():.4f}"
            print(msg)
            logfile.write(msg + "\n")
            # Flush before uploading so the uploaded log includes this line.
            logfile.flush()

            async_upload(LOCAL_LOG, filename=HF_LOG)

            torch.save({
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "step": step
            }, LOCAL_CKPT)

            async_upload(LOCAL_CKPT, filename=HF_CKPT)

            print("Checkpoint saved.")


## Training tokenizer

In [None]:
import torch
import numpy as np
import os
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, normalizers
from huggingface_hub import HfApi, HfFolder

# Corpus to tokenize and the scratch directory for every intermediate file.
INPUT_FILE = "phase1.txt"
TEMP_DIR = "/kaggle/temp/llm_all"
os.makedirs(TEMP_DIR, exist_ok=True)

SHARD_SIZE_BYTES = 250 * 1024 * 1024   # size limit per token shard (250 MiB)
BATCH_SIZE = 50000                     # lines encoded per encode_batch call

api = HfApi()
token = HfFolder.get_token()
repo = "viraj231/gpt"

# Byte-level BPE tokenizer with NFKC normalization.
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

# The order of this list is passed to the trainer unchanged.
SPECIAL_TOKENS = ["<|endoftext|>", "<|pad|>", "<|user|>", "<|assistant|>", "<|system|>"]

trainer = trainers.BpeTrainer(
    vocab_size=16000,
    min_frequency=2,
    special_tokens=SPECIAL_TOKENS,
)

def get_training_corpus():
    """Yield the lines of ``INPUT_FILE`` one at a time, newline included."""
    with open(INPUT_FILE, "r", encoding="utf-8") as corpus:
        yield from corpus

# Train on the streamed corpus, then persist the tokenizer next to the shards.
tokenizer_path = os.path.join(TEMP_DIR, "my_tokenizer.json")
tokenizer.train_from_iterator(get_training_corpus(), trainer)
tokenizer.save(tokenizer_path)

# Publish the tokenizer file to the dataset repo.
api.upload_file(
    path_or_fileobj=tokenizer_path,
    path_in_repo="my_tokenizer.json",
    repo_id=repo,
    repo_type="dataset",
    token=token,
)

# Token ids are stored as uint16, so every id must fit into 16 bits.
if tokenizer.get_vocab_size() > np.iinfo(np.uint16).max + 1:
    raise ValueError("vocabulary does not fit into uint16 token shards")


def _upload_shard(path):
    """Upload one finished shard to ``shards/`` in the dataset repo, then delete it locally."""
    api.upload_file(
        path_or_fileobj=path,
        path_in_repo=f"shards/{os.path.basename(path)}",
        repo_id=repo,
        repo_type="dataset",
        token=token
    )
    os.remove(path)


def _encode_lines(lines):
    """Tokenize ``lines`` and return all ids concatenated as raw uint16 bytes."""
    enc = tokenizer.encode_batch(lines)
    ids = np.concatenate([np.array(e.ids, dtype=np.uint16) for e in enc])
    return ids.tobytes()


shard_index = 0
current_shard_bytes = 0
shard_path = os.path.join(TEMP_DIR, f"clean_{shard_index:05d}.bin")
# "wb" rather than "ab": a shard left over from an earlier run must be
# overwritten, not extended with duplicate tokens.
shard_file = open(shard_path, "wb")
batch_lines = []

try:
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            batch_lines.append(line)
            if len(batch_lines) < BATCH_SIZE:
                continue

            b = _encode_lines(batch_lines)
            batch_lines = []

            # Start a new shard when this batch would overflow the current one.
            # An empty shard is never rotated, so no zero-byte file is uploaded.
            if current_shard_bytes and current_shard_bytes + len(b) > SHARD_SIZE_BYTES:
                shard_file.close()
                _upload_shard(shard_path)
                shard_index += 1
                current_shard_bytes = 0
                shard_path = os.path.join(TEMP_DIR, f"clean_{shard_index:05d}.bin")
                shard_file = open(shard_path, "wb")

            shard_file.write(b)
            current_shard_bytes += len(b)

    # Trailing lines that did not fill a whole batch go into the current shard.
    if batch_lines:
        b = _encode_lines(batch_lines)
        shard_file.write(b)
        current_shard_bytes += len(b)
        batch_lines = []
finally:
    # Release the handle even when encoding or an upload fails midway.
    shard_file.close()

_upload_shard(shard_path)

# List the repo's files by name. `HfApi.list_files_info` is no longer available
# in current huggingface_hub releases; `list_repo_files` returns plain paths.
# NOTE(review): every file under shards/ is used, including shards uploaded by
# earlier runs - clear the folder in the repo if that is not intended.
shard_names = [
    name
    for name in api.list_repo_files(repo, repo_type="dataset", token=token)
    if name.startswith("shards/")
]

local_shards = []
for name in shard_names:
    path = api.hf_hub_download(
        repo_id=repo,
        filename=name,
        repo_type="dataset",
        local_dir=TEMP_DIR,
        token=token
    )
    local_shards.append(path)

all_tokens = []
for s in sorted(local_shards):
    if os.path.getsize(s) == 0:
        # np.memmap cannot map an empty file; an empty shard holds no tokens.
        continue
    raw = np.memmap(s, dtype=np.uint16, mode="r")
    # Widen to int64 for the training tensors.
    all_tokens.append(torch.from_numpy(raw.astype(np.int64)))

if not all_tokens:
    raise RuntimeError(f"no token shards found under shards/ in {repo}")

data = torch.cat(all_tokens, dim=0)

# 90/10 split: the tail of the corpus becomes the validation set.
n = int(0.9 * len(data))
# clone() gives each split its own storage, so saving a split does not also
# write out the storage of the full `data` tensor it was sliced from.
train = data[:n].clone().cpu()
val = data[n:].clone().cpu()

train_path = os.path.join(TEMP_DIR, "train_data.pt")
val_path = os.path.join(TEMP_DIR, "val_data.pt")

torch.save(train, train_path, _use_new_zipfile_serialization=False)
torch.save(val, val_path, _use_new_zipfile_serialization=False)

for local_path, remote_name in ((train_path, "train_data.pt"), (val_path, "val_data.pt")):
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=remote_name,
        repo_id=repo,
        repo_type="dataset",
        token=token
    )

print("done")


## Hugging Face login

In [None]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

# Read the Hugging Face token from the Kaggle secret store and log in with it.
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("HF_TOKEN")

login(token=token)


## Fetch datasets from Hugging Face

In [None]:
from huggingface_hub import hf_hub_download
import shutil

repo = "viraj231/gpt"

train_dst = "/kaggle/working/train_data.pt"
val_dst   = "/kaggle/working/val_data.pt"
# The model-loading cell reads the tokenizer from this path, so fetch it
# together with the token tensors instead of relying on a manual copy.
tokenizer_dst = "/kaggle/working/my_tokenizer.json"

# (file name in the dataset repo, local destination)
artifacts = (
    ("train_data.pt", train_dst),
    ("val_data.pt", val_dst),
    ("my_tokenizer.json", tokenizer_dst),
)

for remote_name, local_dst in artifacts:
    # Download the file (or get its cached path), then copy it to the destination.
    cached = hf_hub_download(
        repo_id=repo,
        filename=remote_name,
        repo_type="dataset"
    )
    shutil.copy(cached, local_dst)

print("files downloaded in /kaggle/working")


## HF async upload

In [None]:
import threading
from huggingface_hub import HfApi, upload_file, login
from kaggle_secrets import UserSecretsClient

# Authenticate with the token stored in the Kaggle secret store.
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("HF_TOKEN")
login(token=token)

# Repo that async_upload falls back to, plus a shared API client.
HF_REPO = "viraj231/gpt"
api = HfApi()

def async_upload(filepath, filename=None, repo_id=None, repo_type="dataset"):
    """Upload ``filepath`` to the Hugging Face Hub on a background thread.

    Args:
        filepath: local path of the file to upload.
        filename: path inside the repo; defaults to the base name of
            ``filepath``.
        repo_id: target repo; defaults to the module-level ``HF_REPO`` as it is
            at call time.
        repo_type: Hub repo type of the target, ``"dataset"`` by default.

    Returns:
        The started daemon ``threading.Thread``; ``join()`` it to wait for the
        upload, e.g. before the session ends.

    Failures are caught and printed, so a failed upload never interrupts the
    caller.
    """
    import os  # local import: this cell does not import os at the top

    if filename is None:
        filename = os.path.basename(filepath)
    target_repo = HF_REPO if repo_id is None else repo_id

    def _upload():
        try:
            upload_file(
                path_or_fileobj=filepath,
                path_in_repo=filename,
                repo_id=target_repo,
                repo_type=repo_type
            )
            print(f"[async] uploaded {filename}")
        except Exception as e:
            # Best effort by design: report the problem and carry on.
            print(f"[async upload error] {e}")

    # Daemon thread: a pending upload does not keep the interpreter alive.
    t = threading.Thread(target=_upload, daemon=True)
    t.start()
    return t
