<a href="https://colab.research.google.com/github/Surya-Prasad/Transformer/blob/master/Su_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Clone your repo
# REPLACE THIS URL WITH YOUR ACTUAL GITHUB REPO URL!
!git clone https://github.com/Surya-Prasad/Transformer /content/transformer
%cd /content/transformer

# 3. Create a data directory and copy the text file from your Drive
# Adjust the path to wherever your TinyStories text file is located in your Drive
os.makedirs("data", exist_ok=True)
!cp -r /content/drive/MyDrive/data/* data/

# 4. Set up the checkpoint backup folder on Drive
backup_dir = "/content/drive/MyDrive/cs336_checkpoints"
os.makedirs(backup_dir, exist_ok=True)

# 5. Install dependencies
!pip install -e .
!pip install tiktoken jaxtyping numpy torch

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path '/content/transformer' already exists and is not an empty directory.
/content/transformer
Obtaining file:///content/transformer
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: su-transformer
  Building editable for su-transformer (pyproject.toml) ... [?25l[?25hdone
  Created wheel for su-transformer: filename=su_transformer-1.0.6-py3-none-any.whl size=1362 sha256=7be017aa4c8f6afac01ebe5c45fe3bc69d4a11cbbc1570ce853d43e9f831d9dd
  Stored in directory: /tmp/pip-ephem-wheel-cache-0crl2tpa/wheels/9f/50/4d/486a3e1c0f8603ad9f457b9f79a768e1ea63c797c654bf06d8
Successfully built su-transformer
Instal

In [2]:
%%writefile prepare_custom_data.py
import numpy as np
import pickle
import os
from cs336_basics.bpe_tokenizer import train_bpe, Tokenizer

def prepare():
    input_path = "data/TinyStoriesV2-GPT4-train.txt"
    subset_path = "data/TinyStories_subset.txt"
    print("Creating a smaller subset for BPE training...")
    with open(input_path, "r", encoding="utf-8") as fin:
        with open(subset_path, "w", encoding="utf-8") as fout:
            for i in range(100000):
                line = fin.readline()
                if not line: break
                fout.write(line)

    vocab_size = 10000
    special_tokens = ["<|endoftext|>"]

    print(f"Training BPE tokenizer on SUBSET (Vocab Size: {vocab_size})...")
    vocab, merges = train_bpe(subset_path, vocab_size, special_tokens)

    with open("data/custom_bpe.pkl", "wb") as f:
        pickle.dump({"vocab": vocab, "merges": merges, "special_tokens": special_tokens}, f)
    print("Saved tokenizer vocabulary to data/custom_bpe.pkl")

    print("Tokenizing the dataset...")
    tokenizer = Tokenizer(vocab, merges, special_tokens=special_tokens)
    with open(input_path, "r", encoding="utf-8") as f:
        text = f.read()

    tokens = tokenizer.encode(text)
    print(f"Total tokens encoded: {len(tokens):,}")

    token_array = np.array(tokens, dtype=np.uint16)
    np.save("data/tinystories_tokenized.npy", token_array)
    print("Saved to data/tinystories_tokenized.npy")

if __name__ == "__main__":
    prepare()

Writing prepare_custom_data.py


In [3]:
%%writefile train_colab.py
import torch
import numpy as np
import time
import os
import shutil

from cs336_basics.modules import TransformerLM
# Based on your adapters, these are imported from training
from cs336_basics.training import AdamW, gradient_clipping, LR_cosine_schedule
from tests.adapters import run_get_batch, run_cross_entropy, run_save_checkpoint

def train():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Training on device: {device}")

    vocab_size = 10000
    context_length = 256
    d_model = 256
    num_layers = 4
    num_heads = 8
    d_ff = 1024

    batch_size = 32
    max_iters = 5000
    learning_rate = 5e-4
    min_lr = 1e-5
    warmup_iters = 100
    lr_decay_iters = 5000
    max_grad_norm = 1.0

    print("Loading custom dataset...")
    dataset = np.load("data/tinystories_tokenized.npy")

    model = TransformerLM(
        vocab_size=vocab_size, context_length=context_length,
        d_model=d_model, num_layers=num_layers,
        num_heads=num_heads, d_ff=d_ff, rope_theta=10000.0
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.1)

    os.makedirs("checkpoints", exist_ok=True)
    backup_dir = "/content/drive/MyDrive/cs336_checkpoints"

    t0 = time.time()
    for it in range(max_iters+1):
        lr = LR_cosine_schedule(it, learning_rate, min_lr, warmup_iters, lr_decay_iters)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        x, y = run_get_batch(dataset, batch_size, context_length, device)
        logits = model(x)
        loss = run_cross_entropy(logits.view(-1, vocab_size), y.view(-1))

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        gradient_clipping(model.parameters(), max_grad_norm)
        optimizer.step()

        if it % 10 == 0:
            t1 = time.time()
            dt = t1 - t0
            t0 = t1
            print(f"Iter {it} | Loss: {loss.item():.4f} | LR: {lr:.2e} | Time/10-steps: {dt:.2f}s")

        if it > 0 and it % 1000 == 0:
            ckpt_name = f"ckpt_{it}.pt"
            local_path = os.path.join("checkpoints", ckpt_name)
            run_save_checkpoint(model, optimizer, it, local_path)

            drive_path = os.path.join(backup_dir, ckpt_name)
            shutil.copy(local_path, drive_path)
            print(f"Backed up checkpoint to Drive: {drive_path}")

if __name__ == "__main__":
    train()

Writing train_colab.py


In [4]:
!python prepare_custom_data.py

Creating a smaller subset for BPE training...
Training BPE tokenizer on SUBSET (Vocab Size: 10000)...
Saved tokenizer vocabulary to data/custom_bpe.pkl
Tokenizing the dataset...
Total tokens encoded: 541,787,715
Saved to data/tinystories_tokenized.npy


In [5]:
!python train_colab.py

  rope_theta (float): The RoPE $\Theta$ parameter.
Training on device: cuda
Loading custom dataset...
Iter 0 | Loss: 9.2129 | LR: 0.00e+00 | Time/10-steps: 1.12s
Iter 10 | Loss: 9.1287 | LR: 5.00e-05 | Time/10-steps: 2.02s
Iter 20 | Loss: 8.8274 | LR: 1.00e-04 | Time/10-steps: 2.03s
Iter 30 | Loss: 8.3311 | LR: 1.50e-04 | Time/10-steps: 2.03s
Iter 40 | Loss: 7.7525 | LR: 2.00e-04 | Time/10-steps: 2.03s
Iter 50 | Loss: 7.1115 | LR: 2.50e-04 | Time/10-steps: 2.03s
Iter 60 | Loss: 6.4394 | LR: 3.00e-04 | Time/10-steps: 2.03s
Iter 70 | Loss: 5.8805 | LR: 3.50e-04 | Time/10-steps: 2.04s
Iter 80 | Loss: 5.4292 | LR: 4.00e-04 | Time/10-steps: 2.03s
Iter 90 | Loss: 5.1634 | LR: 4.50e-04 | Time/10-steps: 2.03s
Iter 100 | Loss: 4.8644 | LR: 5.00e-04 | Time/10-steps: 2.03s
Iter 110 | Loss: 4.5520 | LR: 5.00e-04 | Time/10-steps: 2.03s
Iter 120 | Loss: 4.2384 | LR: 5.00e-04 | Time/10-steps: 2.03s
Iter 130 | Loss: 4.1145 | LR: 5.00e-04 | Time/10-steps: 2.04s
Iter 140 | Loss: 4.1645 | LR: 5.00e-04 | 

In [6]:
%%writefile generate_colab.py
import torch
import pickle
from cs336_basics.modules import TransformerLM
from cs336_basics.bpe_tokenizer import Tokenizer
from tests.adapters import run_load_checkpoint

def generate(model, idx, max_new_tokens, context_length, temperature=1.0, top_k=None, top_p=None):
    model.eval()

    for _ in range(max_new_tokens):
        idx_cond = idx if idx.size(1) <= context_length else idx[:, -context_length:]

        with torch.no_grad():
            logits = model(idx_cond)

        logits = logits[:, -1, :] / temperature

        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = float('-inf')

        if top_p is not None:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)

            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
            logits[indices_to_remove] = float('-inf')

        probs = torch.nn.functional.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Generating on device: {device}")

    vocab_size = 10000
    context_length = 256
    d_model = 256
    num_layers = 4
    num_heads = 8
    d_ff = 1024

    print("Loading custom BPE tokenizer...")
    with open("data/custom_bpe.pkl", "rb") as f:
        bpe_data = pickle.load(f)

    tokenizer = Tokenizer(
        vocab=bpe_data["vocab"],
        merges=bpe_data["merges"],
        special_tokens=bpe_data["special_tokens"]
    )

    model = TransformerLM(
        vocab_size=vocab_size, context_length=context_length,
        d_model=d_model, num_layers=num_layers,
        num_heads=num_heads, d_ff=d_ff, rope_theta=10000.0
    ).to(device)

    checkpoint_path = "checkpoints/ckpt_4000.pt"
    print(f"Loading checkpoint from {checkpoint_path}...")

    dummy_optimizer = torch.optim.AdamW(model.parameters())
    run_load_checkpoint(checkpoint_path, model, dummy_optimizer)

    prompt = "Once upon a time, there was a little"
    print(f"\nPrompt: '{prompt}'\n" + "-"*40)

    input_ids = tokenizer.encode(prompt)
    x = torch.tensor([input_ids], dtype=torch.long).to(device)

    y = generate(
        model=model,
        idx=x,
        max_new_tokens=150,
        context_length=context_length,
        temperature=0.8,
        top_k=None,
        top_p=0.9
    )

    generated_text = tokenizer.decode(y[0].tolist())
    print(generated_text)
    print("-" * 40)

if __name__ == "__main__":
    main()

Writing generate_colab.py


In [7]:
!python generate_colab.py

Generating on device: cuda
Loading custom BPE tokenizer...
Loading checkpoint from checkpoints/ckpt_4000.pt...

Prompt: 'Once upon a time, there was a little'
----------------------------------------
Once upon a time, there was a little girl named Lily. Lily loved to play outside in the sunshine. One day, she saw a new bird in the sky. The bird was very pretty and pretty. Lily wanted to meet the bird, so she put her new friend in the birdcage.
Lily and the bird played together in the birdcage. They had so much fun! But then, something unexpected happened. The bird started to fly very fast. It flew very high and fast, up in a tree! Lily was scared at first, but she knew she could play too.
Lily was happy and had a fun day with the bird. She said sorry to the bird and asked for help. The bird told her that the bird was just a special bird. Lily was happy and said
----------------------------------------
