## 1. Check GPU

## 1. Setup - Clone Repo & Install Dependencies

In [None]:
# Install dependencies first
%uv pip install -q tiktoken transformers datasets tqdm torch

# Now import and setup
import os
import sys

# Clone repo if running on Colab or Lightning AI (where model.py might be missing)
if not os.path.exists('model.py'):
    print("Cloning repository...")
    !git clone https://github.com/RealAndrewRen/nanoGPT.git
    # Add the cloned repo to the python path so imports work
    sys.path.append(os.path.abspath('nanoGPT'))
    # Also change directory to the repo so relative paths work
    os.chdir('nanoGPT')

import tiktoken
import transformers
import datasets
import tqdm
print(f"Current working directory: {os.getcwd()}")
print("‚úÖ Setup complete!")

Note: you may need to restart the kernel to use updated packages.
Cloning repository...
fatal: destination path 'nanoGPT' already exists and is not an empty directory.
Current working directory: /root/nanoGPT
‚úÖ Setup complete!


In [None]:
# Check GPU
!nvidia-smi

import torch
print(f"\nPyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Thu Dec  4 07:13:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 NVL                On  |   00000001:00:00.0 Off |                    0 |
| N/A   45C    P0             63W /  400W |       0MiB /  95830MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                     

## 2. Create Folders for Your Files

In [None]:
import os

# Working directories: SFT data, pretrained checkpoint, fine-tuned output.
# exist_ok=True makes this cell safe to re-run.
for folder in ('sft/data/finance_data', 'out-finance_data', 'out-sft'):
    os.makedirs(folder, exist_ok=True)

# Upload instructions, printed one line at a time.
upload_instructions = [
    "üìÅ Folders created!",
    "",
    "Now upload your files:",
    "  sft/data/finance_data/  ‚Üê train.bin, train_mask.bin, val.bin, val_mask.bin",
    "  out-finance_data/       ‚Üê ckpt.pt (Pretrained model)",
    "",
    "Use the file browser on the left or drag-and-drop to upload files.",
]
for line in upload_instructions:
    print(line)

üìÅ Folders created!

Now upload your files:
  sft/data/finance_data/  ‚Üê train.bin, train_mask.bin, val.bin, val_mask.bin
  out-finance_data/       ‚Üê ckpt.pt (Pretrained model)

Use the file browser on the left or drag-and-drop to upload files.


## 3. Verify Files Are In Place

In [None]:
import os
import numpy as np

# Verify that the SFT data files and a pretrained checkpoint are present,
# that token and mask files line up, and print a few dataset statistics.
sft_dir = 'sft/data/finance_data'
files_needed = ['train.bin', 'train_mask.bin', 'val.bin', 'val_mask.bin']

print("SFT Data:")
all_ok = True
for f in files_needed:
    path = os.path.join(sft_dir, f)
    if os.path.exists(path):
        size = os.path.getsize(path) / (1024*1024)
        print(f"  ‚úÖ {f} ({size:.2f} MB)")
    else:
        print(f"  ‚ùå MISSING: {f}")
        all_ok = False

# Token files are read as uint16 (2 bytes per token) and mask files as uint8
# (1 byte per token), so an aligned pair has token_bytes // 2 == mask_bytes.
# The training data loader indexes both files with the same offsets, so a
# mismatch would silently pair tokens with the wrong mask entries.
if all_ok:
    for split in ('train', 'val'):
        n_tokens = os.path.getsize(os.path.join(sft_dir, f'{split}.bin')) // 2
        n_mask = os.path.getsize(os.path.join(sft_dir, f'{split}_mask.bin'))
        if n_tokens == 0:
            print(f"  EMPTY: {split}.bin contains no tokens")
            all_ok = False
        elif n_tokens != n_mask:
            print(f"  LENGTH MISMATCH: {split}.bin has {n_tokens:,} tokens "
                  f"but {split}_mask.bin has {n_mask:,} entries")
            all_ok = False

print("\nPretrained Checkpoint:")
ckpt_locations = [
    'out-finance_data/ckpt.pt',
    'out-finance-char/ckpt.pt'
]

ckpt_found = False
for loc in ckpt_locations:
    if os.path.exists(loc):
        size = os.path.getsize(loc) / (1024*1024)
        print(f"  ‚úÖ {loc} ({size:.2f} MB)")
        ckpt_found = True
        break

if not ckpt_found:
    print(f"  ‚ùå MISSING: ckpt.pt (expected in out-finance_data/ or out-finance-char/)")
    all_ok = False

if all_ok:
    train_path = os.path.join(sft_dir, 'train.bin')
    # The token count comes from the file size, so the full token file never
    # has to be loaded into memory.
    n_train_tokens = os.path.getsize(train_path) // 2
    train_mask = np.fromfile(os.path.join(sft_dir, 'train_mask.bin'), dtype=np.uint8)
    print(f"\nüìä Dataset Statistics:")
    print(f"Train tokens: {n_train_tokens:,}")
    print(f"Mask coverage: {train_mask.sum() / len(train_mask) * 100:.1f}% assistant tokens")

    # Quick check for [CLEANED] tokens: decode only the first 10,000 tokens
    import tiktoken
    enc = tiktoken.get_encoding("gpt2")
    sample = np.fromfile(train_path, dtype=np.uint16, count=min(10000, n_train_tokens))
    text = enc.decode(sample.tolist())
    cleaned_count = text.count('[CLEANED]')

    if cleaned_count > 0:
        print(f"\n‚ö†Ô∏è WARNING: Found {cleaned_count} [CLEANED] tokens in sample!")
        print("Your training data may be corrupted. Consider regenerating with prepare_clean.py")
    else:
        print("\n‚úÖ Data looks clean! Ready to train!")
else:
    print("\n‚ö†Ô∏è Please upload the missing files before training.")

SFT Data:
  ‚úÖ train.bin (136.63 MB)
  ‚úÖ train_mask.bin (68.31 MB)
  ‚úÖ val.bin (15.27 MB)
  ‚úÖ val_mask.bin (7.64 MB)

Pretrained Checkpoint:
  ‚úÖ out-finance_data/ckpt.pt (1417.50 MB)

üìä Dataset Statistics:
Train tokens: 71,632,581
Mask coverage: 50.3% assistant tokens

‚úÖ Data looks clean! Ready to train!


## 4. Training Configuration

## 3.5 Upload Your Files

Upload these files to their respective directories:

**SFT Data** → `sft/data/finance_data/`:
- `train.bin`
- `train_mask.bin`
- `val.bin`
- `val_mask.bin`

**Pretrained Model** → `out-finance_data/`:
- `ckpt.pt` (your pretrained checkpoint)

You can upload files using the file browser in the sidebar or drag-and-drop.

In [None]:
# ============================================================
# SFT TRAINING CONFIG - Optimized for H100 GPU
# ============================================================
import os

PRETRAINED_CKPT = 'out-finance_data/ckpt.pt'
SFT_DATA_DIR = 'sft/data/finance_data'
OUTPUT_DIR = 'out-sft'

# Settings tuned for an H100-class GPU; lower BATCH_SIZE on smaller cards.
BATCH_SIZE = 64        # Sequences per micro-batch
GRADIENT_ACCUM = 2     # Effective batch size of 128 (64 * 2)
BLOCK_SIZE = 256       # MUST match pretrained model's block_size!
TARGET_EPOCHS = 2      # Target number of epochs (1-3 is standard for SFT)

EVAL_INTERVAL = 250
LOG_INTERVAL = 10

LEARNING_RATE = 5e-6
MIN_LR = 1e-6
WARMUP_ITERS = 100

DROPOUT = 0.1
WEIGHT_DECAY = 0.1

COMPILE = True         # Enable torch.compile

# Calculate MAX_ITERS based on dataset size
tokens_per_step = BATCH_SIZE * GRADIENT_ACCUM * BLOCK_SIZE
train_bin_path = os.path.join(SFT_DATA_DIR, 'train.bin')

if os.path.exists(train_bin_path):
    # train.bin is uint16 (2 bytes per token)
    total_tokens = os.path.getsize(train_bin_path) // 2
    # A dataset smaller than one optimizer step would otherwise give
    # 0 steps per epoch and MAX_ITERS == 0; run at least one step per epoch.
    steps_per_epoch = max(1, total_tokens // tokens_per_step)
    MAX_ITERS = steps_per_epoch * TARGET_EPOCHS
    print(f"Dataset size: {total_tokens/1e6:.1f}M tokens")
    print(f"Steps per epoch: {steps_per_epoch}")
else:
    print("‚ö†Ô∏è train.bin not found, using default MAX_ITERS")
    MAX_ITERS = 5000 # Fallback

print(f"Effective batch size: {tokens_per_step:,} tokens")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Max iterations: {MAX_ITERS} (Target: {TARGET_EPOCHS} epochs)")
print(f"Compilation: {'Enabled' if COMPILE else 'Disabled'}")

# Report the GPU that is actually present instead of assuming an H100.
import torch

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    if 'H100' in gpu_name:
        print(f"üöÄ H100 GPU detected - optimized for maximum throughput!")
    else:
        print(f"GPU detected: {gpu_name} - these settings were tuned for an H100; "
              f"lower BATCH_SIZE if you run out of memory.")
else:
    print("No CUDA GPU detected - training will fall back to CPU and be very slow.")

Dataset size: 71.6M tokens
Steps per epoch: 2186
Effective batch size: 32,768 tokens
Learning rate: 5e-06
Max iterations: 4372 (Target: 2 epochs)
Compilation: Enabled
üöÄ H100 GPU detected - optimized for maximum throughput!


## 5. Run SFT Training

In [None]:
import os
import time
import math
from contextlib import nullcontext

import numpy as np
import torch
import torch.nn.functional as F

from model import GPTConfig, GPT

# Choose the compute device and the autocast precision.
cuda_available = torch.cuda.is_available()
device = 'cuda' if cuda_available else 'cpu'
if cuda_available and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]

# Autocast is only used on CUDA; on CPU the context manager does nothing.
if device == 'cuda':
    ctx = torch.amp.autocast(device_type='cuda', dtype=ptdtype)
else:
    ctx = nullcontext()

# Fixed seed, and TF32 enabled for matmul and cuDNN.
torch.manual_seed(1337)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Device: {device}, Dtype: {dtype}")
print(f"Block size: {BLOCK_SIZE}")

# Data loader with bounds and alignment validation
def get_batch(split):
    """Sample a random batch of token windows and their loss masks.

    Args:
        split: 'train' reads train.bin / train_mask.bin; any other value
            reads val.bin / val_mask.bin (all under SFT_DATA_DIR).

    Returns:
        (x, y, m) on `device`, each of shape (BATCH_SIZE, BLOCK_SIZE):
        x is int64 input tokens, y is the same window shifted one token to
        the right (next-token targets), and m is the float32 mask taken at
        the target positions.

    Raises:
        ValueError: if the split has fewer than BLOCK_SIZE + 1 tokens, or if
            the mask file is shorter than the token file.
    """
    prefix = 'train' if split == 'train' else 'val'
    # The memmaps are re-opened on every call, as in the original loader.
    data = np.memmap(os.path.join(SFT_DATA_DIR, f'{prefix}.bin'), dtype=np.uint16, mode='r')
    mask = np.memmap(os.path.join(SFT_DATA_DIR, f'{prefix}_mask.bin'), dtype=np.uint8, mode='r')

    # Ensure we have enough data
    if len(data) <= BLOCK_SIZE:
        raise ValueError(f"Dataset too small! Has {len(data)} tokens, need at least {BLOCK_SIZE + 1}")
    # Token and mask files are indexed with the same offsets, so the mask
    # must cover every token position.
    if len(mask) < len(data):
        raise ValueError(
            f"Mask for '{prefix}' has {len(mask)} entries but the data has {len(data)} tokens"
        )

    # Valid window starts are 0 .. len(data) - BLOCK_SIZE - 1 inclusive (the
    # target window ends at start + BLOCK_SIZE + 1 <= len(data)).
    # torch.randint's upper bound is exclusive, so pass the COUNT of valid
    # starts; this also works when exactly BLOCK_SIZE + 1 tokens are present.
    num_starts = len(data) - BLOCK_SIZE
    ix = torch.randint(0, num_starts, (BATCH_SIZE,)).tolist()

    # Build batches with explicit size control
    x = torch.stack([torch.from_numpy(data[i:i+BLOCK_SIZE].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i+1:i+1+BLOCK_SIZE].astype(np.int64)) for i in ix])
    m = torch.stack([torch.from_numpy(mask[i+1:i+1+BLOCK_SIZE].astype(np.float32)) for i in ix])

    # Validate shapes
    assert x.shape == (BATCH_SIZE, BLOCK_SIZE), f"X shape mismatch: {x.shape}"
    assert y.shape == (BATCH_SIZE, BLOCK_SIZE), f"Y shape mismatch: {y.shape}"
    assert m.shape == (BATCH_SIZE, BLOCK_SIZE), f"M shape mismatch: {m.shape}"

    return x.to(device), y.to(device), m.to(device)

# Load pretrained model
print(f"\nLoading pretrained model from {PRETRAINED_CKPT}...")
# NOTE(review): depending on the PyTorch version's `weights_only` default,
# torch.load may unpickle arbitrary objects - only load checkpoints you trust.
checkpoint = torch.load(PRETRAINED_CKPT, map_location=device)
model_args = checkpoint['model_args']
model_args['dropout'] = DROPOUT  # override the stored dropout for fine-tuning

gptconf = GPTConfig(**model_args)
model = GPT(gptconf)

# Strip the '_orig_mod.' prefix from parameter names (presumably left by
# saving a torch.compile-wrapped model) so the keys match a plain GPT module.
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k, v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

model.load_state_dict(state_dict)
model.to(device)
print(f"Model loaded! Parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Model block size: {model.config.block_size}")

# Fail early with a clear message instead of deep inside the first forward pass.
if BLOCK_SIZE > model.config.block_size:
    raise ValueError(
        f"BLOCK_SIZE ({BLOCK_SIZE}) exceeds the pretrained model's block_size "
        f"({model.config.block_size}); lower BLOCK_SIZE in the config cell."
    )

# Pass the selected device rather than a hard-coded 'cuda' so the CPU fallback
# chosen earlier is honoured (the last argument is presumably the device type
# used to decide on a fused optimizer - confirm against model.py).
optimizer = model.configure_optimizers(WEIGHT_DECAY, LEARNING_RATE, (0.9, 0.95), device)
# Loss scaling is only needed for float16 autocast on CUDA; autocast is not
# used on CPU, so the scaler is disabled there.
scaler = torch.amp.GradScaler('cuda', enabled=(device == 'cuda' and dtype == 'float16'))

# Masked loss function
def compute_masked_loss(logits, targets, mask):
    """Cross-entropy averaged over the positions selected by `mask`.

    Args:
        logits: tensor of shape (B, T, V).
        targets: tensor of shape (B, T) holding class indices.
        mask: tensor of shape (B, T) multiplied into the per-token loss
            (presumably 1.0 on assistant tokens and 0.0 elsewhere).

    Returns:
        Scalar tensor: sum(loss * mask) / sum(mask) when the mask sum is
        positive, otherwise the plain mean of the per-token loss.
    """
    batch, seq_len, vocab = logits.shape
    flat_logits = logits.view(batch * seq_len, vocab)
    flat_targets = targets.view(batch * seq_len)
    token_losses = F.cross_entropy(flat_logits, flat_targets, reduction='none').view(batch, seq_len)

    weighted = token_losses * mask
    mask_total = mask.sum()

    # With nothing selected by the mask, fall back to the unmasked mean.
    if not (mask_total > 0):
        return token_losses.mean()
    return weighted.sum() / mask_total

@torch.no_grad()
def estimate_loss(eval_iters=50):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y, M = get_batch(split)
            with ctx:
                logits, _ = model(X)  # Model returns (logits, None) when no targets
                loss = compute_masked_loss(logits, Y, M)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_lr(it):
    """Learning rate for iteration `it`: linear warmup, then cosine decay.

    Args:
        it: zero-based iteration number.

    Returns:
        * it < WARMUP_ITERS: LEARNING_RATE scaled linearly by
          (it + 1) / (WARMUP_ITERS + 1).
        * it >= MAX_ITERS: MIN_LR.
        * otherwise: cosine interpolation from LEARNING_RATE (at
          WARMUP_ITERS) down towards MIN_LR (at MAX_ITERS).
    """
    if it < WARMUP_ITERS:
        return LEARNING_RATE * (it + 1) / (WARMUP_ITERS + 1)
    # `>=` instead of `>`: at it == MAX_ITERS the cosine term is 0 and the
    # result is MIN_LR anyway, and returning here avoids a ZeroDivisionError
    # below when MAX_ITERS == WARMUP_ITERS.
    if it >= MAX_ITERS:
        return MIN_LR
    decay_ratio = (it - WARMUP_ITERS) / (MAX_ITERS - WARMUP_ITERS)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return MIN_LR + coeff * (LEARNING_RATE - MIN_LR)

print(f"\n{'='*50}")
print("STARTING SFT TRAINING")
print(f"{'='*50}")

iter_num = 0
best_val_loss = 1e9
ckpt_saved = False  # True once a fine-tuned checkpoint has been written
t0 = time.time()
X, Y, M = get_batch('train')

# Compilation is deferred to the first evaluation after step 0
compiled = False

while iter_num <= MAX_ITERS:
    lr = get_lr(iter_num)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Always evaluate on the last iteration too, so the final weights get a
    # chance to be checkpointed even when MAX_ITERS is not a multiple of
    # EVAL_INTERVAL.
    is_final_iter = iter_num == MAX_ITERS
    if iter_num % EVAL_INTERVAL == 0 or is_final_iter:
        losses = estimate_loss()
        print(f"\nStep {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # Compile once, at the first evaluation after step 0 (pointless on
        # the final iteration, where no further training step runs).
        if not compiled and COMPILE and iter_num > 0 and not is_final_iter:
            print("Compiling model for faster training...")
            model = torch.compile(model)
            compiled = True

        # The step-0 evaluation is only a baseline for the untouched
        # pretrained weights: it must not become best_val_loss, otherwise a
        # run whose validation loss never beats it would save nothing.
        if iter_num > 0 and losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            ckpt = {
                # Save the uncompiled module's weights so the keys carry no
                # '_orig_mod.' prefix.
                'model': model._orig_mod.state_dict() if compiled else model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'model_args': model_args,
                'iter_num': iter_num,
                'best_val_loss': best_val_loss,
            }
            torch.save(ckpt, os.path.join(OUTPUT_DIR, 'ckpt.pt'))
            ckpt_saved = True
            print(f"üíæ Saved checkpoint (val_loss: {best_val_loss:.4f})")

    # Stop right after the final evaluation: a further optimizer step would
    # produce weights that are never evaluated or saved.
    if is_final_iter:
        break

    for micro_step in range(GRADIENT_ACCUM):
        with ctx:
            logits, _ = model(X)  # Model returns (logits, None) when no targets
            # Divide so the accumulated gradient is the mean over micro-steps.
            loss = compute_masked_loss(logits, Y, M) / GRADIENT_ACCUM
        # Fetch the next batch before running the backward pass.
        X, Y, M = get_batch('train')
        scaler.scale(loss).backward()

    # Unscale before clipping so the threshold applies to the true gradients.
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    t1 = time.time()
    dt = t1 - t0
    t0 = t1

    if iter_num % LOG_INTERVAL == 0:
        # Undo the accumulation scaling; this is the last micro-step's loss.
        lossf = loss.item() * GRADIENT_ACCUM
        print(f"iter {iter_num}: loss {lossf:.4f}, lr {lr:.2e}, time {dt*1000:.0f}ms")

    iter_num += 1

print(f"\n{'='*50}")
if ckpt_saved:
    print(f"‚úÖ TRAINING COMPLETE! Best val loss: {best_val_loss:.4f}")
    print(f"Model saved to: {OUTPUT_DIR}/ckpt.pt")
else:
    print("TRAINING FINISHED, but no checkpoint was saved "
          "(no evaluation after step 0 produced a usable validation loss).")
print(f"{'='*50}")

Device: cuda, Dtype: bfloat16
Block size: 256

Loading pretrained model from out-finance_data/ckpt.pt...
number of parameters: 123.65M
Model loaded! Parameters: 123,849,984
Model block size: 256
num decayed parameter tensors: 50, with 123,728,640 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True

STARTING SFT TRAINING

Step 0: train loss 4.7159, val loss 4.7650
iter 0: loss 4.7136, lr 4.95e-08, time 3849ms
iter 10: loss 4.8235, lr 5.45e-07, time 180ms
iter 20: loss 4.4692, lr 1.04e-06, time 179ms
iter 30: loss 4.3630, lr 1.53e-06, time 180ms
iter 40: loss 4.4721, lr 2.03e-06, time 180ms
iter 50: loss 4.0233, lr 2.52e-06, time 181ms
iter 60: loss 3.5951, lr 3.02e-06, time 179ms
iter 70: loss 3.6870, lr 3.51e-06, time 180ms
iter 80: loss 3.5237, lr 4.01e-06, time 180ms
iter 90: loss 3.4842, lr 4.50e-06, time 179ms
iter 100: loss 3.4339, lr 5.00e-06, time 181ms
iter 110: loss 3.1898, lr 5.00e-06, time 181ms
iter 120: loss 3.1866, lr 5.00e-06

## 6. Test the SFT Model

In [None]:
import tiktoken

# Rebuild the model from the fine-tuned checkpoint for inference.
sft_ckpt_path = os.path.join(OUTPUT_DIR, 'ckpt.pt')
checkpoint = torch.load(sft_ckpt_path, map_location=device)

model_args = checkpoint['model_args']
model_args['dropout'] = 0.0  # no dropout when generating

gptconf = GPTConfig(**model_args)
model = GPT(gptconf)
model.load_state_dict(checkpoint['model'])
model.to(device)
model.eval()

# Tokenizer used to encode prompts and decode generated tokens.
enc = tiktoken.get_encoding('gpt2')

def generate_response(prompt, max_tokens=200, temperature=0.8):
    """Generate the assistant's reply to a single user prompt.

    The prompt is wrapped in the <user>/<assistant> chat template, passed to
    `model.generate`, and only the text produced for this turn is returned.

    Args:
        prompt: the user's question (plain text).
        max_tokens: number of new tokens requested from `model.generate`.
        temperature: sampling temperature forwarded to `model.generate`.

    Returns:
        The generated reply, cut at the first "</assistant>" (if any) and
        stripped of surrounding whitespace.
    """
    formatted = f"<user>\n{prompt}\n</user>\n<assistant>\n"
    prompt_ids = enc.encode(formatted)
    x = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

    with torch.no_grad():
        y = model.generate(x, max_tokens, temperature=temperature, top_k=50)

    output_ids = y[0].tolist()
    if output_ids[:len(prompt_ids)] == prompt_ids:
        # generate() returned prompt + continuation: keep only the new tokens.
        # Splitting the decoded text on the LAST "<assistant>" marker would
        # return the wrong turn whenever the model goes on to invent another
        # <user>/<assistant> exchange.
        response = enc.decode(output_ids[len(prompt_ids):])
    else:
        # Output does not start with the prompt: fall back to the text after
        # the first "<assistant>" marker, if one is present.
        response = enc.decode(output_ids)
        if "<assistant>" in response:
            response = response.split("<assistant>", 1)[1]

    # Keep only this turn; anything after the closing tag is discarded.
    response = response.split("</assistant>")[0]
    return response.strip()

print("‚úÖ Model ready for testing!")

number of parameters: 123.65M
‚úÖ Model ready for testing!


In [None]:
# Test with finance questions
prompts = [
    "What is compound interest?",
    "Explain the difference between stocks and bonds.",
    "What is a 401k?",
]

# One banner per question, followed by the model's answer.
for question in prompts:
    print(f"\n{'='*50}")
    print(f"USER: {question}")
    print(f"{'='*50}")
    answer = generate_response(question)
    print(f"ASSISTANT: {answer}")


USER: What is compound interest?
ASSISTANT: The compound interest of an index is that it is the rate at which they sell or pay a price, and that is the price paid on the basis of the supply and demand for that asset.
The ratio between compound interest and the nominal rate is 2.5, and as a result, it is the ratio of compound interest on the index that is the cost of money per unit of the index.
Inflation and price stability are important factors in our economic decisions. We have to make sure that prices are stable, otherwise we could run out of money.
The use of compound interest is a widely used tool in monetary policy. It is used to calculate the cost of money for governments when interest rates rise beyond the level of inflation. It has been used to calculate the cost of a bond payment and to determine the inflation rate for a particular maturity.
The use of compound interest is also used to determine the interest rate for the price of government bonds. It is used to calculate the

In [None]:
# Ask your own question!
prompt = "What should I consider when investing?"

# Echo the question first, then generate (up to 300 new tokens) and print.
print(f"USER: {prompt}\n")
answer = generate_response(prompt, max_tokens=300)
print(f"ASSISTANT: {answer}")

USER: What should I consider when investing?

ASSISTANT: When it comes to investing, there are a number of things you can consider when investing. Firstly, it's important to remember that you're investing in an asset that is not yet in your portfolio. If you can't find it, there's certainly a good chance you can't find it on your own. It's also important to have some flexibility in the amount of money you invest. You can make these suggestions for each investment.
1. Consider the asset's value and the associated cost of the investment. For example, if the asset is $500 million, you might want to consider taking an interest rate and a credit card to pay off the loan. However, you will need to consider the cost of the interest rate, credit card and even the actual cost of the investment. 2. Consider the investment's volatility. If the asset is volatile, it will likely have negative returns and you may have to invest in a different investment to find out which one you're investing in. Add

## 7. Download Trained Model

In [None]:
# Download the trained SFT model
# Rename with descriptive name
import os
import shutil

# Copy the fine-tuned checkpoint to a descriptively named file in the
# current directory, then offer it for download when running on Colab.
output_name = 'out-sft_ckpt.pt'
shutil.copy('out-sft/ckpt.pt', output_name)

print(f"‚úÖ Copied checkpoint to: {output_name}")
print(f"File size: {os.path.getsize(output_name) / (1024*1024):.2f} MB")

# If on Colab, download it
try:
    from google.colab import files
    files.download(output_name)
    print("‚¨áÔ∏è Download started!")
except ImportError:
    # Not running on Colab: the file stays in the working directory.
    print("üíæ File ready for download via file browser")
except Exception as err:
    # Colab is present but the download failed: report why instead of
    # silently swallowing the error, then point at the manual route.
    # (Unlike a bare `except:`, this lets KeyboardInterrupt through.)
    print(f"Automatic download failed: {err}")
    print("üíæ File ready for download via file browser")