<a href="https://colab.research.google.com/github/Omarnbl/Tuned-GPT-2-on-FineWeb-EDU-with-PyTorch/blob/main/GPT2_Clone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT2 Clone Notebook



# Step 1: Install dependencies


In [None]:
!pip install -U datasets huggingface_hub fsspec


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are i

# Step 2: Import libraries


In [None]:
import os
import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass
import math
import time
import inspect

# Step 3: Set up the local directory and shard size


In [None]:
# Dataset layout: up to `num_shards` files of `shard_size` tokens each,
# all written under `local_dir`.
num_shards = 3
shard_size = 10 ** 8  # 100M tokens per shard
local_dir = "edu_fineweb300M"

# Create the output directory up front; exist_ok makes re-running this cell safe.
os.makedirs(name=local_dir, exist_ok=True)


# Step 4: Load dataset


In [None]:
# Open the "sample-10BT" configuration of FineWeb-Edu (train split) in
# streaming mode; `fw` is iterated document by document further down.
fw = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name="sample-10BT",
    split="train",
    streaming=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

# Step 5: Initialize the tokenizer and define tokenization helpers


In [None]:
# GPT-2 byte-pair encoding from tiktoken.
enc = tiktoken.get_encoding("gpt2")
# Id of the '<|endoftext|>' special token, used below as a document delimiter.
# Read it through the public `eot_token` property instead of the private
# `_special_tokens` mapping.
eot = enc.eot_token

def tokenize(doc):
    """Encode one document as a uint16 NumPy array of token ids.

    Args:
        doc: mapping with a "text" entry holding the document string.

    Returns:
        1-D ``np.uint16`` array: the end-of-text id `eot` followed by the
        ordinary (no special-token handling) encoding of ``doc["text"]``.
    """
    # Each document starts with the delimiter token.
    ids = [eot, *enc.encode_ordinary(doc["text"])]
    return np.array(ids, dtype=np.uint16)

def write_datafile(filename, tokens_np):
    """Write the token array `tokens_np` to `filename` in NumPy's .npy format."""
    np.save(file=filename, arr=tokens_np)

# Step 6: Process and save shards


In [None]:
# Stream documents, tokenize them, and pack the tokens back-to-back into
# fixed-size shards on disk. A document that straddles a shard boundary is
# split: its head fills the current shard, its tail starts the next one.
shard_index = 0
all_tokens_np = np.empty((shard_size,), dtype=np.uint16)  # buffer for the shard being filled
token_count = 0  # number of valid tokens currently in the buffer
progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}")

for doc in fw:
    tokens = tokenize(doc)
    remaining_space = shard_size - token_count
    if len(tokens) <= remaining_space:
        # The whole document fits in the current shard.
        all_tokens_np[token_count:token_count+len(tokens)] = tokens
        token_count += len(tokens)
        progress_bar.update(len(tokens))
    else:
        # Fill the current shard to the brim and flush it to disk.
        all_tokens_np[token_count:] = tokens[:remaining_space]
        progress_bar.update(remaining_space)
        progress_bar.close()
        filename = os.path.join(local_dir, f"edufineweb_train_{shard_index:06d}.npy")
        write_datafile(filename, all_tokens_np)

        shard_index += 1
        if shard_index >= num_shards:
            break

        # Start the next shard with the tail of the current document.
        # NOTE: assumes that tail is no longer than `shard_size`; otherwise the
        # slice assignment below raises.
        all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
        token_count = len(tokens) - remaining_space
        all_tokens_np[:token_count] = tokens[remaining_space:]
        progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}")
        # Credit the carried-over tokens so the new bar can reach 100%.
        progress_bar.update(token_count)

# Closing an already closed bar is a no-op, so this is safe on every exit path.
progress_bar.close()

# The stream ended before `num_shards` full shards were written: save what is left.
if shard_index < num_shards and token_count > 0:
    filename = os.path.join(local_dir, f"edufineweb_train_{shard_index:06d}.npy")
    write_datafile(filename, all_tokens_np[:token_count])
    print(f"🔹 Final partial shard {shard_index} saved with {token_count} tokens.")

print(f"✅ Done. Saved up to {num_shards} shards ({num_shards * shard_size:,} tokens) in:", local_dir)

Shard 0: 100%|█████████▉| 99884228/100000000 [01:40<00:00, 1187506.87tokens/s]
Shard 1:   0%|          | 0/100000000 [00:00<?, ?tokens/s][A
Shard 0: 100%|██████████| 100000000/100000000 [01:40<00:00, 991798.74tokens/s]

Shard 1:   0%|          | 226654/100000000 [00:00<01:27, 1144791.24tokens/s][A
Shard 1:   0%|          | 341376/100000000 [00:00<01:30, 1106222.84tokens/s][A
Shard 1:   0%|          | 452562/100000000 [00:00<01:30, 1102138.36tokens/s][A
Shard 1:   1%|          | 570098/100000000 [00:00<01:30, 1103690.95tokens/s][A
Shard 1:   1%|          | 680925/100000000 [00:00<01:31, 1090388.28tokens/s][A
Shard 1:   1%|          | 799045/100000000 [00:00<01:28, 1118621.17tokens/s][A
Shard 1:   1%|          | 943792/100000000 [00:00<01:22, 1206247.82tokens/s][A
Shard 1:   1%|          | 1065162/100000000 [00:00<01:24, 1176616.38tokens/s][A
Shard 1:   1%|          | 1187255/100000000 [00:01<01:27, 1132790.73tokens/s][A
Shard 1:   1%|▏         | 1318610/100000000 [00:01<01:23,

✅ Done. Saved up to 3 shards (300 million tokens) in: edu_fineweb300M


# Step 7: Define GPT2 Model


In [None]:
@dataclass
class GPTConfig:
    """Hyperparameters that determine the size of the GPT model."""
    block_size: int = 1024  # rows of the position-embedding table; GPT.forward asserts T <= block_size
    vocab_size: int = 50257  # rows of the token-embedding table and width of the output logits
    n_layer: int = 12  # number of Block modules stacked in the transformer
    n_head: int = 12  # attention heads per block; must divide n_embd
    n_embd: int = 768  # embedding / hidden width

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused QKV projection.

    `config` must provide `n_embd` and `n_head`, with `n_embd` divisible by
    `n_head`.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # One linear layer produces queries, keys and values in a single matmul.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection; the flag is read by GPT._init_weights to scale its init std.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Apply causal attention to `x` of shape (B, T, C); returns the same shape."""
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head
        # Split the fused projection into q, k, v and move heads next to the
        # batch dimension: (B, T, C) -> (B, n_head, T, head_dim).
        q, k, v = [
            part.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)
            for part in self.c_attn(x).split(self.n_embd, dim=2)
        ]
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # Merge the heads back: (B, n_head, T, head_dim) -> (B, T, C).
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, tanh-GELU, project back."""

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, config.n_embd)
        # Read by GPT._init_weights to scale the init std of this projection.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Map `x` (..., n_embd) to a tensor of the same shape."""
        expanded = self.c_fc(x)
        activated = self.gelu(expanded)
        return self.c_proj(activated)

class Block(nn.Module):
    """Pre-LayerNorm transformer block: attention and MLP, each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return `x` after the attention and MLP residual updates (shape unchanged)."""
        # LayerNorm is applied to the input of each sub-layer, not to the residual stream.
        after_attn = x + self.attn(self.ln_1(x))
        after_mlp = after_attn + self.mlp(self.ln_2(after_attn))
        return after_mlp

class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    `config` supplies `block_size`, `vocab_size`, `n_layer` and `n_embd`
    (plus whatever `Block` reads). The token-embedding matrix and the output
    projection share a single weight tensor.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),   # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),   # learned position embeddings
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),                     # final layer norm
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the embedding table and the output projection are the same parameter.
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02^2) and zero Linear biases.

        Linear layers flagged with `NANOGPT_SCALE_INIT` use a std reduced by
        ``(2 * n_layer) ** -0.5``.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        """Build an AdamW optimizer with weight decay applied only to >=2-D parameters.

        Args:
            weight_decay: decay coefficient for matrices and embeddings.
            learning_rate: initial learning rate for every parameter group.
            device_type: device string such as "cuda", "cuda:0" or "cpu"; the
                fused AdamW kernel is requested only for CUDA devices.

        Returns:
            torch.optim.AdamW with two parameter groups (decayed, non-decayed).
        """
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # Any parameter that is at least 2D is weight decayed, otherwise not:
        # matmul weights and embeddings decay, biases and layernorm parameters don't.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Use the fused AdamW implementation when this PyTorch build offers it and we run on CUDA.
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and str(device_type).startswith("cuda")
        print(f"using fused AdamW: {use_fused}")
        # Only pass `fused` when it is actually requested, so an AdamW without
        # that keyword is never handed an argument it does not accept.
        extra_args = {'fused': True} if use_fused else {}
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer token ids with T <= config.block_size.
            targets: optional (B, T) integer token ids to score the logits against.

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is a
            scalar tensor, or None when `targets` is None.
        """
        B, T = idx.size()
        assert T <= self.config.block_size
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)   # (T, n_embd), broadcast over the batch
        tok_emb = self.transformer.wte(idx)   # (B, T, n_embd)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss



# Step 8: Build the model and define the learning-rate schedule


In [None]:
# Let float32 matmuls use faster reduced-precision kernels where the hardware has them.
torch.set_float32_matmul_precision("high")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed before building the model so the random weight initialisation is reproducible.
torch.manual_seed(1337)
if device == "cuda":
    torch.cuda.manual_seed(1337)

# vocab_size is padded from the default 50257 up to 50304 (a multiple of 128).
model = GPT(GPTConfig(vocab_size=50304))
model.train()  # this model is about to be trained, so keep it in training mode
model.to(device)
model = torch.compile(model)

# Learning rate schedule parameters
max_lr = 6e-4        # peak learning rate, reached at the end of warmup
min_lr = 6e-5        # floor the cosine decay ends at
warmup_steps = 2000  # steps of linear warmup
max_steps = 100_000  # last step of the cosine decay (and length of the training run)

def get_lr(it):
    """Return the learning rate for step `it`.

    Linear warmup over the first `warmup_steps` steps, cosine decay from
    `max_lr` to `min_lr` until `max_steps`, and a constant `min_lr` afterwards.
    """
    if it < warmup_steps:
        # (it + 1) so that step 0 already has a non-zero learning rate.
        lr = max_lr * (it + 1) / warmup_steps
    elif it > max_steps:
        lr = min_lr
    else:
        progress = (it - warmup_steps) / (max_steps - warmup_steps)  # 0 -> 1 over the decay
        cosine = 0.5 * (1.0 + math.cos(math.pi * progress))          # 1 -> 0
        lr = min_lr + cosine * (max_lr - min_lr)
    return lr


# Step 9: Training parameters


In [None]:
# One optimizer step consumes `total_batch_size` tokens, accumulated over
# several micro-batches of B sequences x T tokens each.
B = 8                        # micro batch size
T = 1024                     # sequence length
total_batch_size = 2 ** 19   # tokens (524288)

tokens_per_micro_batch = B * T
# The total batch must be a whole number of micro-batches.
assert total_batch_size % tokens_per_micro_batch == 0
grad_accum_steps = total_batch_size // tokens_per_micro_batch
print(f"Total batch size: {total_batch_size}, Gradient accumulation steps: {grad_accum_steps}")


Total batch size: 524288, Gradient accumulation steps: 64


# Dummy DataLoaderLite class


In [None]:
class DataLoaderLite:
    """Sequential (x, y) batch loader over the token shards stored in `local_dir`.

    Shards are the files named ``edufineweb_{split}_*.npy`` and are visited in
    sorted order, one shard in memory at a time; after the last shard the
    loader wraps around to the first.

    Args:
        B: sequences per batch.
        T: tokens per sequence.
        process_rank: index of this process among `num_processes`.
        num_processes: number of processes reading the same shards in an
            interleaved fashion; each call advances by ``B * T * num_processes``.
        split: shard-name infix selecting which files to read.

    Raises:
        FileNotFoundError: no shard for `split` exists in `local_dir`.
        ValueError: a shard is too short to serve one batch per process.
    """

    def __init__(self, B, T, process_rank=0, num_processes=1, split='train'):
        self.B = B
        self.T = T
        self.process_rank = process_rank
        self.num_processes = num_processes

        prefix = f"edufineweb_{split}_"
        self.shards = sorted(
            os.path.join(local_dir, name)
            for name in os.listdir(local_dir)
            if name.startswith(prefix) and name.endswith(".npy")
        )
        if not self.shards:
            raise FileNotFoundError(f"no '{split}' shards found in {local_dir!r}")
        self._load_shard(0)

    def _load_shard(self, shard_number):
        """Load shard `shard_number` into `self.data` and rewind the read position."""
        tokens = np.load(self.shards[shard_number])
        # Widen to int64 in NumPy before handing the array to torch: not every
        # PyTorch build can wrap a uint16 array with torch.from_numpy.
        self.data = torch.from_numpy(tokens.astype(np.int64))
        needed = self.B * self.T * self.num_processes + 1
        if len(self.data) < needed:
            raise ValueError(
                f"shard {self.shards[shard_number]!r} has {len(self.data)} tokens, "
                f"but at least {needed} are required for one batch per process"
            )
        self.current_shard = shard_number
        self.idx = self.B * self.T * self.process_rank

    def next_batch(self):
        """Return the next `(x, y)` pair of (B, T) int64 tensors; `y` is `x` shifted by one token."""
        span = self.B * self.T
        stride = span * self.num_processes
        # Start of the window shared by all processes; testing it (rather than
        # this process's own offset) makes every process switch shards together.
        window_start = self.idx - span * self.process_rank
        if window_start + stride + 1 > len(self.data):
            if len(self.shards) > 1:
                self._load_shard((self.current_shard + 1) % len(self.shards))
            else:
                # Single shard: rewind without re-reading the file.
                self.idx = span * self.process_rank
        # span + 1 tokens: the extra token is the target of the last input position.
        buf = self.data[self.idx:self.idx + span + 1]
        x = buf[:-1].view(self.B, self.T)
        y = buf[1:].view(self.B, self.T)
        self.idx += stride
        return x, y

# Step 10: Create the optimizer and data loader


In [None]:
# AdamW over the model parameters; the per-step learning rate is overwritten
# inside the training loop, so `max_lr` here is only the initial value.
optimizer = model.configure_optimizers(
    weight_decay=0.1,
    learning_rate=max_lr,
    device_type=device,
)
# Loader yielding micro-batches of B sequences x T tokens.
train_loader = DataLoaderLite(B, T)

num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True


# Step 11: Training loop


In [None]:
# Training loop: each optimizer step accumulates gradients over
# `grad_accum_steps` micro-batches, clips the global gradient norm to 1.0 and
# applies the scheduled learning rate.
model.train()
for step in range(max_steps):
    t0 = time.time()
    optimizer.zero_grad()
    loss_accum = 0.0
    for _ in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        _, loss = model(x, y)
        # Scale so that the accumulated gradient is the mean over all micro-batches.
        loss = loss / grad_accum_steps
        # Accumulate on-device; calling .item() here would force a host sync per micro-step.
        loss_accum += loss.detach()
        loss.backward()
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    # CUDA kernels run asynchronously: wait for them so the timing below
    # covers the whole step.
    if str(device).startswith("cuda"):
        torch.cuda.synchronize()
    t1 = time.time()
    tokens_per_sec = (train_loader.B * train_loader.T * grad_accum_steps) / (t1 - t0)
    print(f"step {step:5d} | loss: {float(loss_accum):.4f} | grad_norm: {norm:.2f} | lr: {lr:.6f} | {tokens_per_sec:.0f} tok/s | {1000*(t1-t0):.2f}ms")


W0516 14:40:23.948000 274 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


step     0 | loss: 10.9296 | grad_norm: 15.05 | lr: 0.000000 | 3056 tok/s | 171550.01ms
step     1 | loss: 10.9155 | grad_norm: 15.04 | lr: 0.000001 | 3826 tok/s | 137032.69ms
step     2 | loss: 10.8766 | grad_norm: 14.86 | lr: 0.000001 | 3817 tok/s | 137352.18ms
step     3 | loss: 10.8238 | grad_norm: 14.75 | lr: 0.000001 | 3816 tok/s | 137379.51ms
step     4 | loss: 10.7570 | grad_norm: 14.09 | lr: 0.000001 | 3813 tok/s | 137498.09ms
step     5 | loss: 10.6754 | grad_norm: 13.10 | lr: 0.000002 | 3809 tok/s | 137639.44ms
step     6 | loss: 10.5881 | grad_norm: 11.92 | lr: 0.000002 | 3822 tok/s | 137176.59ms
step     7 | loss: 10.5013 | grad_norm: 10.47 | lr: 0.000002 | 3811 tok/s | 137563.17ms
step     8 | loss: 10.4137 | grad_norm: 9.28 | lr: 0.000003 | 3821 tok/s | 137203.00ms
step     9 | loss: 10.3252 | grad_norm: 8.42 | lr: 0.000003 | 3821 tok/s | 137206.61ms
step    10 | loss: 10.2332 | grad_norm: 7.84 | lr: 0.000003 | 3813 tok/s | 137504.42ms
step    11 | loss: 10.1834 | grad_n

# Step 12: Save final model


In [None]:
# `model` is the torch.compile wrapper, whose state_dict keys carry an
# "_orig_mod." prefix. Save the wrapped module's weights instead so the
# checkpoint loads directly into a plain GPT instance.
raw_model = getattr(model, "_orig_mod", model)
torch.save(raw_model.state_dict(), os.path.join(local_dir, "gpt2_edu_final.pt"))
print("✅ Training complete. Model saved.")