In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-02-22 04:09:17--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-02-22 04:09:17 (146 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [7]:
%load_ext autoreload
%autoreload 2

import sys
import os

# The tokenizer module is expected in a sibling directory (../llm-tokenizer relative to
# the notebook's working directory). Put that directory at the front of sys.path unless
# it is already listed, so the import below can resolve it.
tokenizer_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'llm-tokenizer'))
if sys.path.count(tokenizer_dir) == 0:
    sys.path.insert(0, tokenizer_dir)

import BPETokenizer  # Now you should be able to import it

In [3]:
from random import random

import torch
import torch.nn.functional as F

# Select the compute device once for the whole notebook.
# The call parentheses matter: the bare function object `torch.cuda.is_available`
# is always truthy, which would pick 'cuda' even on a CPU-only machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if torch.cuda.is_available():
    # Gather a few facts about the active GPU for the run log.
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    device_props = torch.cuda.get_device_properties(current_device)
    # Collected but not printed here; kept available for later inspection.
    memory_summary = torch.cuda.memory_summary(device=current_device, abbreviated=True)

    print("Current device index:", current_device)
    print("Running on GPU:", device_name)
    print("GPU properties:")
    print("  - Compute Capability:", f"{device_props.major}.{device_props.minor}")
    print("  - Total Memory:", f"{device_props.total_memory / (1024**3):.2f} GB")
    print("  - Multiprocessor Count:", device_props.multi_processor_count)
    print("  - Max Threads per Multiprocessor:", device_props.max_threads_per_multi_processor)
else:
    print("CUDA is not available, running on CPU.")

Current device index: 0
Running on GPU: NVIDIA RTX A6000
GPU properties:
  - Compute Capability: 8.6
  - Total Memory: 47.54 GB
  - Multiprocessor Count: 84
  - Max Threads per Multiprocessor: 1536


# Pretraining Data - GPT Data Loader

In [252]:
class DataLoader:
    """Next-token-prediction data loader for GPT-like models.

    Reads ``input.txt``, encodes it with the BPE tokenizer and splits the token
    stream 80/20 into a training and a validation tensor. A batch is one
    contiguous window of ``batch_size * block_size + 1`` tokens taken at a random
    offset and reshaped to ``(batch_size, block_size)``; the targets are the
    inputs shifted by one token.

    Args:
        config: object exposing ``batch_size`` and ``block_size``.
    """

    def __init__(self, config):
        self.config = config

        # Explicit encoding so the read does not depend on the machine's locale.
        with open('input.txt', 'r', encoding='utf-8') as f:
            text = f.read()

        self.tokenizer = BPETokenizer.Tokenizer(text, encoding_vocab_size=2000, raw_tokens=False)
        # Presumably restores a previously trained vocabulary -- verify against BPETokenizer.
        self.tokenizer.load_from_file()
        encoded_dataset = self.tokenizer.encode(text, raw_tokens=False)
        print(f"max vocabulary size={max(encoded_dataset)}, compression ratio={len(encoded_dataset) / len(text)}")

        split = int(len(encoded_dataset) * 0.80)
        self.train_data = torch.tensor(encoded_dataset[:split])
        # The validation split starts block_size tokens after the split point,
        # leaving a gap between the last training token and the first validation token.
        self.val_data = torch.tensor(encoded_dataset[split+config.block_size:])
        print(f"train_data.shape={self.train_data.shape}, val_data.shape={self.val_data.shape}")

        # Running cursors; batches are drawn at random offsets, so these only track
        # how many tokens' worth of batches have been served from each split.
        self.train_data_ix = 0
        self.val_data_ix = 0
        # Number of input tokens consumed by one batch.
        self.batch_step = self.config.batch_size * self.config.block_size

    def next_batch(self, mode="train", device=device):
        """Return one ``(x, y)`` batch, each of shape ``(batch_size, block_size)``.

        Args:
            mode: ``"train"`` samples from the training split; any other value
                (e.g. ``"eval"``) samples from the validation split.
            device: target device for the returned tensors. The default is the
                notebook-level ``device`` captured when the class was defined;
                a falsy value (e.g. ``None``) skips the transfer.
        """
        if mode == "train":
            x, y = self._next_batch_train()
        else:
            x, y = self._next_batch_eval()
        if device:
            return x.to(device), y.to(device)
        return x, y

    def _sample_window(self, data):
        """Cut one random contiguous window out of ``data`` and shape it into ``(x, y)``.

        Raises:
            ValueError: if ``data`` holds fewer than ``batch_step + 1`` tokens, i.e.
                not enough for a single batch plus its one-token target shift.
        """
        n_tokens = len(data)
        if n_tokens < self.batch_step + 1:
            raise ValueError(
                f"need at least {self.batch_step + 1} tokens for one batch, got {n_tokens}"
            )
        # Offsets are drawn from [0, n_tokens - 2 * batch_step), the same conservative
        # margin as before; clamped at 0 so short splits never yield a negative offset.
        max_offset = max(n_tokens - 2 * self.batch_step, 0)
        ix = int(random() * max_offset)

        buf = data[ix:ix + self.batch_step + 1]
        x = buf[:-1].view(self.config.batch_size, self.config.block_size)
        y = buf[1:].view(self.config.batch_size, self.config.block_size)
        return x, y

    def _advance(self, cursor, n_tokens):
        """Move a split cursor forward by one batch, wrapping to 0 near the end."""
        cursor += self.batch_step
        if cursor + self.batch_step + 1 > n_tokens:
            cursor = 0
        return cursor

    def _next_batch_train(self):
        """Sample one batch from the training split."""
        x, y = self._sample_window(self.train_data)
        self.train_data_ix = self._advance(self.train_data_ix, len(self.train_data))
        return x, y

    def _next_batch_eval(self):
        """Sample one batch from the validation split."""
        # Must read val_data: sampling train_data here would report training loss
        # as validation loss.
        x, y = self._sample_window(self.val_data)
        self.val_data_ix = self._advance(self.val_data_ix, len(self.val_data))
        return x, y

In [253]:
from dataclasses import dataclass

@dataclass
class BERTConfig:
    """Batch geometry shared by the data loaders.

    The attributes carry type annotations so that ``@dataclass`` actually treats
    them as fields (un-annotated class attributes are ignored by the decorator,
    which makes keyword overrides such as ``BERTConfig(block_size=8)`` fail).
    The defaults remain readable directly on the class, e.g. ``BERTConfig.block_size``.
    """
    # Number of sentence pairs per BERT batch.
    BERT_batch_size: int = 6
    # Rows drawn from the token stream per batch: two segments per pair.
    # Computed once from the default above; overriding BERT_batch_size on an
    # instance does not update it.
    batch_size: int = BERT_batch_size * 2
    # Tokens per segment.
    block_size: int = 5

# The class itself is used as the config object; its defaults are read as class attributes.
config = BERTConfig

In [254]:
# Build the GPT-style loader and draw one training batch.
# device=None skips the .to(device) transfer, so x and y stay where the loader created them.
data_loader = DataLoader(config)
x, y = data_loader.next_batch(device=None)

max vocabulary size=2213, compression ratio=0.4458684554516162
train_data.shape=torch.Size([397855]), val_data.shape=torch.Size([99459])


In [255]:
# Inputs and targets are both reshaped to (batch_size, block_size) by the loader.
x.shape, y.shape

(torch.Size([12, 5]), torch.Size([12, 5]))

# Pretraining Data - BERT Data Loader

In [275]:
class BERTDataLoader(DataLoader):
    """Data loader for BERT-style pretraining (masked LM + next-sentence prediction).

    Every example is laid out as ``[CLS] A [SEP] B [SEP]`` where A and B are
    ``block_size``-token segments, giving ``2 * block_size + 3`` tokens per row.
    The three special token ids are allocated directly above the largest id in
    the tokenizer's ``encoding_map``.

    Args:
        config: object exposing ``batch_size`` (must be even) and ``block_size``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.max_vocab_size = max(self.tokenizer.encoding_map.values())
        self.CLS = self.max_vocab_size + 1
        self.SEP = self.CLS + 1
        self.MASK = self.SEP + 1
        print(f"new tokens: {self.max_vocab_size}, {self.CLS}, {self.SEP}, {self.MASK}")

    def next_batch(self, device=device, test=False, mode="train"):
        """Build one batch of sentence pairs.

        The parent loader returns ``batch_size`` consecutive segments of the token
        stream; rows ``2*i`` and ``2*i + 1`` form pair ``i``, so every segment is used
        at most once as an "is next" example. With probability 0.5 the second segment
        is instead replaced by the first row of a freshly sampled batch and the pair
        is labelled "not next".

        Args:
            device: target device for the returned tensors; the default is the
                notebook-level ``device`` captured when the class was defined.
                A falsy value (e.g. ``None``) skips the transfer, as in the parent.
            test: when true, assert that the unmasked targets of every "is next"
                pair match the source rows, and print one line per checked pair.
            mode: forwarded to the parent loader (``"train"`` or ``"eval"``).

        Returns:
            ``(x, y_MLM, y_NSP)``: masked inputs and unmasked targets, both of
            shape ``(batch_size // 2, 2 * block_size + 3)``, and one 0/1
            next-sentence label per pair.
        """
        rows, _ = super().next_batch(mode=mode, device=None)
        assert len(rows) % 2 == 0, "BERTDataLoader batch size should be % 2 == 0"

        block = self.config.block_size
        x, y_MLM, y_NSP = [], [], []
        for pair in range(len(rows) // 2):
            first, second = 2 * pair, 2 * pair + 1
            x0 = rows[first].clone()
            if random() < 0.5:
                # Positive pair: the next row continues the same contiguous window.
                x1 = rows[second].clone()
                y_NSP.append(torch.tensor(1))
            else:
                # Negative pair: take a segment from an independently sampled window.
                other_rows, _ = super().next_batch(mode=mode, device=None)
                x1 = other_rows[0].clone()
                y_NSP.append(torch.tensor(0))
            # Targets must be assembled before masking, because _mask edits in place.
            y_MLM.append(self._make_input(x0, x1))
            x.append(self._make_input(self._mask(x0), self._mask(x1)))

        if test:
            for i in range(len(x)):
                if y_NSP[i] == 1:
                    # Segment A sits right after [CLS]; segment B right after the first [SEP].
                    a, b = y_MLM[i][1:1 + block], rows[2 * i]
                    assert (a == b).all(), (a, b)
                    a, b = y_MLM[i][block + 2:2 * block + 2], rows[2 * i + 1]
                    assert (a == b).all(), (a, b)
                    print(f"{i} TEST PASSED same sentence")

        batch = (torch.stack(x), torch.stack(y_MLM), torch.stack(y_NSP))
        if device:
            return tuple(t.to(device) for t in batch)
        return batch

    def _make_input(self, x0, x1):
        """Concatenate two segments into ``[CLS] x0 [SEP] x1 [SEP]``."""
        return torch.cat([
            torch.tensor([self.CLS]),
            x0,
            torch.tensor([self.SEP]),
            x1,
            torch.tensor([self.SEP])
        ])

    def _mask(self, x):
        """Corrupt ``x`` in place for the masked-LM objective and return it.

        Each position is selected with probability 0.15. A selected position
        becomes ``self.MASK`` 80% of the time, a random token id 10% of the time,
        and is left unchanged otherwise.
        """
        for i in range(len(x)):
            if random() < 0.15:
                r2 = random()
                if r2 < 0.80:
                    # Use this instance's id rather than whatever the notebook-level
                    # name `data_loader` happens to be bound to.
                    x[i] = self.MASK
                elif r2 < 0.90:
                    # Random replacement drawn from [0, max_vocab_size - 100).
                    # NOTE(review): the reason for the 100-id margin is not evident here -- confirm.
                    x[i] = int(random() * (self.max_vocab_size - 100))
        return x

In [276]:
# Rebind data_loader to the BERT variant (same config); the GPT-style loader is no longer reachable by this name.
data_loader = BERTDataLoader(config)

max vocabulary size=2213, compression ratio=0.4458684554516162
train_data.shape=torch.Size([397855]), val_data.shape=torch.Size([99459])
new tokens: 2215, 2216, 2217, 2218


In [358]:
# Draw one BERT batch on the selected device; test=True makes the loader assert that
# the unmasked targets of every "is next" pair match the source rows.
x, y_MLM, y_NSP = data_loader.next_batch(device=device, test=True)

0 TEST PASSED same sentence
2 TEST PASSED same sentence
3 TEST PASSED same sentence


In [359]:
# x and y_MLM have one row of 2 * block_size + 3 tokens per pair; y_NSP has one label per pair.
x.shape, y_MLM.shape, y_NSP.shape

(torch.Size([6, 13]), torch.Size([6, 13]), torch.Size([6]))

# Model

In [281]:
from dataclasses import dataclass

@dataclass
class BERTConfig:
    """Model and batch configuration for the BERT experiments.

    This redefines (and shadows) the BERTConfig declared in the data-loading
    section, adding the model-side sizes. The attributes carry type annotations
    so that ``@dataclass`` actually treats them as fields; the defaults remain
    readable directly on the class, e.g. ``BERTConfig.block_size``.
    """
    # Number of sentence pairs per BERT batch.
    BERT_batch_size: int = 6
    # Rows drawn from the token stream per batch: two segments per pair.
    batch_size: int = BERT_batch_size * 2
    # Tokens per segment.
    block_size: int = 5
    # Width of every embedding vector.
    embedding_size: int = 16
    vocab_size: int = 2220 # new tokens: 2215, 2216, 2217, 2218 from dataloader
    # [CLS] + segment A + [SEP] + segment B + [SEP]. Derived values are computed
    # once from the defaults above; overriding block_size on an instance does not update them.
    input_size: int = 2 * block_size + 3

# The class itself is used as the config object; its defaults are read as class attributes.
config = BERTConfig

In [279]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [361]:
def _get_x1_x2(x, config):
    return x[:, 1:1+config.block_size], x[:, 1+config.block_size+1:-1]

class InputRepresentation(nn.Module):
    """Input embedding layer: token + segment + position embeddings, summed.

    Args:
        config: object exposing ``vocab_size``, ``embedding_size``, ``block_size``
            and ``input_size`` (the fixed row length the module is built for).
    """

    def __init__(self, config):
        super().__init__()
        width = config.embedding_size
        self.token_embeddings = nn.Embedding(config.vocab_size, width)
        self.segment_embeddings = nn.Embedding(2, width)
        # The table has input_size + 1 rows although only indices 0..input_size-1 are looked up.
        self.position_embeddings = nn.Embedding(config.input_size + 1, width)

        # Segment ids per position: block_size + 2 zeros followed by block_size + 1 ones
        # (for a [CLS] A [SEP] B [SEP] row: zeros cover [CLS], A and the first [SEP]).
        leading_zeros = [0] * (config.block_size + 2)
        trailing_ones = [1] * (config.block_size + 1)
        self.register_buffer('segment_index', torch.tensor(leading_zeros + trailing_ones))
        self.register_buffer('position_index', torch.arange(config.input_size))

    def forward(self, x):
        """Embed token ids ``x`` and add the per-position segment and position vectors.

        The segment and position terms have no batch dimension and are broadcast
        over the leading dimension(s) of the token embeddings.
        """
        tok = self.token_embeddings(x)
        seg = self.segment_embeddings(self.segment_index)
        pos = self.position_embeddings(self.position_index)
        return pos + tok + seg

In [363]:
# Instantiate the embedding layer and move its parameters and registered buffers to the selected device.
input_embeddings = InputRepresentation(config)
input_embeddings.to(device)

InputRepresentation(
  (token_embeddings): Embedding(2220, 16)
  (segment_embeddings): Embedding(2, 16)
  (position_embeddings): Embedding(14, 16)
)

In [366]:
# Embed the BERT batch; each token id becomes an embedding_size-wide vector.
# Note: this rebinds y, which earlier held the GPT-style targets.
y = input_embeddings(x)
y.shape

torch.Size([6, 13, 16])