In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-02-22 04:09:17--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-02-22 04:09:17 (146 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [7]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Assuming your notebook's working directory is set such that ../llm-tokenizer is reachable:
tokenizer_dir = os.path.abspath(os.path.join(os.getcwd(), '../llm-tokenizer'))
if tokenizer_dir not in sys.path:
    sys.path.insert(0, tokenizer_dir)

import BPETokenizer  # Now you should be able to import it

In [3]:
import torch
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available else 'cpu'

if torch.cuda.is_available():
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    device_props = torch.cuda.get_device_properties(current_device)
    memory_summary = torch.cuda.memory_summary(device=current_device, abbreviated=True)
    
    print("Current device index:", current_device)
    print("Running on GPU:", device_name)
    print("GPU properties:")
    print("  - Compute Capability:", f"{device_props.major}.{device_props.minor}")
    print("  - Total Memory:", f"{device_props.total_memory / (1024**3):.2f} GB")
    print("  - Multiprocessor Count:", device_props.multi_processor_count)
    print("  - Max Threads per Multiprocessor:", device_props.max_threads_per_multi_processor)
else:
    print("CUDA is not available, running on CPU.")

Current device index: 0
Running on GPU: NVIDIA RTX A6000
GPU properties:
  - Compute Capability: 8.6
  - Total Memory: 47.54 GB
  - Multiprocessor Count: 84
  - Max Threads per Multiprocessor: 1536


In [179]:
class DataLoader:
    def __init__(self, config):
        
        self.config = config

        with open('input.txt', 'r') as f:
            text = f.read()
        len(text)

        self.tokenizer = BPETokenizer.Tokenizer(text, encoding_vocab_size=2000, raw_tokens=False)
        self.tokenizer.load_from_file()
        encoded_dataset = self.tokenizer.encode(text, raw_tokens=False)
        print(f"max vocabulary size={max(encoded_dataset)}, compression ratio={len(encoded_dataset) / len(text)}")
        split = int(len(encoded_dataset) * 0.80)
        self.train_data =  torch.tensor(encoded_dataset[:split])
        self.val_data = torch.tensor(encoded_dataset[split+config.block_size:])
        print(f"train_data.shape={self.train_data.shape}, val_data.shape={self.val_data.shape}")
        
        self.train_data_ix = 0
        self.val_data_ix = 0
        self.batch_step = self.config.batch_size * self.config.block_size 
        
    def next_batch(self, mode="train", device=device):
        """ mode=["train", "eval"] """
        if mode == "train":
            x, y = self._next_batch_train()
        else:
            x, y = self._next_batch_eval()
        if device:
            return x.to(device), y.to(device)
        return x, y
    
    def _next_batch_train(self):
        
        data = self.train_data
        ix = int(random() * (len(data) - 2*self.batch_step))
        
        buf = data[ix:ix+self.batch_step + 1]     
        x = buf[:-1].view(self.config.batch_size, self.config.block_size)
        y = buf[1:].view(self.config.batch_size, self.config.block_size)
        
        self.train_data_ix += self.batch_step 
        if self.train_data_ix + self.batch_step + 1 > len(self.train_data):
            self.train_data_ix = 0
        
        return x, y
    
    def _next_batch_eval(self):
        
        data = self.train_data
        ix = int(random() * (len(data) - 2*self.batch_step))
        
        buf = data[ix:ix+self.batch_step + 1]     
        x = buf[:-1].view(self.config.batch_size, self.config.block_size)
        y = buf[1:].view(self.config.batch_size, self.config.block_size)
        
        self.val_data_ix += self.batch_step 
        if self.val_data_ix + self.batch_step + 1 > len(self.val_data):
            self.val_data_ix = 0
        
        return x, y
    

        
        



In [180]:
from dataclasses import dataclass

@dataclass
class BERTConfig:
    BERT_batch_size = 6
    batch_size = BERT_batch_size * 2
    block_size = 5
    
config = BERTConfig

In [184]:
data_loader = DataLoader(config)
x, y = data_loader.next_batch(device=None)

max vocabulary size=2213, compression ratio=0.4458684554516162
train_data.shape=torch.Size([397855]), val_data.shape=torch.Size([99459])


In [185]:
x, y

(tensor([[ 924, 1510,  256,  570,  503],
         [ 264,  391,  334,  473,  272],
         [ 291,  287,  798,  280,  290],
         [ 117,  738,  288,  623,  116],
         [ 452,  488, 1890, 1903,  296],
         [ 272, 1410,  260,  119,  316],
         [ 104,  684,  282,  763,  263],
         [ 316,  602,   67,  491,  259],
         [ 369,   97, 1903,  783,  413],
         [ 343,  109,  686, 1933,  761],
         [1268,  331, 1510,  256,  320],
         [ 391,  317,  413, 1624,  261]]),
 tensor([[1510,  256,  570,  503,  264],
         [ 391,  334,  473,  272,  291],
         [ 287,  798,  280,  290,  117],
         [ 738,  288,  623,  116,  452],
         [ 488, 1890, 1903,  296,  272],
         [1410,  260,  119,  316,  104],
         [ 684,  282,  763,  263,  316],
         [ 602,   67,  491,  259,  369],
         [  97, 1903,  783,  413,  343],
         [ 109,  686, 1933,  761, 1268],
         [ 331, 1510,  256,  320,  391],
         [ 317,  413, 1624,  261, 1001]]))

In [65]:
x, y

(tensor([[  70,  299,  296,   32, 1709,  984,  655],
         [ 538,  585,  111,  364,  424,  293,  110],
         [ 265, 1199,  114,  368,  261, 1398,  272],
         [ 690,  854,  331,  953,  275,  969,  987],
         [ 798,  261,  854,  331,   10,   70,  299],
         [ 296,   32, 1709,  581,  293,  309,  686]]),
 tensor([[ 299,  296,   32, 1709,  984,  655,  538],
         [ 585,  111,  364,  424,  293,  110,  265],
         [1199,  114,  368,  261, 1398,  272,  690],
         [ 854,  331,  953,  275,  969,  987,  798],
         [ 261,  854,  331,   10,   70,  299,  296],
         [  32, 1709,  581,  293,  309,  686,   32]]))

2215

2216 2217 2218


In [245]:
class BERTDataLoader(DataLoader):
    def __init__(self, config):
        super().__init__(config)
        self.max_vocab_size = max(self.tokenizer.encoding_map.values())
        self.CLS =self.max_vocab_size + 1
        self.SEP = self.CLS + 1
        self.MASK = self.SEP + 1
        print(f"new tokens: {self.max_vocab_size}, {self.CLS}, {self.SEP}, {self.MASK}")
        
    def next_batch(self, device=device, test=False):
        _x, _ = super().next_batch(device=None)
        x, y_MLM, y_NSP = [], [], []
        assert len(_x) % 2 == 0, "BERTDataLoader batch size should be % 2 == 0"
        print(_x)
        for ix in range(int(len(_x) / 2)): 
            x0 = _x[ix].clone()
            if random() < 0.5:
                x1 = _x[ix+1].clone()
                y_NSP.append(torch.tensor(1))
            else:
                __x, _ = super().next_batch(device=None)
                x1 = __x[0].clone()
                y_NSP.append(torch.tensor(0))
            y_MLM.append(self._make_input(x0, x1))
            x.append(self._make_input(self._mask(x0), self._mask(x1)))
            
        if test:
            for i, xx in enumerate(x):
                assert (yy[0][1:1+BERTConfig.block_size] == _x[i]).all()
                assert (yy[0][1+BERTConfig.block_size+1:1+2*BERTConfig.block_size+1] == _x[i+1]).all(), _x[i+1]
                print(f"{i} TEST PASSED same sentence")
            
        return torch.stack(x).to(device), torch.stack(y_MLM).to(device), torch.stack(y_NSP).to(device)
    
    def _make_input(self, x0, x1):
        return torch.cat([
            torch.tensor([self.CLS]), 
            x0,
            torch.tensor([self.SEP]), 
            x1,
            torch.tensor([self.SEP])
        ])
    
    def _mask(self, x):
        for i, v in enumerate(x):
            if random() < 0.15:
                r2 = random()
                if r2 < 0.80:
                    x[i] = data_loader.MASK
                elif 0.80 <= r2 < 0.90:
                    x[i] = int(random() * (self.max_vocab_size - 100))
        return x

In [246]:
data_loader = BERTDataLoader(config)

max vocabulary size=2213, compression ratio=0.4458684554516162
train_data.shape=torch.Size([397855]), val_data.shape=torch.Size([99459])
new tokens: 2215, 2216, 2217, 2218


In [247]:
x, y_MLM, y_NSP = data_loader.next_batch(device=device)

tensor([[ 813, 1257,  274,  736, 1458],
        [ 114,  109,  409,  257, 1199],
        [ 108,  297,  380,  652,  277],
        [1510,  256, 2125,  465,  260],
        [ 369,  653,  565,  415,  258],
        [ 264,  287,  111, 1257,  274],
        [ 653,  573, 1673,  275,  329],
        [ 533, 2125,  114,  100,  259],
        [2125,  465,  260, 1673,  110],
        [ 954, 1398,  114,  391,  269],
        [ 328, 1316,  380, 1410,  422],
        [ 104,  320,  391,  269,  594]])


In [248]:
x

tensor([[2216,  813, 1257,  274,  736, 1458, 2217,  504,  487, 2218,  272,  306,
         2217],
        [2216,   56,  109, 2218,  257, 1199, 2217,  307,  319,  400,  282, 2218,
         2217],
        [2216,  108,  297,  380,  652,  277, 2217,  276, 1410,  282,  341,  264,
         2217],
        [2216, 1510,  256, 2125,  465,  260, 2217, 1215,  343,  827,  653,  573,
         2217],
        [2216,  369,  653,  565,  415,  258, 2217,  264,  287,  111, 1257,  274,
         2217],
        [2216,  264,  287, 2218, 1257,  274, 2217,  265,  365,  515,  261,  294,
         2217]], device='cuda:0')

In [249]:
y_MLM

tensor([[2216,  813, 1257,  274,  736, 1458, 2217,  504,  487,  367,  272,  306,
         2217],
        [2216,  114,  109,  409,  257, 1199, 2217,  307,  319,  400,  282, 1590,
         2217],
        [2216,  108,  297,  380,  652,  277, 2217,  276, 1410,  282,  341,  264,
         2217],
        [2216, 1510,  256, 2125,  465,  260, 2217, 1215,  343,  827,  653,  573,
         2217],
        [2216,  369,  653,  565,  415,  258, 2217,  264,  287,  111, 1257,  274,
         2217],
        [2216,  264,  287,  111, 1257,  274, 2217,  265,  365,  515,  261,  294,
         2217]], device='cuda:0')

In [251]:
y_NSP

tensor([0, 0, 0, 0, 1, 0], device='cuda:0')

In [235]:
torch.stack(x), torch.stack(y)

TypeError: expected Tensor as element 0 in argument 0, but got list

In [236]:
y

[[tensor([2216,  343,  347,  404,  827,   10, 2217, 1733,  256,  289, 2010,  709,
          2217]),
  False],
 [tensor([2216,   39,   83,  595,  383,  105, 2217,  573,  320, 1483,  111,   45,
          2217]),
  True],
 [tensor([2216,  573,  320, 1483,  111,   45, 2217, 1013,  102, 1004,  316,  305,
          2217]),
  False],
 [tensor([2216, 1312,   58, 1194,  536,  320, 2217,  424,  359,  341, 1790, 1935,
          2217]),
  False],
 [tensor([2216,  413,  272,  405,  841,  627, 2217,  119,  316,  104,  395,  276,
          2217]),
  False],
 [tensor([2216,  270,  400, 1111,  405,  441, 2217,  258,  316,   32, 2082,  370,
          2217]),
  True]]

In [101]:
get_BERT_batch(x)

(tensor([2216, 2218,  309,  651,  108, 2218,  260, 2218, 2217,  368,  391,  269,
         2094,  773,  391,  269, 2218]),
 tensor([2216,   32,  309,  651,  108, 1410,  260,  330, 2217,  368,  391,  269,
         2094,  773,  391,  269, 2218]))

In [97]:
data_loader.MASK

2218