# Ref:
- https://github.com/TimS-ml/nanoGPT/blob/master/gpt.py
- https://github.com/lucidrains/x-transformers/blob/main/examples/enwik8_simple/train.py

Status: work in progress (WIP).
You can specify the data directory by setting the `DATA_DIR` environment variable.

In [1]:
import os
from boring_llm_base.constants import PROJECT_HOME_DIR
import sys; sys.path.append(str(PROJECT_HOME_DIR)); os.chdir(PROJECT_HOME_DIR)

In [2]:
import random
import tqdm
import gzip
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

In [3]:
from boring_utils.utils import (
    cprint, 
    tprint, 
    get_device
)

# Config and Data Loading

In [4]:
# DEV selects between a tiny, quick-to-run configuration (True) and the
# full-size training configuration (False).
DEV = True

# Settings that have the same value in both modes.
learning_rate = 3e-4
eval_interval = 500
eval_iters = 200
dropout = 0.2

if DEV:
    batch_size = 32
    block_size = 8
    max_iters = 100  # alternative: 1000
    n_embd = 32
    n_head = 4
    n_layer = 4
else:
    batch_size = 64  # number of independent sequences processed in parallel
    block_size = 256  # maximum context length for predictions
    max_iters = 4000  # alternative: 5000
    n_embd = 384
    n_head = 6
    n_layer = 6

# Second spelling of the same value.
n_embed = n_embd

# vocab_size is not set here; it is derived from the text once it is loaded.
device = get_device()
cprint(device)

[93m<module> -> device:[0m
device(type='mps')


In [5]:
import codecs

# Directory holding the datasets; override with the DATA_DIR environment variable.
data_dir = os.getenv('DATA_DIR', './data')

# read nanoGPT's shakespeare_char
# with open(os.path.join(data_dir, 'shakespeare_char/input.txt'), 'r', encoding='utf-8') as f:
#     text = f.read()

# read enwik8 first 10M bytes
with gzip.open(os.path.join(data_dir, 'enwik8/enwik8.gz')) as file:
    raw = file.read(int(10e6))

# A fixed byte count can cut a multi-byte UTF-8 character in half, which makes
# a plain bytes.decode('utf-8') raise UnicodeDecodeError. An incremental
# decoder with final=False decodes every complete character and holds back an
# incomplete trailing sequence instead of failing; invalid bytes elsewhere
# still raise, exactly as before.
text = codecs.getincrementaldecoder('utf-8')().decode(raw, final=False)
del raw  # the undecoded bytes are no longer needed

# Char Level Tokenization

[1] nanoGPT: create mapping from characters to indices

```python
chars = sorted(list(set(text)))
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

# encode
text = "Hello"
tokens = [stoi[c] for c in text]  # char ids
# decode
decoded = ''.join([itos[t] for t in tokens])  # "Hello"
```

[2] x-transformers: use each character's code point (`ord`) as its token id; in ASCII, the printable characters start at code 32

```python
# encode
text = "Hello"
tokens = [ord(c) for c in text]  # ASCII values

# decode
decoded = ''.join(chr(t) for t in tokens)  # "Hello"
```

In [6]:
# Character-level vocabulary: every distinct character in the corpus, sorted
# so that the id assignment is deterministic for a given text.
chars = sorted(set(text))
vocab_size = len(chars)  # chars is already de-duplicated; no need to rebuild the set

# Lookup tables between characters and their integer ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of token ids for the characters of string `s`.

    Raises:
        KeyError: if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of token ids `l`.

    Raises:
        KeyError: if `l` contains an id that is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)

# Train Test Split

In [7]:
# Encode the whole corpus into one 1-D tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Split point: the leading 90% is the training set, the remainder is validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

Preview the input data:

In [8]:
# Print three 1000-character windows of the raw text: the beginning,
# an interior slice starting at offset 10000, and the end.
cprint(text[:1000])
cprint(text[10000:11000])
cprint(text[-1000:])

[93m<module> -> text[:1000]:[0m
('<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" '
 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
 'xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ '
 'http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">\n'
 '  <siteinfo>\n'
 '    <sitename>Wikipedia</sitename>\n'
 '    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n'
 '    <generator>MediaWiki 1.6alpha</generator>\n'
 '    <case>first-letter</case>\n'
 '      <namespaces>\n'
 '      <namespace key="-2">Media</namespace>\n'
 '      <namespace key="-1">Special</namespace>\n'
 '      <namespace key="0" />\n'
 '      <namespace key="1">Talk</namespace>\n'
 '      <namespace key="2">User</namespace>\n'
 '      <namespace key="3">User talk</namespace>\n'
 '      <namespace key="4">Wikipedia</namespace>\n'
 '      <namespace key="5">Wikipedia talk</namespace>\n'
 '      <namespace key="6">Image</namespace>\n'
 '      <namespace key="7">Image ta

In [9]:
# Inspect the training split: its Python type, its shape, and its first ten token ids.
cprint(type(train_data))
cprint(train_data.shape)
cprint(train_data[:10])


[93m<module> -> type(train_data):[0m
<class 'torch.Tensor'>
[93m<module> -> train_data.shape:[0m
torch.Size([8956860])
[93m<module> -> train_data[:10]:[0m
tensor([30, 79, 71, 70, 75, 67, 89, 75, 77, 75])


# Data Loader

[1] Karpathy's get_batch numpy based data loader
```python
# randomly select batch_size start positions in a single call
# torch.randint(end, size)
ix = torch.randint(
    len(data) - block_size, 
    (batch_size,)
)

# then stack the data
x = torch.stack([data[i:i+block_size] for i in ix])
y = torch.stack([data[i+1:i+block_size+1] for i in ix])

# usage
x, y = get_batch(split)
logits, loss = model(x, y)
```


[2] Phil Wang's TextSamplerDataset torch based data loader
```python
# randomly select a single start position per sample
# torch.randint(0, end, size)
ix = torch.randint(
    0, 
    len(self.data) - self.block_size - 1, 
    (1,)
)

# then slice out block_size + 1 consecutive tokens (the DataLoader stacks the samples into a batch)
full_seq = self.data[ix:ix + self.block_size + 1]

# usage
for x, y in train_loader:  # each batch is built from batch_size calls to __getitem__
    logits, loss = model(x, y)
```

The infinite training loop:
```python
def cycle(loader):
    while True:
        for data in loader:
            yield data

train_loader = cycle(DataLoader(train_dataset, batch_size=batch_size))
```

In [10]:
class TextSamplerDataset(Dataset):
    """Dataset that serves randomly positioned windows of a token sequence.

    Each lookup ignores the requested index, draws a fresh random start
    offset, and returns an input window together with the same window shifted
    one position ahead (the next-token targets).
    """

    def __init__(self, data, block_size):
        """Store the token sequence and the window length (coerced to int)."""
        self.data = data
        self.block_size = int(block_size)

    def __len__(self):
        # Number of whole block_size-long windows that fit in the data.
        return len(self.data) // self.block_size

    def __getitem__(self, index):
        """Return one random (input, target) pair, both moved to `device`."""
        # One extra token is taken so inputs and targets can overlap by
        # block_size - 1 positions.
        window = self.block_size + 1
        start = torch.randint(len(self.data) - window, (1,))
        chunk = self.data[start:start + window]
        inputs = chunk[:-1].to(device)
        targets = chunk[1:].to(device)
        return inputs, targets


# One random-window dataset per split.
train_dataset, val_dataset = (
    TextSamplerDataset(split, block_size) for split in (train_data, val_data)
)

# shuffle=False: the datasets already sample their windows at random.
train_loader, val_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (train_dataset, val_dataset)
)

In [11]:
# Pull a single batch from the training loader and print its inputs (x) and targets (y).
dataiter = iter(train_loader)
x, y = next(dataiter)
cprint(x, y)

[93m<module> -> x:[0m
tensor([[81, 79, 75, 80, 73,  2, 75, 79],
        [67, 86, 86, 71, 84, 16,  2,  2],
        [80, 77,  2, 52, 71, 67, 84, 70],
        [71, 85, 86, 63, 63,  1, 61, 61],
        [75, 84,  2, 81, 72,  2, 78, 75],
        [74, 71, 71, 84, 78, 71, 67, 70],
        [87, 73, 74,  2, 35, 70, 88, 71],
        [81, 78, 75, 70, 14,  2, 78, 75],
        [75, 80, 73,  2, 41, 84, 67, 69],
        [ 2, 69, 67, 80,  2, 67, 82, 82],
        [82, 75, 84, 71, 94, 52, 81, 79],
        [80, 73, 87, 67, 73, 71, 94, 41],
        [ 2, 74, 75, 85,  2, 89, 75, 72],
        [75, 77, 75, 82, 71, 70, 75, 67],
        [67, 85,  2, 71, 67, 84, 78, 91],
        [81, 87, 73, 74,  2, 86, 74, 71],
        [61, 61, 37, 74, 71, 79, 75, 69],
        [67, 78,  2, 50, 81, 85, 75, 86],
        [80, 70,  2, 43, 81, 89, 67,  2],
        [63, 11,  1, 12, 61, 61, 19, 26],
        [81, 69, 71, 67, 80, 81, 73, 84],
        [69, 74, 79, 81, 80, 70, 11,  1],
        [75, 78,  2, 69, 67, 16,  2, 26],
        [ 

# Model

In [12]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table. The row
    looked up for a token id is used directly as the logits over the next
    token (e.g. token id 24 reads off row 24 of the table).
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; also the logit dimension.
        """
        super().__init__()
        # The embedding width equals the vocabulary size so that a looked-up
        # row holds one score per candidate next token.
        embedding_dim = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer token ids of shape (batch_size, block_size).
            targets: optional token ids with the same number of elements as
                `idx`; the expected next token at every position.

        Returns:
            A ``(logits, loss)`` tuple. Without targets, `logits` has shape
            (batch_size, block_size, vocab_size) and `loss` is None. With
            targets, `logits` is flattened to
            (batch_size * block_size, vocab_size) and `loss` is the
            cross-entropy between those logits and the flattened targets.
        """
        logits = self.embedding(idx)  # (B, T, C)

        # Identity test: `== None` would be dispatched to the tensor's own
        # comparison operator instead of simply asking "was it passed?".
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) class indices.
        logits = logits.view(B * T, C)
        targets = targets.view(-1)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

# Build the bigram model over the corpus vocabulary and move its parameters to the selected device.
model = BigramLanguageModel(vocab_size)
model.to(device)

BigramLanguageModel(
  (embedding): Embedding(2102, 2102)
)

In [13]:
# AdamW over every model parameter, using the configured learning rate.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Training

If you want to train indefinitely, you can use the following code:

```python
def cycle(loader):
    while True:
        for data in loader:
            yield data

train_iter = cycle(train_loader)
val_iter = cycle(val_loader)
```

Or, for a single pass over each loader:
```python
train_iter = iter(train_loader)
val_iter = iter(val_loader)
```


In [15]:
def cycle(loader):
    """Yield items from `loader` forever, re-iterating it each time it is exhausted."""
    while True:
        yield from loader

# Endless batch streams, so neither training nor evaluation runs out of data.
train_iter = cycle(train_loader)
val_iter = cycle(val_loader)


def _mean_loss(batches, num_batches):
    """Return the model's mean loss over the first `num_batches` items of `batches`."""
    # range() comes first in zip() so no extra batch is pulled once the count is reached.
    losses = [
        model(xb, yb)[1].item()
        for _, (xb, yb) in zip(range(num_batches), batches)
    ]
    return np.mean(losses)


# The counter is named `step` so that the builtin iter() (used elsewhere in
# this notebook, e.g. iter(train_loader)) is not shadowed.
for step in range(max_iters):
    # Report losses every eval_interval steps and once more on the final step.
    if step % eval_interval == 0 or step == max_iters - 1:
        model.eval()
        with torch.no_grad():
            val_loss = _mean_loss(val_iter, eval_iters)
            # Iterates train_loader directly, so evaluation does not advance train_iter.
            train_loss = _mean_loss(train_loader, eval_iters)
        print(f"step {step}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
        model.train()

    # One optimisation step on the next training batch.
    x, y = next(train_iter)
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 8.1311, val loss 8.1381
step 99: train loss 8.0976, val loss 8.0863
