In [None]:
# Download the Tiny Shakespeare corpus (wget saves it as input.txt in the working
# directory, which is the path DataLoaderLite opens) and install the tiktoken tokenizer.
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!pip install tiktoken

In [None]:
import torch
import torch.nn as nn
from dataclasses import dataclass
import torch.nn.functional as F
import time
import math
import inspect

In [None]:
import tiktoken
# Module-level GPT-2 BPE tokenizer; reused by the final cell to decode generated token ids.
enc = tiktoken.get_encoding('gpt2')

class DataLoaderLite:
    """Sequential mini-batch loader over the GPT-2 tokenization of ``input.txt``.

    The whole file is tokenized once and kept as a 1-D ``torch.long`` tensor.
    Each call to ``next_batch`` returns the next ``B * T`` tokens as inputs and
    the same window shifted by one token as targets, wrapping back to the start
    of the corpus when a full window no longer fits.

    Args:
        B: number of sequences per batch.
        T: number of tokens per sequence.

    Raises:
        ValueError: if the corpus has fewer than ``B * T + 1`` tokens, i.e. not
            even one batch (inputs plus the one-token lookahead) can be formed.
    """

    def __init__(self,B, T):
        self.B = B
        self.T = T
        # Explicit encoding: without it the decoded text (and therefore the token
        # stream) depends on the platform's default locale.
        with open('input.txt', encoding='utf-8') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens, dtype=torch.long)
        print(f"loaded {len(self.tokens)} tokens")
        # Fail early with a clear message instead of an opaque .view() shape error
        # on the first next_batch() call.
        if len(self.tokens) < B * T + 1:
            raise ValueError(
                f"input.txt has {len(self.tokens)} tokens, but a batch of B={B}, T={T} "
                f"needs at least {B * T + 1}"
            )
        print(f"1 epoch = {len(self.tokens) // (B*T)}")
        self.current_position = 0

    def next_batch(self):
        """Return the next ``(x, y)`` pair, each of shape ``(B, T)``.

        ``y`` is ``x`` shifted left by one token, so ``y[b, t]`` is the token that
        follows ``x[b, t]`` in the corpus.
        """
        # One extra token so the last input position still has a target.
        current_tokens = self.tokens[self.current_position: self.current_position + self.B*self.T+1]
        x = current_tokens[:-1].view(self.B, self.T)
        y = current_tokens[1:].view(self.B, self.T)
        self.current_position += self.B * self.T
        # Rewind when the *next* window (including its lookahead token) would run off the end.
        if self.current_position + self.B * self.T + 1 > len(self.tokens):
            self.current_position = 0
        return x, y

In [None]:
class MLP(nn.Module):
    """Position-wise feed-forward sub-layer.

    Expands the embedding width by a factor of four, applies a tanh-approximated
    GELU, and projects back down to the embedding width.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden_width = 4 * width
        self.c_fc = nn.Linear(width, hidden_width)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden_width, width)
        # Tag read by GPT._init_weights (via hasattr) to shrink this layer's init std.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Apply expand -> GELU -> project to ``x`` along its last dimension."""
        activated = self.gelu(self.c_fc(x))
        return self.c_proj(activated)

class CausalSelfAttetion(nn.Module):
    """Multi-head causal self-attention.

    A single linear layer produces queries, keys and values; they are split into
    ``config.n_head`` heads, attended with a causal mask through
    ``F.scaled_dot_product_attention``, re-assembled and passed through an
    output projection.

    Args:
        config: object exposing ``n_embd``, ``n_head`` and ``block_size``.

    Raises:
        ValueError: if ``n_embd`` is not divisible by ``n_head`` (the per-head
            width ``n_embd // n_head`` would not tile the embedding).
    """

    def __init__(self, config):
        super().__init__()
        # Validate up front; otherwise the mismatch only surfaces as a shape
        # error inside forward().
        if config.n_embd % config.n_head != 0:
            raise ValueError(
                f"n_embd ({config.n_embd}) must be divisible by n_head ({config.n_head})"
            )
        self.config = config
        # Fused projection: one matmul yields q, k and v side by side.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=True)
        self.c_proj = nn.Linear(config.n_embd,config.n_embd, bias=True)
        # Tag read by GPT._init_weights (via hasattr) to shrink this layer's init std.
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # Upper-triangular mask (1 strictly above the diagonal). forward() does not
        # read it -- only the commented reference implementation below does -- but the
        # buffer is kept so state_dict keys stay unchanged for GPT.from_pretrained,
        # which filters keys ending in '.attn.bias'.
        self.register_buffer('bias', torch.triu(torch.ones(config.block_size, config.block_size), diagonal=1).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Attend over ``x`` of shape ``(B, T, C)`` and return a tensor of the same shape."""
        B, T, C = x.size()
        n_head = self.config.n_head
        head_dim = C // n_head
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.config.n_embd, dim = 2)
        # (B, T, C) -> (B, n_head, T, head_dim) so each head attends independently.
        q = q.view(B, T, n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, n_head, head_dim).transpose(1, 2)

        # Reference (manual) implementation of the call below:
        # att = q @ k.transpose(-2, -1) * (k.size(-1) ** -0.5)
        # att = att.masked_fill(self.bias[:,:,:T,:T] == 1,  float("-inf"))
        # probs = F.softmax(att, dim=-1)
        # y = probs @ v

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # Merge heads back: (B, n_head, T, head_dim) -> (B, T, C).
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y

class Block(nn.Module):
    """One transformer layer: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttetion(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = self.attn(self.ln_1(x))
        x = x + attended
        transformed = self.mlp(self.ln_2(x))
        return x + transformed

@dataclass
class GPTConfig:
    """Hyperparameters read by GPT and its sub-modules (Block, CausalSelfAttetion, MLP)."""
    block_size: int = 1024  # longest sequence GPT.forward accepts (asserted there)
    vocab_size: int = 50688  # 512 * 99; GPT.from_pretrained overrides this with 50257
    n_layer: int = 12  # number of Block modules stacked in GPT
    n_head: int = 12  # attention heads per block
    n_embd: int = 768  # embedding width shared by every sub-module
    
class GPT(nn.Module):
    """Decoder-only transformer language model laid out like GPT-2.

    The sub-module names (``transformer.wte``, ``transformer.wpe``,
    ``transformer.h``, ``transformer.ln_f``, ``lm_head``) are relied on by
    ``from_pretrained``, which copies Hugging Face GPT-2 tensors key by key, so
    they must not be renamed.
    """

    # Architecture hyperparameters of the checkpoints from_pretrained can load.
    # The table is also what validates ``model_type``, so the accepted names and
    # the available configurations cannot drift apart.
    _PRETRAINED_CONFIGS = {
        'gpt2': dict(n_layer=12, n_head=12, n_embd=768),
        'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),
        'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280),
        'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600),
    }

    def __init__(self, config):
        """Build the model.

        Args:
            config: object exposing ``block_size``, ``vocab_size``, ``n_layer`` and
                ``n_embd`` (plus whatever ``Block`` reads), e.g. a ``GPTConfig``.
        """
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # One learned vector per position, sized from the config so that it always
            # agrees with the block_size check in forward().
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight sharing scheme: the token embedding and the output projection use
        # the very same Parameter object.
        self.transformer.wte.weight = self.lm_head.weight

        # self.apply visits every sub-module and calls self._init_weights on it.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise ``module`` in place if it is an ``nn.Linear`` or ``nn.Embedding``.

        Weights are drawn from N(0, 0.02^2). Linear layers tagged with
        ``NANOGPT_SCALE_INIT`` use a std further multiplied by
        ``(2 * n_layer) ** -0.5``. Linear biases are zeroed.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            x: ``(B, T)`` tensor of token indices with ``T <= config.block_size``.
            targets: optional tensor of target token indices with as many elements
                as ``x``; when given, the loss is computed against it.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape ``(B, T, vocab_size)`` and
            ``loss`` is a scalar tensor, or ``None`` when ``targets`` is ``None``.
        """
        B, T = x.size()
        assert T <= self.config.block_size, f"Cannot forward the sequence of length {T}, block_size is smaller"
        pos = torch.arange(0, T, dtype=torch.long, device=x.device)
        pos_embd = self.transformer.wpe(pos)   # (T, n_embd), broadcast over the batch
        tok_embd = self.transformer.wte(x)     # (B, T, n_embd)
        x = pos_embd + tok_embd

        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device):
        """Create an AdamW optimizer with weight decay applied only to >=2-D parameters.

        Args:
            weight_decay: decay coefficient for the decayed parameter group.
            learning_rate: initial learning rate for both groups.
            device: ``torch.device`` or device string (e.g. ``'cuda:0'``); only its
                type is used, to decide whether the fused AdamW kernel is requested.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        device_type = torch.device(device).type  # accepts both torch.device and str
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        print(f"using fused AdamW: {use_fused}")
        # Only forward the keyword when it is actually requested, so AdamW versions
        # whose signature lacks ``fused`` are not handed an unknown argument.
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a model and load Hugging Face GPT-2 weights into it.

        Args:
            model_type: one of ``'gpt2'``, ``'gpt2-medium'``, ``'gpt2-large'``,
                ``'gpt2-xl'``.

        Returns:
            A model whose parameters were overwritten with the pretrained tensors.

        Raises:
            AssertionError: on an unknown ``model_type`` or when the two state dicts
                do not line up in key count or tensor shape.
        """
        assert model_type in cls._PRETRAINED_CONFIGS, f"unknown model_type {model_type!r}"
        from transformers import GPT2LMHeadModel
        print("Loading weight from pretrained %s" % model_type)

        # Copy the entry so the shared class-level table is never mutated.
        config_args = dict(cls._PRETRAINED_CONFIGS[model_type])
        config_args['vocab_size'] = 50257
        config_args['block_size'] = 1024

        config = GPTConfig(**config_args)
        model = cls(config)
        sd = model.state_dict()
        # '.attn.bias' is the causal-mask buffer registered by the attention module,
        # not a learned tensor, so it is excluded from the comparison.
        sd_keys = [key for key in sd.keys() if not key.endswith('.attn.bias')]
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # Drop the equivalent entries on the Hugging Face side
        # (presumably mask buffers there as well -- verify against the HF model).
        sd_keys_hf = [
            key for key in sd_hf.keys()
            if not key.endswith('.attn.bias') and not key.endswith('.attn.masked_bias')
        ]
        # These tensors are copied transposed: their Hugging Face shape is asserted
        # below to be the reverse of the corresponding nn.Linear weight here.
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        assert len(sd_keys) == len(sd_keys_hf), f"{len(sd_keys)} != {len(sd_keys_hf)}"

        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                assert sd[k].shape == sd_hf[k].shape[::-1]
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                assert sd[k].shape == sd_hf[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

In [None]:
# Scratch search for padded vocabulary sizes: print every integer in
# [50257, 52000) that is divisible by 2 more than seven times, together with
# that exponent.
for i in range(50257, 52000):
    # i & -i isolates the lowest set bit; its bit position is the number of
    # times i can be halved exactly.
    twos = (i & -i).bit_length() - 1
    if twos > 7:
        print(i, twos)

In [None]:
# 50688 is an exact multiple of 512: the displayed quotient is 99.0.
50688 / 512

In [None]:
# Global batch configuration. The optimizer should see total_batch_size tokens
# per step; since one forward/backward pass covers only B * T tokens, gradients
# are accumulated over grad_accum_steps micro-batches.
total_batch_size = 2 ** 19  # 524288 tokens, ~0.5M
B, T = 16, 1024  # micro batch size, sequence length
grad_accum_steps, _leftover = divmod(total_batch_size, B * T)
assert _leftover == 0, "make sure total_batch_size is divisible by B * T"
print(f"total desired batch size: {total_batch_size}")
print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")

In [None]:
#model = GPT.from_pretrained('gpt2')

# Seed CPU and (when present) CUDA generators for reproducible initialisation.
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# Use the same micro-batch shape that grad_accum_steps was derived from. A
# different B here would silently change the effective batch size to
# grad_accum_steps * B * T tokens instead of total_batch_size. If this does not
# fit in memory, lower B in the batch-size cell rather than here.
train_loader = DataLoaderLite(B=B, T=T)
#torch.set_float32_matmul_precision('high')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GPT(GPTConfig())
#model = torch.compile(model)
if torch.cuda.device_count() > 1:
    print(f"Используем {torch.cuda.device_count()} GPU")
    # Wrap the model in DataParallel so each batch is split across the visible GPUs.
    model = nn.DataParallel(model)
model = model.to(device)

In [None]:
# Learning-rate schedule settings.
max_lr = 6e-4            # peak rate, reached at the end of warmup
min_lr = max_lr * 0.1    # floor the cosine decay settles at
warmup_steps = 10        # number of linear warmup steps
max_steps = 200          # last step of the cosine decay
total_iterations=500     # number of optimizer steps the training loop runs


def get_lr(it):
    """Return the learning rate for optimizer step ``it``.

    The schedule has three phases:
      * ``it < warmup_steps``: linear ramp from ``max_lr / warmup_steps`` up to ``max_lr``;
      * ``it > max_steps``: constant ``min_lr``;
      * otherwise: cosine decay from ``max_lr`` down to ``min_lr``.
    """
    # Phase 1: linear warmup (it + 1 so that step 0 does not get a zero rate).
    if it < warmup_steps:
        ramped = max_lr * (it+1)
        return ramped / warmup_steps
    # Phase 3: past the decay horizon, stay at the floor.
    if it > max_steps:
        return min_lr
    # Phase 2: cosine decay; progress runs from 0 to 1 across the decay window.
    decay_window = max_steps - warmup_steps
    progress = (it - warmup_steps) / decay_window
    assert 0 <= progress <= 1
    cosine = math.cos(math.pi * progress)
    weight = 0.5 * (1.0 + cosine)  # goes from 1 down to 0
    return min_lr + weight * (max_lr - min_lr)

In [None]:
# nn.DataParallel is only applied when several GPUs are visible, so `model.module`
# does not always exist. Resolve the underlying GPT once and use it for everything
# that is not a forward pass.
raw_model = model.module if isinstance(model, nn.DataParallel) else model
# The initial lr is overwritten from get_lr() before every optimizer step.
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device=device)
for step in range(total_iterations):
    t0 = time.time()
    optimizer.zero_grad()
    loss_accum = 0.0
    # Accumulate gradients over several micro-batches to emulate one large batch.
    for grad_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        # with torch.autocast(device_type=device.type, dtype=torch.float16):
        #     logits, loss = model(x, y)
        logits, loss = model(x, y)
        # Scale so that the summed gradients equal the gradient of the mean loss.
        loss = loss / grad_accum_steps
        # Under DataParallel the loss comes back with one entry per replica.
        if loss.dim() > 0:
            loss = loss.mean()
        loss_accum += loss.detach()
        loss.backward()

    norm = torch.nn.utils.clip_grad_norm_(raw_model.parameters(), 1.0)
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    # Wait for queued GPU work so the timing below is meaningful; calling this
    # without CUDA would raise, hence the guard.
    if device.type == 'cuda':
        torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0)
    tokens_per_sec = (train_loader.B * train_loader.T * grad_accum_steps) / dt

    print(f"step {step:4d} | loss: {loss_accum.item():.4f} | norm: {norm.item():.2f} | dt: {dt:.2f}s | tok/sec: {tokens_per_sec:.2f}")

In [None]:
# `model` may or may not be wrapped in nn.DataParallel (it is wrapped only when
# several GPUs are visible); the wrapper does not expose `lm_head`, so look it
# up on the underlying GPT.
raw_model = model.module if isinstance(model, nn.DataParallel) else model
lm_head = raw_model.lm_head.weight.numel()
# parameters() yields each distinct tensor once, so the weight shared between
# lm_head and the token embedding is counted a single time.
num_params = sum(param.numel() for param in raw_model.parameters())

print(f"total_number of parameters: {num_params}")
print(f"number of parameters in head linear layer {lm_head}")
print(f"proportion of parameters: {(lm_head / num_params):.4f}")
#print(f"new total number of parameters after sharing wte and lm_head {num_params - lm_head}")

In [None]:
# Sample a continuation with top-50 sampling, starting from the single token id 0.
max_length = 300
torch.manual_seed(42)
torch.cuda.manual_seed(42)

x = torch.zeros((1,1), dtype=torch.long).to(device)

# x grows by exactly one token per iteration, so this runs until it holds max_length tokens.
with torch.no_grad():
    for _step in range(max_length - x.size(1)):
        all_logits, _unused_loss = model(x)
        # Only the prediction for the last position is needed.
        last_logits = all_logits[:, -1, :]
        next_probs = F.softmax(last_logits, dim=-1)
        # Restrict sampling to the 50 most likely tokens.
        top_probs, top_ids = torch.topk(next_probs, 50, dim=-1)
        choice = torch.multinomial(top_probs, 1)
        # Map the position inside the top-50 list back to a vocabulary id.
        picked = torch.gather(top_ids, -1, choice)
        x = torch.cat([x, picked], dim=1)

In [None]:
# Turn every generated row of token ids back into text and print it on its own line.
for token_row in x:
    token_ids = token_row.cpu().tolist()
    print(enc.decode(token_ids))