<a href="https://colab.research.google.com/github/benisalla/BigMart-Data-Analysis-and-Prediction/blob/main/GPT_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Loading Dummy Data for test

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
path = "/content/drive/MyDrive/translator-first-try/europarl-v7.fr-en.en"
with open(path, 'r', encoding='utf-8') as f:
    text = f.read()
text[:100]

'Resumption of the session\nI declare resumed the session of the European Parliament adjourned on Frid'

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"vocab_size : {vocab_size}")

vocab_size : 315


In [4]:
# create a mapping from characters to integers
word2number = { ch:i for i,ch in enumerate(chars) }
number2word = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [word2number[c] for c in s]
decode = lambda l: ''.join([number2word[i] for i in l])

print(encode("I love neural networks"))
print(decode(encode("I love neural networks")))

[42, 1, 76, 79, 86, 69, 1, 78, 69, 85, 82, 65, 76, 1, 78, 69, 84, 87, 79, 82, 75, 83]
I love neural networks


In [5]:
import torch
text_data = torch.tensor(encode(text), dtype=torch.long)
print(text_data.shape, text_data.dtype)
print(text_data[:10])

torch.Size([301210536]) torch.int64
tensor([51, 69, 83, 85, 77, 80, 84, 73, 79, 78])


#Importing required libraries.

In [6]:
%%capture
!pip install transformers
!pip install wandb

In [7]:
import math
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import time
import pickle
from contextlib import nullcontext
import numpy as np
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import wandb

#The Core of GPT-2

<center>
  <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/89405673/258571749-4954bee7-47e4-4848-a842-affc25c3cc17.png" width="900" height="260"/>
</center>

#####Layer Norm (Normalization Layer)

In [8]:
class LayerNorm(nn.Module):

    """
        Layer Normalization module. (different than batch normalization and better in seq problems)

        Tensor          Type            Shape
        ===========================================================================
        input           long            (batch_size, seq_len, n_embd)
        ---------------------------------------------------------------------------
        output          float           (batch_size, seq_len, n_embd)
        ===========================================================================
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

#####Causal Self Attention

In [9]:
class CausalSelfAttention(nn.Module):

    """
        Causal Self-Attention Module
        ( why causal? well because attention mechanism is a causal operation not a correlation )

        Tensor          Type            Shape
        ===========================================================================
        input           long            (batch_size, seq_len, n_embd)
        ---------------------------------------------------------------------------
        output          float           (batch_size, seq_len, n_embd)
        ===========================================================================
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0 ; "n_embd % n_head should be 0."
        # key, query, and value projections for all heads, but do so in a batch-wise manner.
        # in normal Head Attention : nn.Linear(n_embd, head_size), however we are batching all of them

        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # final projection (applied on the output): it concatinate and merge diff info from heads

        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # Flash attention significantly accelerates GPU processing, but requires PyTorch version 2.0 or higher for support.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: Using the standard attention mechanism. Flash Attention requires PyTorch version 2.0 or higher.")
            # causal mask (communicate with previous tokens only)
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batchs, Time, channels
        # Calculating Q,K,V and performing the multi-head attention in a single matrix operation.
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)   # (B, nh, T, hs) headsize(hs) = n_embd // n_head
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)   # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)   # (B, nh, T, hs)

        # Causal self-attention ( computes all self-attention operations ) : (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention mechanism (less efficient)
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v   # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # Multi-HeadAttention : (B, nh, T, hs) ==> (B, T, nh*hs)

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


#####MLP(Multi-Layer Perceptron)

In [10]:
class MLP(nn.Module):

    """
        Multi-Layer Perceptron (MLP): computation layer.
        Non-linearity = complex mappings + feature extraction  + ... + processing data (reasoning)

        Tensor          Type            Shape
        ===========================================================================
        input           long            (batch_size, seq_len, n_embd)
        ---------------------------------------------------------------------------
        output          float           (batch_size, seq_len, n_embd)
        ===========================================================================

    """
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

#####Block(communication + computation)

In [11]:
class Block(nn.Module):

    """
        Transformer Block: communication layer (att mechanism) + computation layer ( feedforward).

        Tensor          Type            Shape
        ===========================================================================
        input           long            (batch_size, seq_len, n_embd)
        ---------------------------------------------------------------------------
        output          float           (batch_size, seq_len, n_embd)
        ===========================================================================
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))   # x = x + f(x) : called residual connection
        x = x + self.mlp(self.ln_2(x))    # or skip connections (very crucial in backprop)
        return x

#####Config Object

In [12]:
@dataclass
class GPTConfig:
    block_size: int = 1024      # The maximum sequence length allowed for inputs.
    vocab_size: int = 50304     # For GPT-2, it is 50257, and it's padded to the nearest multiple of 64 for efficiency.
    n_layer: int = 12           # The number of layers (blocks) in the transformer architecture.
    n_head: int = 12            # The number of attention heads in each multi-head attention layer.
    n_embd: int = 768           # The size of the embedding dimension for tokens.
    dropout: float = 0.0        # The dropout probability to prevent overfitting during training.
    bias: bool = True           # Whether to include biases in Linears and LayerNorms;
                                # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

#####Body of GPT

In [13]:
class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),                 # Embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),                 # positional encodings
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),    # Block X N
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)    # proj(n_embd) => vocab_size
        # weight tying: could be harmfull in futur (https://paperswithcode.com/method/weight-tying)
        # NOTE : this technique is used in auto-encoder: read the paper (https://arxiv.org/pdf/1608.05859v3.pdf)
        #        1) This operation requires the number of weights in these layers to be the same.
        #        2) We can achieve this by explicitly duplicating the same layer twice.
        #        3) Alternatively, as shown below, we can implicitly tie them together.
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)   # init all weights (remember that apply === foreach)

        # apply special scaled init to the residual projections, per GPT-2 paper
        for param_name, param in self.named_parameters():
            if param_name.endswith('c_proj.weight'):
                torch.nn.init.normal_(param, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (unit: M)
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        B, T = idx.size()
        assert T <= self.config.block_size, f"Max sequence length is {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=device)   # (T)

        tok_emb = self.transformer.wte(idx)                         # (B, T) ==> (B, T, C)
        pos_emb = self.transformer.wpe(pos)                         # (T) ==> (T, C)

        # Performing the forward pass through the entire GPT model.
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            print(f"logits : {logits.shape}")
            print(f"targets : {targets.shape}")
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            logits = self.lm_head(x[:, [-1], :])    # NOTE: LIST[-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        # Model surgery to decrease the block size if necessary for specific use cases.
        assert block_size <= self.config.block_size
        self.config.block_size = block_size

        # Crop the positional embeddings tensor to match the new block size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])

        # Loop through each transformer block and crop the attention biases if they exist.
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {}                 # empty dict {} by default
        assert all(k == 'dropout' for k in override_args)   # Only dropout is modifiable
        from transformers import GPT2LMHeadModel

        print(f"loading weights from pretrained gpt: {model_type}")

        # n_layer, n_head and n_embd are determined from available model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),    # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024),   # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280),   # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600),   # 1558M params
        }[model_type]

        print("forcing vocab_size=50257, block_size=1024, bias=True")
        config_args['vocab_size'] = 50257     # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024      # always 1024 for GPT model checkpoints
        config_args['bias'] = True            # always True for GPT model checkpoints

        # overriding the dropout rate if dropout c override_args
        if 'dropout' in override_args:
            print(f"Overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']

        # Re-Initializing GPT Model to fit the specified type requirement
        config = GPTConfig(**config_args)  # overriding the the previous config object
        model = GPT(config)                # Re-Init model
        sd = model.state_dict()            # dict of learnable params
        sd_keys = sd.keys()                # list of these params' names
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # Initializing the huggingface/transformers Pre-Trained model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # Our-Model params = Copy(Pre-Trained Model params)
        # Note : parameters most be aligned have same names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]   # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]          # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        # The OpenAI checkpoints use "Conv1D" module, but we are using a simple Linear layer.
        # Therefore, we have to transpose the weights when importing the checkpoints.
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters()}                 # All params
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}   # filter those who did not require grad
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]         # i.e. weight tensors( matmuls + embeddings + ...)
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]        # i.e. layer norms
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)

        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ Model Flops Utilization rate = empirical performance / peak performance """
        # Model Flops Utilization (MFU): It is a metric used to measure how efficiently
        # a deep learning model utilizes the available computational resources.
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt)    # per second (actual flops)
        flops_promised = 312e12                       # A100 GPU bfloat16 peak flops is 312 TFLOPS (theoretical flops)
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            # If the conditioning sequence exceeds the 'block_size', crop it to the most recent 'block_size' tokens.
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]

            # Forward pass through the model to obtain logits for the next token in the sequence.
            logits, _ = self(idx_cond)

            # Scale the logits at the final step by the 'temperature' to control the randomness of token sampling.
            logits = logits[:, -1, :] / temperature

            # Optionally crop the logits to only the top k options, ensuring more focused sampling.
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Apply the softmax function to convert logits into probabilities for sampling.
            probs = F.softmax(logits, dim=-1)

            # Sample from the probability distribution to get the next token indices.
            idx_next = torch.multinomial(probs, num_samples=1)

            # Append the sampled index to the running sequence and continue generation.
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

#Initializing Model and Train it

#####Model Configuration and Hyperparameters

In [32]:
#---------------------(NOTE: this is gpt-2 configuration)----------------------#
checkpts_dir = 'check_points'
eval_interval = 2000
log_interval = 1
eval_iters = 200
init_from = 'scratch'                  # ['scratch', 'resume', 'gpt2*']
save_checkpoint = True
eval_only = False

#-------------------------------(wandb logging)--------------------------------#
wandb_log = True
wandb_project = 'owt'
wandb_run_name = 'gpt2'                # 'run' + str(time.time())

#----------------------------------(data)--------------------------------------#
db_name = 'openwebtext'
grad_accum_steps = 5 * 8        # used to simulate larger batch sizes
batch_size = 12                 # grad_accum_steps>1 => micro-batch-size=12
block_size = 1024

#----------------------------------(model)-------------------------------------#
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0   # NOTE: while fine-tuning it is recommanded to use dropout=0.1+
bias = False

#----------------------------(adamw optimizer)---------------------------------#
learning_rate = 6e-4
max_iters = 600000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0                 # set(0.0) ==> disable clipping

#---------------------------------(lr decay)-----------------------------------#
decay_lr = True
warmup_iters = 2000
lr_decay_iters = 600000     # ~= max_iters per 'Chunk of Data'
min_lr = 6e-5               # ~= learning_rate/10 per 'Chunk of Data'

#--------------------------------(system)--------------------------------------#
device = 'cuda' if torch.cuda.is_available() else 'cpu'   # ['cpu', 'cuda', 'cuda:0', 'cuda:1', ... 'mps']
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'   # ['float32', 'bfloat16', 'float16']
compile = True      # use PyTorch 2.0 to compile the model to be faster

#####Data Loader

<center>
  <h6>Loading Data By chuncks using NumPy Memory Mapped Array</h6>
  <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/89405673/259172272-6434381c-b51a-4974-ba66-399ed7fc5778.png" />
</center>

In [33]:
################################################################################
#                                 Data Loader                                  #
#                Note: np.memmap loads the data when it is needed              #
################################################################################
data_dir = os.path.join('/content/drive/MyDrive/GPT_FROM_SCRATCH', db_name)
# train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
# val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')


# def get_batch(split):
#     data = train_data if split == 'train' else val_data
#     ix = torch.randint(len(data) - block_size, (batch_size,))
#     x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
#     y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
#     if device == 'cuda':
#         # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
#         x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
#     else:
#         x, y = x.to(device), y.to(device)
#     return x, y


#======> just to test the model in a tiny dataset
def get_batch(data):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    if device == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

#####Initialization

<center>
  <h6>AUTOMATIC MIXED PRECISION</h6>
  <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/89405673/259163442-802ef1b4-8130-44f5-86be-90e34dcd437e.png" width="900" height="200"/>
  <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/89405673/259163449-0fd88fd1-3c32-4b79-807e-e4e281819a13.png" width="500" height="140"/>
</center>

In [28]:
os.makedirs(checkpts_dir, exist_ok=True)            # Create checkpts dir if not exist
torch.manual_seed(2000)                             # Reproducibility
torch.backends.cuda.matmul.allow_tf32 = True        # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True              # allow tf32 on cudnn

# NOTE: float16 data type will automatically use a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=ptdtype)


#----------------------[Note: init_from='resume' or ...)]----------------------#
iter_num = 0
best_val_loss = 1e9
init_from = 'scratch'


#-----------------------------[Restore Vocab_Size]-----------------------------#
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")


#------------------------------[Init Our Model]--------------------------------#
model_args = dict(n_layer=n_layer,
                  n_head=n_head,
                  n_embd=n_embd,
                  block_size=block_size,
                  bias=bias,
                  vocab_size=None,
                  dropout=dropout)


#---------------[Building Model as specified here 'init_from' ]----------------#
print(f"Initializing The Model from : {init_from}")
if init_from == 'scratch':
    # NOTE: GPT-2 original vocab size is 50,257 but for effeciency we are using 50,304
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    ckpt_path = os.path.join(checkpts_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    # NOTE: checkpoints sometimes get _orig_mod prefix :( "i will fix it later"
    unwanted_prefix = '_orig_mod.'
    for key, value in list(state_dict.items()):
        if key.startswith(unwanted_prefix):
            state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    override_args = dict(dropout=dropout)  # initialize from OpenAI GPT-2 weights
    model = GPT.from_pretrained(init_from, override_args)
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)
else:
    print("Oups, There is no such option !!")


#------------------[Crop block size of the original model]---------------------#
if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size # so that the checkpoint will have the right value
model.to(device)

# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

print(f"scaler : {scaler}")

optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None   # free up memory

if compile:
    print("Compiling the model... (Estimated duration: ~= 1min)")
    model = torch.compile(model) # requires PyTorch 2.0

# if wandb_log:
#     wandb.init(project=wandb_project, name=wandb_run_name, config=config)

Initializing The Model from : scratch
number of parameters: 123.59M
scaler : <torch.cuda.amp.grad_scaler.GradScaler object at 0x7a817cb08310>
num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 25, with 19,200 parameters
using fused AdamW: True
Compiling the model... (Estimated duration: ~= 1min)


#####Utils


<center>
  <h6><strong>Learning Rate Graph</strong> ( Linear -> Cosine Decay -> lr_min )</h6>
  <img src="https://github-production-user-asset-6210df.s3.amazonaws.com/89405673/259077879-acb08446-83da-46ea-86ea-d4d4f3e332ac.png" />
</center>

In [35]:
################################################################################
#                   Estimation of Loss for Arbitrary Batches                   #
################################################################################
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


################################################################################
#         Utilizing a Cosine Learning Rate Decay Scheduler with Warmup         #
################################################################################
def get_lr(it):
    w = warmup_iters
    lr = learning_rate
    lr_di = lr_decay_iters
    #-------------------[ Linear Warmup ]------------------#
    if it < w:
        return lr * it / w
    #-------------------[ Enough WarmUp ]------------------#
    if it > lr_di:
        return min_lr
    #---------[ w < Cosine_Decay(min_lr) < lr_di ]---------#
    decay_ratio = (it - w) / (lr_di - w)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (lr - min_lr)


################################################################################
#                                 Save Checkpoints                             #
################################################################################
def save_checkpts(model, optimizer, model_args, iter_num, best_val_loss, config):
      print("Saving checkpoints ...")
      checkpoint = {
          'model': raw_model.state_dict(),
          'optimizer': optimizer.state_dict(),
          'model_args': model_args,
          'iter_num': iter_num,
          'best_val_loss': best_val_loss,
          'config': config,
          }
      print("Checkpoints Saved Successfully :)")
      torch.save(checkpoint, os.path.join(checkpts_dir, 'ckpt.pt'))


################################################################################
#                                   Evaluate Model                             #
################################################################################
def evaluate_model(iter_num, eval_interval, best_val_loss, save_checkpoint):
    if iter_num % eval_interval == 0 and False:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100,
            })
        if losses['val'] < best_val_loss or save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                save_checkpts(
                    model=raw_model.state_dict(),
                    optimizer=optimizer.state_dict(),
                    model_args=model_args,
                    iter_num=iter_num,
                    best_val_loss=best_val_loss,
                    config=config
                )

#####Training Loop

In [39]:
# X, Y = get_batch('train') # fetch the very first batch
X, Y = get_batch(text_data)
t0 = time.time()
local_iter_num = 0
raw_model =  model
running_mfu = -1.0

while True:
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluation (Train / val)



    if iter_num == 0 and eval_only:
        break

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(grad_accum_steps):
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / grad_accum_steps
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch('train')
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()

    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # get loss as float. note: this is a CPU-GPU sync point
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * grad_accum_steps
        if local_iter_num >= 5: # let the training loop settle a bit
            mfu = raw_model.estimate_mfu(batch_size * grad_accum_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

logits : (12, 1024, 50304)
targets : (12, 1024)


RuntimeError: ignored