In [1]:
from thop import profile
from importlib.metadata import version
import torch,torch.nn as nn
# Print the installed version of each library this notebook relies on, so the
# recorded outputs can be tied to a specific environment.
pkgs=['thop','torch']
for p in pkgs:
    pkg_version=version(p)
    print(f'{p} version: {pkg_version}')

thop version: 0.1.1-2209072238
torch version: 2.9.1


# Simple Benchmark With Fixed Batch Size

In [2]:
# Settings shared by every model size; the per-size entries in `model_configs`
# are merged into this dict before a model is built.
BASE_CONFIG={'vocab_size':50257,
             'context_length':1024,
             'drop_rate':0,
             'qkv_bias':True}
# Per-size architecture overrides: embedding width, number of transformer
# blocks, and number of attention heads.
model_configs={'gpt-small (124M)':{'emb_dim':768,
                                   'n_layers':12,
                                   'n_heads':12},
               'gpt-medium (355M)':{'emb_dim':1024,
                                   'n_layers':24,
                                   'n_heads':16},
               'gpt-large (774M)':{'emb_dim':1280,
                                   'n_layers':36,
                                   'n_heads':20},
               'gpt-xl (1558M)':{'emb_dim':1600,
                                 'n_layers':48,
                                 'n_heads':25}}
# Pick whichever accelerator this machine actually has and fall back to the
# CPU, so the notebook still runs from a fresh kernel on non-Apple hardware.
# On a machine where only MPS is available this still selects 'mps'.
if torch.cuda.is_available():
    device=torch.device('cuda')
elif getattr(torch.backends,'mps',None) is not None and torch.backends.mps.is_available():
    device=torch.device('mps')
else:
    device=torch.device('cpu')
batch_size=2
# Random token ids of shape (batch_size, context_length). The bounds are read
# from BASE_CONFIG so the input cannot drift out of sync with the model config.
input_tensor=torch.randint(0,BASE_CONFIG['vocab_size'],(batch_size,BASE_CONFIG['context_length'])).to(device)
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift."""
    def __init__(self,emb_dim):
        """Create a unit scale and a zero shift, each of size ``emb_dim``."""
        super().__init__()
        self.eps=1e-5
        self.scale=nn.Parameter(torch.ones(emb_dim))
        self.shift=nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        """Return ``x`` normalized along its last dimension, then scaled and shifted."""
        centered=x-x.mean(dim=-1,keepdim=True)
        # unbiased=False: population variance (divide by N, not N-1).
        variance=x.var(dim=-1,keepdim=True,unbiased=False)
        # eps keeps the division finite when the variance is zero.
        normalized=centered/torch.sqrt(variance+self.eps)
        return self.scale*normalized+self.shift
class GELU(nn.Module):
    """GELU activation, tanh approximation.

    Computes ``0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x**3)))`` element-wise.
    The module has no parameters or buffers.
    """
    # sqrt(2/pi) as a plain Python float, computed once. This avoids allocating
    # a new CPU tensor and running a sqrt on it in every forward pass, and
    # avoids mixing a CPU tensor into arithmetic on accelerator tensors.
    _SQRT_2_OVER_PI=(2/torch.pi)**.5
    def forward(self,x):
        """Apply the activation to ``x``; the result has the shape and dtype of ``x``."""
        return .5*x*(1+torch.tanh(self._SQRT_2_OVER_PI*(x+.044715*torch.pow(x,3))))
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x ``emb_dim``, apply GELU, project back to ``emb_dim``."""
    def __init__(self,cfg):
        """Build the MLP from ``cfg['emb_dim']``."""
        super().__init__()
        width=cfg['emb_dim']
        hidden=4*width
        self.layers=nn.Sequential(
            nn.Linear(width,hidden),
            GELU(),
            nn.Linear(hidden,width),
        )
    def forward(self,x):
        """Run ``x`` through expand -> GELU -> project."""
        return self.layers(x)
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of size ``d_out//num_heads``, attended with a mask that
    hides later positions, merged back, and passed through an output projection.
    """
    def __init__(self,d_in,d_out,context_length,dropout,num_heads,qkv_bias=False):
        """Create the projections, the dropout layer and the causal mask buffer.

        ``d_out`` must be divisible by ``num_heads``; ``context_length`` is the
        side length of the square mask and so the longest supported sequence.
        """
        super().__init__()
        assert d_out%num_heads==0,'d_out must be divisible by n_heads.'
        self.d_out=d_out
        self.num_heads=num_heads
        self.head_dim=d_out//num_heads
        self.W_query=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_key=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.W_value=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.out_proj=nn.Linear(d_out,d_out)
        self.dropout=nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the positions to hide.
        upper=torch.triu(torch.ones(context_length,context_length),diagonal=1)
        self.register_buffer('mask',upper)
    def _split_heads(self,projected,b,num_tokens):
        """Reshape (b, num_tokens, d_out) to (b, num_heads, num_tokens, head_dim)."""
        per_head=projected.view(b,num_tokens,self.num_heads,self.head_dim)
        return per_head.transpose(1,2)
    def forward(self,x):
        """Attend over ``x`` of shape (b, num_tokens, d_in); returns (b, num_tokens, d_out)."""
        b,num_tokens,_=x.shape
        k=self._split_heads(self.W_key(x),b,num_tokens)
        q=self._split_heads(self.W_query(x),b,num_tokens)
        v=self._split_heads(self.W_value(x),b,num_tokens)
        scores=q@k.transpose(2,3)
        # Crop the stored mask to the current sequence length and set hidden
        # positions to -inf so softmax gives them zero weight.
        hidden_positions=self.mask.bool()[:num_tokens,:num_tokens]
        scores.masked_fill_(hidden_positions,-torch.inf)
        # Scale by sqrt(head_dim) before the softmax.
        weights=torch.softmax(scores/k.shape[-1]**.5,dim=-1)
        weights=self.dropout(weights)
        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads.
        merged=(weights@v).transpose(1,2).reshape(b,num_tokens,self.d_out)
        return self.out_proj(merged)
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual connection."""
    def __init__(self,cfg):
        """Build the attention, feed-forward, norm and dropout sub-modules from ``cfg``."""
        super().__init__()
        width=cfg['emb_dim']
        self.att=MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg['context_length'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff=FeedForward(cfg)
        self.norm1=LayerNorm(width)
        self.norm2=LayerNorm(width)
        # One dropout module is shared by both residual branches.
        self.drop_resid=nn.Dropout(cfg['drop_rate'])
    def forward(self,x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Attention branch: normalize, attend, drop, add back the input.
        x=self.drop_resid(self.att(self.norm1(x)))+x
        # Feed-forward branch, same pattern.
        x=self.drop_resid(self.ff(self.norm2(x)))+x
        return x
class GPTModel(nn.Module):
    """Token + position embeddings, a stack of transformer blocks, a final norm and an output head."""
    def __init__(self,cfg):
        """Build the model from ``cfg`` (vocab_size, context_length, emb_dim, n_layers, drop_rate, ...)."""
        super().__init__()
        self.tok_emb=nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.pos_emb=nn.Embedding(cfg['context_length'],cfg['emb_dim'])
        self.drop_emb=nn.Dropout(cfg['drop_rate'])
        blocks=[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks=nn.Sequential(*blocks)
        self.final_norm=LayerNorm(cfg['emb_dim'])
        # The output head maps back to vocabulary size and has no bias term.
        self.out_head=nn.Linear(cfg['emb_dim'],cfg['vocab_size'],bias=False)
    def forward(self,in_idx):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        _,seq_len=in_idx.shape
        # Position ids 0..seq_len-1, created on the same device as the input.
        positions=torch.arange(seq_len,device=in_idx.device)
        x=self.tok_emb(in_idx)+self.pos_emb(positions)
        x=self.drop_emb(x)
        hidden=self.final_norm(self.trf_blocks(x))
        return self.out_head(hidden)
# Profile one forward pass per model size. `profile` returns a MAC count and a
# parameter count; each MAC is reported here as two FLOPs.
for size in model_configs:
    # Merge this size's overrides into the shared config (mutates BASE_CONFIG).
    BASE_CONFIG.update(model_configs[size])
    model=GPTModel(BASE_CONFIG).bfloat16()
    model.to(device)
    macs,params=profile(model,inputs=(input_tensor,),verbose=False)
    flops=2*macs
    print(f'{size:18}: {flops:.1e} FLOPS')
    # Drop the model and release cached accelerator memory before building the
    # next one. The cache call follows the backend actually in use, so the
    # loop is not tied to MPS; on CPU there is no cache to clear.
    del model
    if device.type=='mps':
        torch.mps.empty_cache()
    elif device.type=='cuda':
        torch.cuda.empty_cache()

gpt-small (124M)  : 5.1e+11 FLOPS
gpt-medium (355M) : 1.4e+12 FLOPS
gpt-large (774M)  : 3.2e+12 FLOPS
gpt-xl (1558M)    : 6.4e+12 FLOPS
