In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


# ***Cerebras-GPT-13B parameters***

In [None]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("cerebras/Cerebras-GPT-13B")
config = AutoConfig.from_pretrained("cerebras/Cerebras-GPT-13B")
with init_empty_weights():
  model = AutoModelForCausalLM.from_config(config)
device_map = infer_auto_device_map(model)

In [None]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 13110.7M parameters


In [None]:
model.parameters

<bound method Module.parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 5120)
    (wpe): Embedding(2048, 5120)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): GELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.0, inplace=

In [None]:
device_map

{'transformer.wte': 'cpu',
 'transformer.wpe': 'cpu',
 'transformer.drop': 'cpu',
 'transformer.h.0': 'cpu',
 'transformer.h.1': 'cpu',
 'transformer.h.2': 'cpu',
 'transformer.h.3': 'cpu',
 'transformer.h.4': 'cpu',
 'transformer.h.5': 'cpu',
 'transformer.h.6': 'cpu',
 'transformer.h.7.ln_1': 'cpu',
 'transformer.h.7.attn': 'cpu',
 'transformer.h.7.ln_2': 'cpu',
 'transformer.h.8': 'disk',
 'transformer.h.9': 'disk',
 'transformer.h.10': 'disk',
 'transformer.h.11': 'disk',
 'transformer.h.12': 'disk',
 'transformer.h.13': 'disk',
 'transformer.h.14': 'disk',
 'transformer.h.15': 'disk',
 'transformer.h.16': 'disk',
 'transformer.h.17': 'disk',
 'transformer.h.18': 'disk',
 'transformer.h.19': 'disk',
 'transformer.h.20': 'disk',
 'transformer.h.21': 'disk',
 'transformer.h.22': 'disk',
 'transformer.h.23': 'disk',
 'transformer.h.24': 'disk',
 'transformer.h.25': 'disk',
 'transformer.h.26': 'disk',
 'transformer.h.27': 'disk',
 'transformer.h.28': 'disk',
 'transformer.h.29': 'disk

# **Initializing a new model**

Our first step is to freshly initialize a GPT-2 model. We’ll use the same configuration for our model as for the small GPT-2 model, so we load the pretrained configuration, make sure that the tokenizer size matches the model vocabulary size and pass the bos and eos (beginning and end of sequence) token IDs:

GPT2

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, AutoModelForCausalLM

In [None]:
context_length = 512
tokenizer = AutoTokenizer.from_pretrained("gpt2")
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

With that configuration, we can load a new model. Note that this is the first time we don’t use the from_pretrained() function, since we’re actually initializing a model ourself:

In [None]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [None]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

# ***Flan-t5-large-grammar-synthesis***

In [None]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM
#config = AutoConfig.from_pretrained("decapoda-research/llama-65b-hf", low_cpu_mem_usage=True)
config = AutoConfig.from_pretrained("pszemraj/flan-t5-large-grammar-synthesis")
with init_empty_weights():
  model = AutoModelForSeq2SeqLM.from_config(config) 
device_map = infer_auto_device_map(model)


Downloading (…)lve/main/config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

In [None]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 783.2M parameters


# ***How to calculate total number of parameters in PyTorch***

PyTorch doesn't have a function to calculate the total number of parameters as Keras does(model.count_params()), but it's possible to sum the number of elements for every parameter group:


In [None]:
pytorch_total_params = sum(p.numel() for p in model.parameters())


If you want to calculate only the trainable parameters:



In [None]:
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
pytorch_total_params

783150080

In [None]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 2987.480MB


In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# ***Count parameters with equations***

In [29]:
import torch

# https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/9
def count_parameters(model: torch.nn.Module) -> int:
    """ Returns the number of learnable parameters for a PyTorch model """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [30]:
from torch import nn

d_model = 4096
n_heads = 16  # must be a divisor of `d_model`

multi_head_attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=n_heads)
print(count_parameters(multi_head_attention))  
print(4 * (d_model * d_model + d_model))

67125248
67125248


In [31]:
from torch import nn

class TransformerFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(TransformerFeedForward, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.linear1 = nn.Linear(self.d_model, self.d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(self.d_ff, self.d_model)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x

d_model = 4096
d_ff = 4*d_model

feed_forward = TransformerFeedForward(d_model, d_ff)
print(count_parameters(feed_forward))  
print(2 * d_model * d_ff + d_model + d_ff)  

134238208
134238208


In [32]:
from torch import nn

d_model = 4096

layer_normalization = nn.LayerNorm(d_model)
print(count_parameters(layer_normalization))  
print(d_model * 2)  

8192
8192


In [33]:
def transformer_count_params(d_model=512, d_ff=2048, encoder=True, approx=False):
    """
    Calculate the number of parameters in Transformer Encoder/Decoder.
    Formulas are the following:
        multi-head attention: 4*(d_model^2 + d_model)
            if approx=False, 4*d_model^2 otherwise
        feed-forward: 2*d_model*d_ff + d_model + d_ff 
            if approx=False, 2*d_model*d_ff otherwise
        layer normalization: 2*d_model if approx=False, 0 otherwise
    Encoder block consists of: 
        1 multi-head attention block, 
        1 feed-forward net, and 
        2 layer normalizations.
    Decoder block consists of: 
        2 multi-head attention blocks, 
        1 feed-forward net, and 
        3 layer normalizations.
    :param d_model: (int) model dimensionality
    :param d_ff: (int) internal dimensionality of a feed-forward neural network
    :param encoder: (bool) if True, return the number of parameters of the Encoder, 
        otherwise the Decoder
    :param approx: (bool) if True, result is approximate (see formulas)
    :return: (int) number of learnable parameters in Transformer Encoder/Decoder
    """

    attention = 4 * (d_model ** 2 + d_model) if not approx else 4 * d_model ** 2
    feed_forward = 2 * d_model * d_ff + d_model + d_ff if not approx else 2 * d_model * d_ff
    layer_norm = 2 * d_model if not approx else 0

    return attention + feed_forward + 2 * layer_norm \
        if encoder else 2 * attention + feed_forward + 3 * layer_norm

In [34]:
from torch import nn

encoder_layer = nn.TransformerEncoderLayer(d_model=4096, nhead=16)
print(count_parameters(encoder_layer))  
print(transformer_count_params(d_model=4096, d_ff=16384, encoder=True, approx=False))  
print(transformer_count_params(d_model=4096, d_ff=16384, encoder=True, approx=True))   


decoder_layer = nn.TransformerDecoderLayer(d_model=4096, nhead=16)
print(count_parameters(decoder_layer))  
print(transformer_count_params(d_model=4096, d_ff=16384, encoder=False, approx=False))  
print(transformer_count_params(d_model=4096, d_ff=16384, encoder=False, approx=True))   


83924992
201379840
201326592
151058432
268513280
268435456
