# Inference using packages

Differen gpt2 model options
- gpt2: This is the "small" version of GPT-2. It has 124 million parameters. 768 context size, 12 decode blocks
- gpt2-medium: This is the "medium" version of GPT-2. It has 355 million parameters.
- gpt2-large: This is the "large" version of GPT-2. It has 774 million parameters. 1280 context size 36 decode blocks
- gpt2-xl: This is the "extra large" version of GPT-2. It has 1.5 billion parameters.

@article{radford2019language,
  title={Language Models are Unsupervised Multitask Learners},
  author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  year={2019}
}


In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torchinfo import summary
import torch

gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', output_attentions=True, activation_function = 'gelu') # loading gpt2 from transformers library
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # loading gpt2 tokenizer from transformers library
print('\ngeneral arcatecture')
print(gpt2)
print('\nconfig')
print(gpt2.config)

print('\n arcatecture')
input_ids = torch.ones((1, 11), dtype=torch.long)

# Print detailed summary
model_summary = summary(
    gpt2,
    input_data=input_ids,
    depth=6,
    verbose=2,
    col_names=["input_size", "output_size", "num_params", "trainable"],  # Custom columns
    col_width=20,
    row_settings=["var_names"],
    dtypes=[torch.long],
    device="cpu"
)
print(model_summary)


general arcatecture
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): GELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

config
GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu",
  "architectu

In [9]:
# https://huggingface.co/docs/transformers/en/model_doc/gpt2
prompt = "my favorite music is"
input_ids = gpt2_tokenizer(prompt, return_tensors="pt").input_ids
# print(f'{type(input_ids)}{input_ids}, :{len(input_ids)}')
gen_tokens = gpt2.generate(input_ids, do_sample=True, temperature=0.9, max_length=100)
# print(f'{type(gen_tokens)} {gen_tokens}, :{len(gen_tokens[0])}')
gen_text = gpt2_tokenizer.batch_decode(gen_tokens)[0]

# gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token # set the padding token
# input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt') # tokenize input
# output = gpt2.generate(input_ids, max_length=max) # run inference
# generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True) # decode output tokens
print(gen_text)

# tok = gpt2_tokenizer.encode(tok, return_tensors='np', padding='max_length', truncation=True, max_length=max_token_len)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


my favorite music is "Kali," and I thought it would be nice to have an album in that genre. The title of the album was "Tribunal" which was a sort of psychedelic love song. I'm an artist that loves my work from both a physical and sonic distance. I'm an electronic producer. In those days of music, it was more of a physical thing, it was more of a sonic thing. When it was time for me to take a step out of all


# Inference using Numpy

In [10]:
import numpy as np
import heapq
import random
import torch.nn.functional as F
import torch
import copy
import tensorflow as tf

state_dict = gpt2.state_dict()
parameters = {}
for name, val in state_dict.items():
    parameters[name] = val.numpy().astype(np.float32)


for name, param in state_dict.items():
    ans = param.numpy()
    if 'h.' not in name: # each h.# refers to a transformer blocks
        print(f'{name}: {param.shape}')
        pass

for i in range(36):
    counter = 0
    for name, param in state_dict.items():
        ans = param.numpy()
        if 'h.'+ str(i)+ '.' in name and i == 0: # each h.# refers to a transformer block
            print(f'{name}: {ans.shape}')
            counter +=1
    # print(f'h.{i}: {counter}')

transformer.wte.weight: torch.Size([50257, 768])
transformer.wpe.weight: torch.Size([1024, 768])
transformer.ln_f.weight: torch.Size([768])
transformer.ln_f.bias: torch.Size([768])
lm_head.weight: torch.Size([50257, 768])
transformer.h.0.ln_1.weight: (768,)
transformer.h.0.ln_1.bias: (768,)
transformer.h.0.attn.c_attn.weight: (768, 2304)
transformer.h.0.attn.c_attn.bias: (2304,)
transformer.h.0.attn.c_proj.weight: (768, 768)
transformer.h.0.attn.c_proj.bias: (768,)
transformer.h.0.ln_2.weight: (768,)
transformer.h.0.ln_2.bias: (768,)
transformer.h.0.mlp.c_fc.weight: (768, 3072)
transformer.h.0.mlp.c_fc.bias: (3072,)
transformer.h.0.mlp.c_proj.weight: (3072, 768)
transformer.h.0.mlp.c_proj.bias: (768,)


In [11]:
parameters = parameters

def torch_to_numpy(tensor): # not nessessarry?
    if tensor.is_cuda:
        tensor = tensor.cpu()
    numpy_array = tensor.numpy()
    return numpy_array.copy()

def softmax(vec, temperature = 1):
    max_val = np.max(vec)
    exp = np.exp((vec - max_val)/ temperature)
    norm_vec = exp/np.sum(exp)
    assert 0.975 < np.sum(norm_vec) < 1.025
    return norm_vec

def log_softmax(vec, epsilon=1e-05): # puncuation
    max_val = np.max(vec)
    exp = np.exp(vec - max_val)
    log_sum_exp = max_val + np.log(np.sum(exp))
    return vec - log_sum_exp

# activation functions
def gelu(x):
    return 0.5*x*(1+np.tanh(np.sqrt(2/np.pi)*(x+0.044715*np.power(x, 3))))

def ReLU(x):
    # x(np_array) clip negitive activation
    return np.maximum(0, x)

# Transformer functions
def embed(tok):
    '''
    creat embeding matrix (token, token embeding vector 768)
    tok(np_array): 1d array of toek encodings
    paramaters(dict): dictionary maping names to tensors
    '''
    # sequence_length = tok.shape[0]
    # position_ids = torch.tensor(np.arange(sequence_length)) #indicies
    # tok = torch.tensor(tok)
    # return (gpt2.transformer.wte(tok) + gpt2.transformer.wpe(position_ids)).detach().numpy()


    # word token embeddings
    tok_emb = parameters['transformer.wte.weight'][tok,:]

    # word position embeddings
    sequence_length = tok.shape[0]
    position_ids = np.arange(sequence_length) #indicies
    position_emb = parameters['transformer.wpe.weight'][position_ids,:]
    assert tok_emb.shape == position_emb.shape
    return tok_emb + position_emb

def li_norm(x, gamma, beta, epsilon=1e-5):
    '''
    layer batch normalization
    x(np_array): array to normalize
    gamma(np_array): scailing paramater vector
    beta(np_array): offset paramater vector
    epsilon(float): div_0_error prevention
    '''

    u = np.mean(x, axis=-1, keepdims=True)
    s = np.mean(np.square(x-u))
    x = (x - u) / np.sqrt(s + epsilon)
    return x*gamma + beta

def self_attn(emb, block_num, attn_heads = 12):
    '''
    attention block. 12 heads per block
    emb(np_matrix): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''
    # attn
    attn_weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.weight']
    attn_bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.bias']

    context_matrix = np.empty((0,emb.shape[1]))
    tok_k_matrx = [] # each index is a token
    for tok_index, tok in enumerate(emb):
        qvk_vec = np.split(tok @ attn_weights + attn_bias, 3, axis=0)
        Q_m, K_m, V_m = [s.reshape(attn_heads,int(attn_weights.shape[1]/3/attn_heads)) for s in qvk_vec]
        assert Q_m.shape == K_m.shape == V_m.shape == (12, 64)
        tok_k_matrx.append(K_m)

        masked_k = np.empty((0,K_m.shape[1]))

        context_vec = np.array([])
        for head in range(attn_heads):

            masked_k = np.empty((0,K_m.shape[1]))
            for prev_tok in range(tok_index):
                masked_k = np.vstack((masked_k, tok_k_matrx[prev_tok][head]))
            assert masked_k.shape[1] == 64

            score_vec = Q_m[head] @ masked_k.T

            sub_context_vec = np.full((1 ,K_m.shape[1]), 0.0)
            for tok, score in enumerate(score_vec):
                sub_context_vec += score * K_m[head]
            context_vec = np.append(context_vec, sub_context_vec)

        context_vec_norm = softmax(context_vec, temperature = 1)
        context_matrix = np.vstack((context_matrix, context_vec_norm.reshape(1, -1)))

    # projection
    weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.bias']
    context_proj = (context_matrix @ weights) + bias
    return context_proj

def matrix_self_attn(emb, block_num, attn_heads = 12):
    '''
    attention block. 12 heads per block
    emb(np_matrix): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''
    def reshape_weights(x, w, b):
        *start, nx = x.shape
        nf = w.shape[-1]
        new_x = np.reshape(x, (-1, nx))
        new_w = np.reshape(w, (-1, nf))
        a = new_x @ new_w + b
        return(np.reshape(a, start + [nf]))

    def split_heads(x,attn_heads):
        *start, m = x.shape
        a = np.reshape(x, start + [attn_heads, m//attn_heads]) # matrix to tensor with heads dimention
        return np.transpose(a, [1, 0, 2]) #[heads, sequence, features]

    def merge_heads(x):
        x = np.transpose(x, [1, 0, 2]) #[sequence, heads, features]
        *start, a, b = x.shape
        return np.reshape(x, start + [a*b])

    # attn
    attn_weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.weight']
    attn_bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.bias']
    c = reshape_weights(emb, attn_weights, attn_bias)
    q,k,v = map(lambda x: split_heads(x, attn_heads), np.split(c, 3, axis=-1))
    assert q.shape == k.shape == v.shape
    print(q.shape)

    # multi_headed_attn
    w = np.matmul(q, np.transpose(k, (0, 2, 1)))
    w = w * 1/np.sqrt(np.float32(v.shape[-1]))

    *start, nd, ns = w.shape
    mask = np.full((nd, ns), float(1))
    mask[np.triu_indices_from(mask, k=1)] = float(0)
    attn_score_mask = w * mask
    attn_score_norm = np.apply_along_axis(lambda x: softmax(x, temperature = 1), axis=1, arr=attn_score_mask) # (1024, 1024)
    a = np.matmul(attn_score_norm, v)

    a = merge_heads(a)

    weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.bias']
    context_proj = (a @ weights) + bias
    return context_proj

def mlp(emb, block_num):
    '''
    2 layer multi layer perceptron with gelu activation
    emb(np_matrix): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''
    weights = parameters['transformer.h.'+ str(block_num) + '.mlp.c_fc.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.mlp.c_fc.bias']
    embl1 = (emb @ weights) + bias

    embl1 = gelu(embl1)

    weights = parameters['transformer.h.'+ str(block_num) + '.mlp.c_proj.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.mlp.c_proj.bias']
    return (embl1 @ weights) + bias

def top_k(k, vec):
    largest = heapq.nlargest(k, range(len(vec)), vec.take)
    # print(gpt2_tokenizer.decode(largest, skip_special_tokens=True)) # see words its picking from.
    probs = np.array([vec[i] for i in largest])
    probs = probs / np.sum(probs) # normalize after the selection
    assert 0.975 < np.sum(probs) < 1.025
    print(np.max(probs))
    return random.choices(largest, weights=probs, k=1)[0]


In [12]:
parameters = parameters

def decode_block(emb, block_num):
    '''
    runs decode block with ln_1 -> attn -> ln_2 -> mlp
    emb (np_array): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''

    # ln_1 normalization
    weights = parameters['transformer.h.'+ str(block_num) + '.ln_1.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.ln_1.bias']
    emb_norm1 = li_norm(emb, weights, bias)


    context_matrix = matrix_self_attn(emb_norm1, block_num)
    context_matrix += emb # Residual Connection


    # ln_2 normalization
    weights = parameters['transformer.h.'+ str(block_num) + '.ln_2.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.ln_2.bias']
    emb_norm2 = li_norm(context_matrix, weights, bias, epsilon=1e-5)

    emb_mlp = mlp(emb_norm2, block_num)

    emb_mlp = context_matrix + emb_mlp # Residual Connection
    return emb_mlp


In [13]:
def next_token(tok, transformer_blocks = 12):
    '''
    Generates the next token in sequence
    tok (np_array): 1D token encodigns
    parameters(dict): dictionary maping names to tensors
    '''
    emb = embed(tok) #(tokens, Embedding Size 768)

    block_result = copy.deepcopy(emb)
    for block in range(transformer_blocks):
        block_result = decode_block(block_result, block) # (tokens, Embedding Size 768)

    # ln_f
    weights = parameters['transformer.ln_f.weight']
    bias = parameters['transformer.ln_f.bias']
    head_norm = li_norm(block_result, weights, bias)

    # lm_head
    weights = parameters['lm_head.weight'] # (50257, 768)
    logit_matrix = head_norm @ weights.T

    # apply softmax to last words logit
    last_logit_distrabution = softmax(logit_matrix[-1], temperature = 1)
    return top_k(40, last_logit_distrabution)


In [14]:
def main(prompt, max_token_len = 100, num_generate = 10):
    '''
    creates generation feedback loop
    prompt(srt)
    start_dict(dict): name: paramaters
    '''
    for name, val in state_dict.items():
        parameters[name] = val.numpy().astype(np.float32)

    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    tok = gpt2_tokenizer.encode(prompt, return_tensors='np', padding='max_length', truncation=True, max_length=max_token_len).squeeze()

    prompt_tok_index = np.where(tok == gpt2_tokenizer.eos_token_id)[0][0]
    print(prompt_tok_index)
    for _ in range(num_generate):
        print(tok[:10])
        new_tok = next_token(tok)
        tok[prompt_tok_index] = new_tok
        prompt_tok_index += 1


    tok = gpt2_tokenizer.decode(tok, skip_special_tokens=True)
    return tok

print(main('once appon a time ', num_generate = 20))

6
[27078   598   261   257   640   220 50256 50256 50256 50256]
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
0.08254032314978159
[27078   598   261   257   640   220    11 50256 50256 50256]
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
0.08283973079705785
[27078   598   261   257   640   220    11    14 50256 50256]
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
0.08313037425249414
[27078   598   261   257   640   220    11    14   379 50256]
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
(12, 100, 64)
0.0832735888890104