# Inference using packages

Differen gpt2 model options
- gpt2: This is the "small" version of GPT-2. It has 124 million parameters.
- gpt2-medium: This is the "medium" version of GPT-2. It has 355 million parameters.
- gpt2-large: This is the "large" version of GPT-2. It has 774 million parameters.
- gpt2-xl: This is the "extra large" version of GPT-2. It has 1.5 billion parameters.


In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2') # loading gpt2 from transformers library
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # loading gpt2 tokenizer from transformers library
print(gpt2)

  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [3]:
input_text = "A long time ago in a galaxy far far away ..."
input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt') # tokenize input
output = gpt2.generate(input_ids, max_length=100) # run inference
generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True) # decode output tokens
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A long time ago in a galaxy far far away...

The first human-made planet was discovered in the early 1960s by a team of astronomers from the University of California, Berkeley.

The discovery of the first human-made planet was made by a team of astronomers from the University of California, Berkeley.

The first human-made planet was discovered in the early 1960s by a team of astronomers from the University of California, Berkeley.

The first human-made


# Inference using Numpy

In [6]:
state_dict = gpt2.state_dict()
parameters = {}
for name, val in state_dict.items():
    parameters[name] = val.numpy()


for name, param in state_dict.items():
    ans = param.numpy()
    if 'h.' not in name: # each h.# refers to a transformer blocks
        # print(f'{name}: {ans.shape}')
        pass

for i in range(12):
    counter = 0
    for name, param in state_dict.items():
        ans = param.numpy()
        if 'h.'+ str(i)+ '.' in name: # each h.# refers to a transformer block
            print(f'{name}: {ans.shape}')
            counter +=1
    # print(f'h.{i}: {counter}')

transformer.h.0.ln_1.weight: (768,)
transformer.h.0.ln_1.bias: (768,)
transformer.h.0.attn.c_attn.weight: (768, 2304)
transformer.h.0.attn.c_attn.bias: (2304,)
transformer.h.0.attn.c_proj.weight: (768, 768)
transformer.h.0.attn.c_proj.bias: (768,)
transformer.h.0.ln_2.weight: (768,)
transformer.h.0.ln_2.bias: (768,)
transformer.h.0.mlp.c_fc.weight: (768, 3072)
transformer.h.0.mlp.c_fc.bias: (3072,)
transformer.h.0.mlp.c_proj.weight: (3072, 768)
transformer.h.0.mlp.c_proj.bias: (768,)
transformer.h.1.ln_1.weight: (768,)
transformer.h.1.ln_1.bias: (768,)
transformer.h.1.attn.c_attn.weight: (768, 2304)
transformer.h.1.attn.c_attn.bias: (2304,)
transformer.h.1.attn.c_proj.weight: (768, 768)
transformer.h.1.attn.c_proj.bias: (768,)
transformer.h.1.ln_2.weight: (768,)
transformer.h.1.ln_2.bias: (768,)
transformer.h.1.mlp.c_fc.weight: (768, 3072)
transformer.h.1.mlp.c_fc.bias: (3072,)
transformer.h.1.mlp.c_proj.weight: (3072, 768)
transformer.h.1.mlp.c_proj.bias: (768,)
transformer.h.2.ln_1.w

In [27]:
import numpy as np
parameters = parameters

def torch_to_numpy(tensor): # not nessessarry?
    if tensor.is_cuda:
        tensor = tensor.cpu()
    numpy_array = tensor.numpy()
    return numpy_array.copy()

# TODO which softmax?
def softmax(vec): # the, and
    max_val = np.max(vec)
    exp = np.exp(vec - max_val)
    sum_exp = np.sum(exp)
    return exp/sum_exp

def log_softmax(vec, epsilon=1e-12): # puncuation
    max_val = np.max(vec)
    exp = np.exp(vec - max_val)
    log_sum_exp = max_val + np.log(np.sum(exp))
    return vec - log_sum_exp

def repo_softmax(vec): # error
    x = vec - np.argmin(vec)
    ex = np.exp(x)
    return ex/ np.sum(ex)

# activation functions
def gelu(x):
    # from https://github.com/openai/gpt-2.git
    # x(np_array) Gausien error liner unit
    return 0.5*x*(1+np.tanh(np.sqrt(2/np.pi)*(x+0.044715*np.power(x, 3))))

def ReLU(x):
    # x(np_array) clip negitive activation
    return np.maximum(0, x)

# Transformer functions
def embed(tok):
    '''
    creat embeding matrix (token, token embeding vector 768)
    tok(np_array): 1d array of toek encodings
    paramaters(dict): dictionary maping names to tensors
    '''
    # word token embeddings
    tok_emb = parameters['transformer.wte.weight'][tok,:]

    # word position embeddings
    sequence_length = tok.shape[0]
    position_ids = np.arange(sequence_length) #indicies
    position_emb = parameters['transformer.wpe.weight'][position_ids,:]
    return tok_emb + position_emb

def li_norm(x, gamma, beta, epsilon=1e-5):
    '''
    layer batch normalization
    x(np_array): array to normalize
    gamma(np_array): scailing paramater vector
    beta(np_array): offset paramater vector
    epsilon(float): div_0_error prevention
    '''
    u = np.mean(x, axis=-1, keepdims=True)
    s = np.var(x, axis=-1, keepdims=True)
    # s = np.mean(np.square(x-u))
    x = (x - u) / np.sqrt(s + epsilon)
    return x*gamma + beta

# def self_attn(emb, block_num):
#     '''
#     attention block
#     emb(np_matrix): (tokens, Embedding Size 768)
#     paramaters(dict): dictionary maping names to tensors
#     block_num: current head
#     '''

#     # query, key, vector
#     # c_attn.weight (768, 2304)
#     q_weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.weight'][:, :768]
#     k_weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.weight'][:, 768:1536]
#     v_weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.weight'][:, 1536:]
#     assert q_weights.shape == k_weights.shape == k_weights.shape == (768, 768)

#     q_bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.bias'][:768]
#     k_bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.bias'][768:1536]
#     v_bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.bias'][1536:]
#     assert q_bias.shape == k_bias.shape == v_bias.shape == (768,)

#     Q = emb @ q_weights
#     K = emb @ k_weights
#     V = emb @ v_weights

#     attn_scores = Q @ K.T

#     # #applied to every token embeding vector in embeding matrix
#     # context_matrix = np.zeros_like(emb)
#     # for i, tok_embed_vector in enumerate(emb): # loop through each token embeding
#         # qkv vectors (786,)
#         # query = (tok_embed_vector @ q_weights.T) + q_bias
#         # key = (tok_embed_vector @ k_weights.T) + k_bias
#         # value = (tok_embed_vector @ v_weights.T) + v_bias

#         # attn_score = query.reshape(query.shape[0], 1) @ key.reshape(1, key.shape[0]) # matrix
#         # attn_score = attn_score / (key.shape[0])**(1/2)
#         # attn_prob = np.zeros_like(attn_score) #(786, 786)
#         # for j, row in enumerate(attn_score):
#         #     attn_prob[j] = softmax(row)

#         # context_vec =  attn_prob @ value

#         weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.weight']
#         bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.bias']
#         context_vec_scaled = (context_vec @ weights) + bias # (768,)
#         context_matrix[i] = context_vec_scaled
#     return context_matrix

def get_head_weights(head_tot, weights, bias):
    '''
    head_tot(int)
    weights(np.matrix) (tok)
    bias(np.vec)
    return ittorator (tup): head_w, head_b
    '''
    head_width = int(weights.shape[1] / head_tot)
    start = 0
    end = head_width
    for _ in range(head_tot):
        yield (weights[:, start:end], bias[start:end])
        start += head_width
        end += head_width



def self_attn(emb, block_num):
    '''
    attention block. 12 heads per block
    emb(np_matrix): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''
    attn_weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.weight']
    attn_bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.bias']

    for head_param in get_head_weights(12, attn_weights, attn_bias):
        head_weight, head_bias = head_param



def mlp(emb, block_num):
    '''
    2 layer multi layer perceptron with gelu activation
    emb(np_matrix): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''
    weights = parameters['transformer.h.'+ str(block_num) + '.mlp.c_fc.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.mlp.c_fc.bias']
    embl1 = (emb @ weights) + bias

    embl1 = gelu(embl1)

    weights = parameters['transformer.h.'+ str(block_num) + '.mlp.c_proj.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.mlp.c_proj.bias']
    return (embl1 @ weights) + bias


In [28]:
parameters = parameters

def decode_block(emb, block_num):
    '''
    runs decode block with ln_1 -> attn -> ln_2 -> mlp
    emb (np_array): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''
    # ln_1 normalization
    weights = parameters['transformer.h.'+ str(block_num) + '.ln_1.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.ln_1.bias']
    emb_norm1 = li_norm(emb, weights, bias)


    context_matrix = self_attn(emb_norm1, block_num)
    context_matrix += emb # Residual Connection

    # ln_2 normalization
    weights = parameters['transformer.h.'+ str(block_num) + '.ln_2.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.ln_2.bias']
    emb_norm2 = li_norm(context_matrix, weights, bias, epsilon=1e-5)

    emb_mlp = mlp(emb_norm2, block_num)
    emb_mlp += emb_norm2    # Residual Connection
    return emb_mlp


In [29]:
def next_token(tok):
    '''
    Generates the next token in sequence
    tok (np_array): 1D token encodigns
    parameters(dict): dictionary maping names to tensors
    '''
    emb = embed(tok) #(tokens, Embedding Size 768)

    block_result = emb
    for block in range(12): # for every decode block
        block_result = decode_block(block_result, block) # (tokens, Embedding Size 768)

    # ln_f
    weights = parameters['transformer.ln_f.weight']
    bias = parameters['transformer.ln_f.bias']
    head_norm = li_norm(block_result, weights, bias)

    # lm_head
    weights = parameters['lm_head.weight'] # (50257, 768)
    logit_matrix = head_norm @ weights.T

    # apply softmax to last words logit
    last_logit_distrabution = softmax(logit_matrix[-1])
    next_token = np.argmax(last_logit_distrabution)
    return next_token


In [30]:
def main(prompt, state_dict, max_tok_gen = 5):
    '''
    creates generation feedback loop
    prompt(srt)
    start_dict(dict): name: paramaters
    '''
    tok = gpt2_tokenizer.encode(prompt, return_tensors='np')
    tok = tok.squeeze()
    print(tok)
    for i in range(max_tok_gen):
        new_tok = next_token(tok) ###
        tok = np.append(tok, new_tok)
        print(new_tok)

    token_decoded = gpt2_tokenizer.decode(tok, skip_special_tokens=True)
    return (token_decoded)

print(main('different input', state_dict))

[39799  5128]
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)
(768, 192)
(192,)


TypeError: unsupported operand type(s) for +: 'NoneType' and 'float'