# Inference using packages

Differen gpt2 model options
- gpt2: This is the "small" version of GPT-2. It has 124 million parameters.
- gpt2-medium: This is the "medium" version of GPT-2. It has 355 million parameters.
- gpt2-large: This is the "large" version of GPT-2. It has 774 million parameters.
- gpt2-xl: This is the "extra large" version of GPT-2. It has 1.5 billion parameters.


In [106]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2') # loading gpt2 from transformers library
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # loading gpt2 tokenizer from transformers library
print(gpt2)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [107]:
input_text = "A long time ago in a galaxy far far away ..."
input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt') # tokenize input
output = gpt2.generate(input_ids, max_length=100) # run inference
generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True) # decode output tokens
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A long time ago in a galaxy far far away...

The first human-made planet was discovered in the early 1960s by a team of astronomers from the University of California, Berkeley.

The discovery of the first human-made planet was made by a team of astronomers from the University of California, Berkeley.

The first human-made planet was discovered in the early 1960s by a team of astronomers from the University of California, Berkeley.

The first human-made


# Inference using Numpy

In [108]:
import numpy as np

def torch_to_numpy(tensor):
    # nessessarry because numpy will be run on CPU. More complicated otherwise.
    if tensor.is_cuda:
        tensor = tensor.cpu()
    numpy_array = tensor.numpy()
    return numpy_array.copy()

def softmax(vec):
    max_val = np.max(vec)
    exp = np.exp(vec - max_val)
    sum_exp = np.sum(exp)
    return exp/sum_exp

In [109]:
state_dict = gpt2.state_dict()
for name, param in state_dict.items():
    ans = torch_to_numpy(param)
    if 'h.' in name: # each h.# refers to a transformer blocks
        print(f'{name}: {ans.shape}')

for i in range(12):
    counter = 0
    for name, param in state_dict.items():
        ans = torch_to_numpy(param)
        if 'h.'+ str(i)+ '.' in name: # each h.# refers to a transformer block
            # print(f'{name}: {ans.shape}')
            counter +=1
    print(f'h.{i}: {counter}')

transformer.h.0.ln_1.weight: (768,)
transformer.h.0.ln_1.bias: (768,)
transformer.h.0.attn.c_attn.weight: (768, 2304)
transformer.h.0.attn.c_attn.bias: (2304,)
transformer.h.0.attn.c_proj.weight: (768, 768)
transformer.h.0.attn.c_proj.bias: (768,)
transformer.h.0.ln_2.weight: (768,)
transformer.h.0.ln_2.bias: (768,)
transformer.h.0.mlp.c_fc.weight: (768, 3072)
transformer.h.0.mlp.c_fc.bias: (3072,)
transformer.h.0.mlp.c_proj.weight: (3072, 768)
transformer.h.0.mlp.c_proj.bias: (768,)
transformer.h.1.ln_1.weight: (768,)
transformer.h.1.ln_1.bias: (768,)
transformer.h.1.attn.c_attn.weight: (768, 2304)
transformer.h.1.attn.c_attn.bias: (2304,)
transformer.h.1.attn.c_proj.weight: (768, 768)
transformer.h.1.attn.c_proj.bias: (768,)
transformer.h.1.ln_2.weight: (768,)
transformer.h.1.ln_2.bias: (768,)
transformer.h.1.mlp.c_fc.weight: (768, 3072)
transformer.h.1.mlp.c_fc.bias: (3072,)
transformer.h.1.mlp.c_proj.weight: (3072, 768)
transformer.h.1.mlp.c_proj.bias: (768,)
transformer.h.2.ln_1.w

In [140]:
def decode_block(emb, parameters, head_num):
    '''
    emb (): Something...
    paramaters(dict): dictionary maping names to tensors
    '''

    '''
    transformer.h.0.ln_1.weight: (768,)
    transformer.h.0.ln_1.bias: (768,)

    transformer.h.0.attn.c_attn.weight: (768, 2304)
    transformer.h.0.attn.c_attn.bias: (2304,)
    transformer.h.0.attn.c_proj.weight: (768, 768)
    transformer.h.0.attn.c_proj.bias: (768,)

    transformer.h.0.ln_2.weight: (768,)
    transformer.h.0.ln_2.bias: (768,)

    transformer.h.0.mlp.c_fc.weight: (768, 3072)
    transformer.h.0.mlp.c_fc.bias: (3072,)
    transformer.h.0.mlp.c_proj.weight: (3072, 768)
    transformer.h.0.mlp.c_proj.bias: (768,)
    '''
    # ln_1 normalization
    weights = parameters['transformer.h.'+ str(head_num) + '.ln_1.weight'].numpy()
    bias = parameters['transformer.h.'+ str(head_num) + '.ln_1.bias'].numpy()
    emb_norm1 =  (emb * weights) + bias # (768,)


    # attn (applied to every token embeding vector)
    # query, key, vector
    q_weights = parameters['transformer.h.'+ str(head_num) + '.attn.c_attn.weight'][:, :768].numpy()
    k_weights = parameters['transformer.h.'+ str(head_num) + '.attn.c_attn.weight'][:, 768:1536].numpy()
    v_weights = parameters['transformer.h.'+ str(head_num) + '.attn.c_attn.weight'][:, 1536:].numpy()

    q_bias = parameters['transformer.h.'+ str(head_num) + '.attn.c_attn.bias'][:768].numpy()
    k_bias = parameters['transformer.h.'+ str(head_num) + '.attn.c_attn.bias'][768:1536].numpy()
    v_bias = parameters['transformer.h.'+ str(head_num) + '.attn.c_attn.bias'][1536:].numpy()

    context_matrix = np.zeros_like(emb_norm1)
    for i, tok_embed_vector in enumerate(emb_norm1): # each loop is a head
        # qkv vectors (786,)
        query = (tok_embed_vector @ q_weights.T) + q_bias
        key = (tok_embed_vector @ k_weights.T) + k_bias
        value = (tok_embed_vector @ v_weights.T) + v_bias

        attn_score = query.reshape(query.shape[0], 1) @ key.reshape(1, key.shape[0]) # matrix
        attn_score = attn_score / (key.shape[0])**(1/2)
        attn_prob = np.zeros_like(attn_score) #(786, 786)
        for j, row in enumerate(attn_score):
            attn_prob[j] = softmax(row)

        context_vec =  attn_prob @ value

        weights = parameters['transformer.h.'+ str(head_num) + '.attn.c_proj.weight'].numpy()
        bias = parameters['transformer.h.'+ str(head_num) + '.attn.c_proj.bias'].numpy()
        context_vec_scaled = (context_vec @ weights) + bias # (768,)
        context_matrix[i] = context_vec_scaled

    print(context_matrix.shape)






    # # ln_2 normalization
    # weights = parameters['transformer.h.'+ str(head_num) + '.ln_2.weight']
    # bias = parameters['transformer.h.'+ str(head_num) + '.ln_2.bias']
    # emb_norm2 =  (_ * weights) + bias

    # # mlp
    # weights = parameters['transformer.h.'+ str(head_num) + '.ln_2.weight']
    # bias = parameters['transformer.h.'+ str(head_num) + '.ln_2.bias']
    # emb_mlp_l1 = (emb_norm2 @ weights.T) + bias

    # weights = parameters['transformer.h.'+ str(head_num) + '.mlp.c_proj.weight']
    # bias = parameters['transformer.h.'+ str(head_num) + '.mlp.c_proj.bias']
    # emb_mlp_l2 = (emb_mlp_l1 @ weights.T) + bias
    # return(emb_mlp_l2)


In [141]:
def main(prompt, parameters):
    '''
    prompt (str):
    parameters(dict): dictionary maping names to tensors
    '''
    tok = gpt2_tokenizer.encode(prompt, return_tensors='np')
    tok = tok.squeeze()

     # word token embeddings
    tok_emb = parameters['transformer.wte.weight'][tok,:].numpy()

    # word position embeddings
    sequence_length = tok.shape[0]
    position_ids = np.arange(sequence_length) #indicies
    position_emb = parameters['transformer.wpe.weight'][position_ids,:].numpy()
    emb = tok_emb + position_emb

    head_result = emb
    for head_num in range(12):
        head_result = decode_block(head_result, parameters, head_num)

    # decoded = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)


In [142]:
main('so its jsut a tensor. More you know', state_dict)

(11, 768)


TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'