# Inference using packages

Differen gpt2 model options
- gpt2: This is the "small" version of GPT-2. It has 124 million parameters. 768 context size, 12 decode blocks
- gpt2-medium: This is the "medium" version of GPT-2. It has 355 million parameters.
- gpt2-large: This is the "large" version of GPT-2. It has 774 million parameters. 1280 context size 36 decode blocks
- gpt2-xl: This is the "extra large" version of GPT-2. It has 1.5 billion parameters.


In [306]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', output_attentions=True, activation_function = 'gelu') # loading gpt2 from transformers library
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # loading gpt2 tokenizer from transformers library
print(gpt2.config)

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_attentions": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.1",
  "use_cache": true,
  "vocab_size": 50257
}



In [307]:
# https://huggingface.co/docs/transformers/en/model_doc/gpt2
prompt = "my favorite music is"
input_ids = gpt2_tokenizer(prompt, return_tensors="pt").input_ids
# print(f'{type(input_ids)}{input_ids}, :{len(input_ids)}')
gen_tokens = gpt2.generate(input_ids, do_sample=True, temperature=0.9, max_length=100)
# print(f'{type(gen_tokens)} {gen_tokens}, :{len(gen_tokens[0])}')
gen_text = gpt2_tokenizer.batch_decode(gen_tokens)[0]

# gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token # set the padding token
# input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt') # tokenize input
# output = gpt2.generate(input_ids, max_length=max) # run inference
# generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True) # decode output tokens
print(gen_text)

# tok = gpt2_tokenizer.encode(tok, return_tensors='np', padding='max_length', truncation=True, max_length=max_token_len)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


my favorite music is usually found in the early days before the Beatles, which is a good one if it's from the early days prior to the Beatles. I got my first copy, and it was for my first music record. When I discovered the Beatles, I'd always played with someone else, but that was before I'd heard of them.

How did that make sense for you?

I read an interview with Peter Stradlin on the radio where he said there was an


# Inference using Numpy

In [308]:
import numpy as np
import heapq
import random
import torch.nn.functional as F
import torch
import copy

state_dict = gpt2.state_dict()
parameters = {}
for name, val in state_dict.items():
    parameters[name] = val.numpy().astype(np.float32)


for name, param in state_dict.items():
    ans = param.numpy()
    if 'h.' not in name: # each h.# refers to a transformer blocks
        print(f'{name}: {param.shape}')
        pass

for i in range(36):
    counter = 0
    for name, param in state_dict.items():
        ans = param.numpy()
        if 'h.'+ str(i)+ '.' in name and i == 0: # each h.# refers to a transformer block
            print(f'{name}: {ans.shape}')
            print(f'{name}: {param.shape}')
            counter +=1
    # print(f'h.{i}: {counter}')

transformer.wte.weight: torch.Size([50257, 768])
transformer.wpe.weight: torch.Size([1024, 768])
transformer.ln_f.weight: torch.Size([768])
transformer.ln_f.bias: torch.Size([768])
lm_head.weight: torch.Size([50257, 768])
transformer.h.0.ln_1.weight: (768,)
transformer.h.0.ln_1.weight: torch.Size([768])
transformer.h.0.ln_1.bias: (768,)
transformer.h.0.ln_1.bias: torch.Size([768])
transformer.h.0.attn.c_attn.weight: (768, 2304)
transformer.h.0.attn.c_attn.weight: torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias: (2304,)
transformer.h.0.attn.c_attn.bias: torch.Size([2304])
transformer.h.0.attn.c_proj.weight: (768, 768)
transformer.h.0.attn.c_proj.weight: torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias: (768,)
transformer.h.0.attn.c_proj.bias: torch.Size([768])
transformer.h.0.ln_2.weight: (768,)
transformer.h.0.ln_2.weight: torch.Size([768])
transformer.h.0.ln_2.bias: (768,)
transformer.h.0.ln_2.bias: torch.Size([768])
transformer.h.0.mlp.c_fc.weight: (768, 3072)
transfo

In [309]:
parameters = parameters

def torch_to_numpy(tensor): # not nessessarry?
    if tensor.is_cuda:
        tensor = tensor.cpu()
    numpy_array = tensor.numpy()
    return numpy_array.copy()

def softmax(vec, temperature = None): # the, and,

    # vec = torch.tensor(vec, dtype=torch.float32)
    # if temperature:
    #     vec /= temperature
    # return F.softmax(vec, dim=-1).numpy()

    max_val = np.max(vec)
    if temperature:
        exp = np.exp((vec - max_val)/ temperature)
    else:
         exp = np.exp(vec - max_val)

    sum_exp = np.sum(exp)
    norm_vec = exp/sum_exp
    assert 0.975 < np.sum(norm_vec) < 1.025
    return norm_vec

def log_softmax(vec, epsilon=1e-05): # puncuation
    max_val = np.max(vec)
    exp = np.exp(vec - max_val)
    log_sum_exp = max_val + np.log(np.sum(exp))
    return vec - log_sum_exp

def repo_softmax(vec): # error
    x = vec - np.argmin(vec)
    ex = np.exp(x)
    return ex/ np.sum(ex)

# activation functions
def gelu(x):
    # a = torch.nn.functional.gelu(torch.tensor(x)).numpy()
    # assert a.shape == x.shape
    # return a
    # from https://github.com/openai/gpt-2.git
    # x(np_array) Gausien error liner unit
    # return F.gelu(torch.tensor(x,dtype=torch.float32)).numpy()
    return 0.5*x*(1+np.tanh(np.sqrt(2/np.pi)*(x+0.044715*np.power(x, 3))))

def ReLU(x):
    # x(np_array) clip negitive activation
    return np.maximum(0, x)

# Transformer functions
def embed(tok):
    '''
    creat embeding matrix (token, token embeding vector 768)
    tok(np_array): 1d array of toek encodings
    paramaters(dict): dictionary maping names to tensors
    '''
    # sequence_length = tok.shape[0]
    # position_ids = torch.tensor(np.arange(sequence_length)) #indicies
    # tok = torch.tensor(tok)
    # return (gpt2.transformer.wte(tok) + gpt2.transformer.wpe(position_ids)).detach().numpy()


    # word token embeddings
    tok_emb = parameters['transformer.wte.weight'][tok,:]

    # word position embeddings
    sequence_length = tok.shape[0]
    position_ids = np.arange(sequence_length) #indicies
    position_emb = parameters['transformer.wpe.weight'][position_ids,:]
    assert tok_emb.shape == position_emb.shape
    return tok_emb + position_emb

def li_norm(x, gamma, beta, epsilon=1e-5):
    '''
    layer batch normalization
    x(np_array): array to normalize
    gamma(np_array): scailing paramater vector
    beta(np_array): offset paramater vector
    epsilon(float): div_0_error prevention
    '''
    # x = torch.tensor(x, dtype=torch.float32)
    # gamma = torch.tensor(gamma, dtype=torch.float32)
    # beta = torch.tensor(beta, dtype=torch.float32)
    # return (F.layer_norm(x, (x.size(-1),), gamma, beta, eps=epsilon)).numpy()

    u = np.mean(x, axis=-1, keepdims=True)
    # s = np.var(x, axis=-1, keepdims=True)
    s = np.mean(np.square(x-u))
    x = (x - u) / np.sqrt(s + epsilon)
    return x*gamma + beta

def get_head_weights(head_tot, weights, bias):
    '''
    head_tot(int)
    weights(np.matrix) (tok)
    bias(np.vec)
    return ittorator (tup): head_w, head_b
    '''
    head_width = int(weights.shape[1] / head_tot)
    start = 0
    end = head_width
    for _ in range(head_tot):
        yield (weights[:, start:end], bias[start:end])
        start += head_width
        end += head_width

def split_head(head_param):
    '''
    head_param (tup): head_w, head_b
    '''
    weights, bias = head_param
    width = int(head_param[1].shape[0] / 3)

    start = 0
    end = width
    for _ in range(3):
        yield (weights[:, start:end], bias[start:end])
        start += width
        end += width

def self_attn(emb, block_num, attn_heads = 12):
    '''
    attention block. 12 heads per block
    emb(np_matrix): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''

    # projection
    weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_proj.bias']
    emb = (emb @ weights) + bias

    attn_weights = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.weight']
    attn_bias = parameters['transformer.h.'+ str(block_num) + '.attn.c_attn.bias']

    context_matrix = np.empty((emb.shape[0],0))
    for head_num, head_param in enumerate(get_head_weights(attn_heads, attn_weights, attn_bias)):
        QKV_gen = split_head(head_param)
        query_w, query_b = next(QKV_gen)
        key_w, key_b = next(QKV_gen)
        value_w, value_b = next(QKV_gen)

        Q = np.apply_along_axis(lambda x: x + query_b, axis = 1, arr=emb @ query_w)
        K = np.apply_along_axis(lambda x: x + key_b, axis = 1, arr=emb @ key_w)
        V = np.apply_along_axis(lambda x: x + value_b, axis = 1, arr=emb @ value_w)

        print(f'{Q.shape} @ {K.shape}')
        attn_score = Q @ K.T

        # future_mask
        future_mask = np.full(attn_score.shape, 0.0)
        future_mask[np.triu_indices_from(future_mask, k=1)] = float('-inf')

        attn_score_mask = attn_score + future_mask

        attn_score_norm = np.apply_along_axis(lambda x: softmax(x, temperature = 1.2), axis=1, arr=attn_score_mask) # (1024, 1024)

        #liner layer?

        context_matrix = np.hstack([context_matrix, attn_score_norm @ V])

    assert emb.shape == context_matrix.shape
    return context_matrix

def mlp(emb, block_num):
    '''
    2 layer multi layer perceptron with gelu activation
    emb(np_matrix): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''
    weights = parameters['transformer.h.'+ str(block_num) + '.mlp.c_fc.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.mlp.c_fc.bias']
    embl1 = (emb @ weights) + bias

    embl1 = gelu(embl1)

    weights = parameters['transformer.h.'+ str(block_num) + '.mlp.c_proj.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.mlp.c_proj.bias']
    return (embl1 @ weights) + bias

def top_k(k, vec):
    largest = heapq.nlargest(k, range(len(vec)), vec.take)
    # print(gpt2_tokenizer.decode(largest, skip_special_tokens=True)) # see words its picking from.
    probs = [vec[i] for i in largest]
    probs = probs / np.sum(probs) # normalize after the selection
    print(np.max(probs))
    return random.choices(largest, weights=probs, k=1)[0]


In [310]:
parameters = parameters

def decode_block(emb, block_num):
    '''
    runs decode block with ln_1 -> attn -> ln_2 -> mlp
    emb (np_array): (tokens, Embedding Size 768)
    paramaters(dict): dictionary maping names to tensors
    block_num: current head
    '''

    original_emb = copy.deepcopy(emb)

    # ln_1 normalization
    weights = parameters['transformer.h.'+ str(block_num) + '.ln_1.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.ln_1.bias']
    emb_norm1 = li_norm(emb, weights, bias)


    context_matrix = self_attn(emb_norm1, block_num)
    assert np.array_equal(emb, original_emb)
    context_matrix += emb # Residual Connection

    # ln_2 normalization
    weights = parameters['transformer.h.'+ str(block_num) + '.ln_2.weight']
    bias = parameters['transformer.h.'+ str(block_num) + '.ln_2.bias']
    emb_norm2 = li_norm(context_matrix, weights, bias, epsilon=1e-5)

    emb_mlp = mlp(emb_norm2, block_num)
    emb_mlp += emb_norm2    # Residual Connection
    return emb_mlp


In [313]:
def next_token(tok, transformer_blocks = 12):
    '''
    Generates the next token in sequence
    tok (np_array): 1D token encodigns
    parameters(dict): dictionary maping names to tensors
    '''
    emb = embed(tok) #(tokens, Embedding Size 768)

    block_result = copy.deepcopy(emb)
    for block in range(transformer_blocks):
        block_result = decode_block(block_result, block) # (tokens, Embedding Size 768)

    # ln_f
    # weights = parameters['transformer.ln_f.weight']
    # bias = parameters['transformer.ln_f.bias']
    weights = parameters['transformer.ln_f.weight']
    bias = parameters['transformer.ln_f.bias']
    head_norm = li_norm(block_result, weights, bias)

    # lm_head
    weights = parameters['lm_head.weight'] # (50257, 768)
    logit_matrix = head_norm @ weights.T

    # apply softmax to last words logit
    last_logit_distrabution = softmax(logit_matrix[-1], temperature = 1.2)
    return top_k(40, last_logit_distrabution)


In [314]:
def main(prompt, max_token_len = 1024, num_generate = 5):
    '''
    creates generation feedback loop
    prompt(srt)
    start_dict(dict): name: paramaters
    '''
    for name, val in state_dict.items():
        parameters[name] = val.numpy().astype(np.float32)

    print(parameters.keys())

    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    tok = gpt2_tokenizer.encode(prompt, return_tensors='np', padding='max_length', truncation=True, max_length=max_token_len).squeeze()
    print(f'{type(tok)}{tok}, :{len(tok)}')

    prompt_tok_index = np.where(tok == gpt2_tokenizer.eos_token_id)[0][0]
    print(prompt_tok_index)
    for _ in range(num_generate):
        print(tok[:10])
        new_tok = next_token(tok)
        tok[prompt_tok_index] = new_tok
        prompt_tok_index += 1


    tok = gpt2_tokenizer.decode(tok, skip_special_tokens=True)
    return tok

print(main('once upon a time', num_generate = 20))

dict_keys(['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_fc.bias', 'transformer.h.1.mlp.c_proj.weight', 'transformer.h.1.mlp.c_proj.bias', 'transformer.h.2.ln_1.weight', 'transformer.h.2.ln_1.bias', 'transformer.h.2.attn.c_attn.weight', 'transformer.h.2.attn.c

KeyboardInterrupt: 