In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

model_ckpt = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
text = "time flies like an arrow"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
inputs.input_ids

tensor([[ 2435, 17607,   588,   281, 15452]])

In [3]:
from torch import nn
from transformers import AutoConfig
import torch.nn.functional as F
from math import sqrt

In [4]:
config = AutoConfig.from_pretrained(model_ckpt)
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb



Embedding(50257, 768)

In [5]:
inputs_embeds = token_emb(inputs.input_ids)
inputs_embeds.size()  ## batch size, seq_len, hidden_dim

torch.Size([1, 5, 768])

In [6]:
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights.bmm(value)

In [7]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [9]:
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)
attn_output.size()

torch.Size([1, 5, 768])

In [None]:
from transformers import AutoTokenizer, AutoModel
from bertviz import head_view

model = AutoModel.from_pretrained(model_ckpt, output_attentions=True)
viz_input = tokenizer(text, return_tensors='pt')
print(viz_input)
attention = model(**viz_input).attentions
starting_point = (viz_input.input_ids == 2435).sum(dim=1)
tokens = tokenizer.convert_ids_to_tokens(viz_input.input_ids[0])

head_view(attention, tokens, starting_point, heads=[8])


In [10]:
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [11]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, 4*config.hidden_size)
        self.linear_2 = nn.Linear(4*config.hidden_size, config.hidden_size)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        return x

In [12]:
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_output)
ff_outputs.size()

torch.Size([1, 5, 768])

In [13]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Apply layer normalization and then copy input into query, key, value
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.attention(hidden_state)
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

In [14]:
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.n_positions,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout()

    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(-1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        ## embeddings = self.dropout(embeddings) ## I added this!!!!
        return embeddings

In [15]:
embedding_layer = Embeddings(config)
embedding_layer(inputs.input_ids).size()

torch.Size([1, 5, 768])

In [16]:
class TransformerDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerDecoderLayer(config)
                                     for _ in range(config.num_hidden_layers)])

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

In [17]:
decoder = TransformerDecoder(config)
decoder(inputs.input_ids).size()

torch.Size([1, 5, 768])

In [18]:
class TransformerForTextGeneration(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.decoder = TransformerDecoder(config)
        self.dropout = nn.Dropout(config.embd_pdrop)
        self.ff_layer = nn.Linear(config.hidden_size, 50257)

    def forward(self, x):
        x = self.decoder(x)[:, 0, :]
        x = self.dropout(x)
        x = self.ff_layer(x)
        x = F.softmax(x, dim=-1)
        return x

In [19]:
decoder_model = TransformerForTextGeneration(config)
decoder_model_output = decoder_model(inputs.input_ids)

In [20]:
max_length = 50
maximum_value = torch.argmax(decoder_model_output)
decoder_model_output[0][maximum_value]

tensor(0.0005, grad_fn=<SelectBackward0>)

In [21]:
decoder_model_output.shape

torch.Size([1, 50257])

In [22]:
inputs.input_ids.shape

torch.Size([1, 5])

In [51]:
input_ids = inputs.input_ids
input_ids

tensor([[ 2435, 17607,   588,   281, 15452]])

In [55]:
while input_ids.size(1) < max_length:
    ## forward the model to get the logits
    decoder_model.eval()
    with torch.no_grad():    
        probs = decoder_model(input_ids) ## (B, T, vocab_size)
        ## We'll do the top k sampling here (HF's default 50 for pipeline)
        print(probs)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1, largest=True) ## If you want some fun change the 1 to some other number and you'll have different answers!!!
        ## Select a token from top-k probabilities
        print(topk_indices)
        print(topk_probs)
        ix = torch.multinomial(topk_probs, 5, generator=torch.cuda.manual_seed(42), replacement=False) ## (B, 1)
        print(ix)
        ## gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) ## (B, 1)
        print(xcol)
        ## append to the sequence to get the full generated sentences
        input_ids = torch.cat((input_ids, xcol), dim=1)
        print(input_ids)

# Print the predicted token IDs
print(input_ids)

## decode and print the generated text
tokens = input_ids[0, :max_length].detach().to('cpu').tolist()
decoded = tokenizer.decode(tokens)
print(">", decoded)
input_ids = torch.tensor( [[2435, 17607,   588,   281, 15452]], dtype=torch.int)

tensor([[1.0895e-05, 1.3274e-05, 4.6472e-05,  ..., 1.6482e-05, 5.4983e-06,
         1.2708e-05]])
tensor([[17966,  7786, 13970, 31486, 29115, 17378, 35100, 37895, 38251, 45586,
         22963, 46413, 28757, 43166, 35939,  2395,  3119,  8999, 16076, 38187,
         23770, 38510, 42279, 29938, 30260, 40784, 34430, 33626, 13540, 21460,
          8692, 14488, 20889, 37852, 43586, 18835, 32809, 43011, 35663,  9569,
         24882, 35761, 38345, 21860, 20301, 26021, 47513, 45762,  2907, 43751]])
tensor([[0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003,
         0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002]])
tensor([[35, 25, 27, 30, 18]])
tensor([[18835, 40

In [37]:
torch.tensor( [[2435, 17607,   588,   281, 15452]], dtype=torch.int)

tensor([[ 2435, 17607,   588,   281, 15452]], dtype=torch.int32)