In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

model_ckpt = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
text = "time flies like an arrow"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
inputs.input_ids

tensor([[ 2435, 17607,   588,   281, 15452]])

In [3]:
from torch import nn
from transformers import AutoConfig
import torch.nn.functional as F
from math import sqrt

In [4]:
config = AutoConfig.from_pretrained(model_ckpt)
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
token_emb



Embedding(50257, 768)

In [5]:
inputs_embeds = token_emb(inputs.input_ids)
inputs_embeds.size()  ## batch size, seq_len, hidden_dim

torch.Size([1, 5, 768])

In [6]:
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights.bmm(value)

In [7]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attn_outputs = scaled_dot_product_attention(
            self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attn_outputs

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [9]:
multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(inputs_embeds)
attn_output.size()

torch.Size([1, 5, 768])

In [10]:
from transformers import AutoTokenizer, AutoModel
from bertviz import head_view

model2 = AutoModel.from_pretrained(model_ckpt, output_attentions=True)
viz_input = tokenizer(text, return_tensors='pt')
print(viz_input)
attention = model2(**viz_input).attentions
starting_point = (viz_input.input_ids == 2435).sum(dim=1)
tokens2 = tokenizer.convert_ids_to_tokens(viz_input.input_ids[0])

head_view(attention, tokens2, starting_point, heads=[8])


{'input_ids': tensor([[ 2435, 17607,   588,   281, 15452]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


<IPython.core.display.Javascript object>

In [11]:
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [14]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, 4*config.hidden_size)
        self.linear_2 = nn.Linear(4*config.hidden_size, config.hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(config.embd_pdrop)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

In [15]:
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_output)
ff_outputs.size()

torch.Size([1, 5, 768])

In [24]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Apply layer normalization and then copy input into query, key, value
        hidden_state = self.layer_norm_1(x)
        # Apply attention with a skip connection
        x = x + self.attention(hidden_state)
        # Apply feed-forward layer with a skip connection
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

In [25]:
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size,
                                             config.hidden_size)
        self.position_embeddings = nn.Embedding(config.n_positions,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.embd_pdrop)

    def forward(self, input_ids):
        # Create position IDs for input sequence
        seq_length = input_ids.size(-1)
        position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
        # Create token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings) ## I added this!!!!
        return embeddings

In [26]:
embedding_layer = Embeddings(config)
embedding_layer(inputs.input_ids).size()

torch.Size([1, 5, 768])

In [27]:
class TransformerDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerDecoderLayer(config)
                                     for _ in range(config.num_hidden_layers)])

    def forward(self, x):
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x)
        return x

In [28]:
decoder = TransformerDecoder(config)
decoder(inputs.input_ids).size()

torch.Size([1, 5, 768])

In [41]:
class TransformerForTextGeneration(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.decoder = TransformerDecoder(config)
        self.dropout = nn.Dropout(config.embd_pdrop)
        self.ff_layer = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, x):
        x = self.decoder(x)[:, -1, :]
        x = self.dropout(x)
        x = self.ff_layer(x)
        x = F.softmax(x, dim=-1)
        return x

In [42]:
decoder_model = TransformerForTextGeneration(config)
decoder_model_output = decoder_model(inputs.input_ids)

In [43]:
config.task_specific_params

{'text-generation': {'do_sample': True, 'max_length': 50}}

In [44]:
max_length = config.task_specific_params["text-generation"]["max_length"]
maximum_value = torch.argmax(decoder_model_output)
decoder_model_output[0][maximum_value]

tensor(0.0005, grad_fn=<SelectBackward0>)

In [45]:
decoder_model_output.shape

torch.Size([1, 50257])

In [46]:
inputs.input_ids.shape

torch.Size([1, 5])

In [47]:
input_ids = inputs.input_ids
input_ids

tensor([[ 2435, 17607,   588,   281, 15452]])

In [48]:
while input_ids.size(1) < max_length:
    ## forward the model to get the logits
    decoder_model.eval()
    with torch.no_grad():    
        probs = decoder_model(input_ids) ## (B, T, vocab_size)
        ## We'll do the top k sampling here (HF's default 50 for pipeline)
        print(probs)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1, largest=True) ## If you want some fun change the 1 to some other number and you'll have different answers!!!
        ## Select a token from top-k probabilities
        print(topk_indices)
        print(topk_probs)
        ix = torch.multinomial(topk_probs, 5, generator=torch.cuda.manual_seed(42), replacement=False) ## (B, 1)
        print(ix)
        ## gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) ## (B, 1)
        print(xcol)
        ## append to the sequence to get the full generated sentences
        input_ids = torch.cat((input_ids, xcol), dim=1)
        print(input_ids)

# Print the predicted token IDs
print(input_ids)

## decode and print the generated text
tokens = input_ids[0, :max_length].detach().to('cpu').tolist()
decoded = tokenizer.decode(tokens)
print(">", decoded)
input_ids = torch.tensor( [[2435, 17607,   588,   281, 15452]], dtype=torch.int)

tensor([[3.0322e-05, 2.4812e-05, 1.0124e-05,  ..., 6.4245e-06, 4.7306e-06,
         3.6903e-05]])
tensor([[16460, 30878, 38508, 48373, 28155, 33314, 13855, 42582,  4986, 28315,
         29765, 15264, 44792, 40846, 33139,  6385, 14352, 16994, 27710, 17059,
         43087, 37457, 35707, 27017, 35905,  1909, 48139,  6091, 46888, 24147,
         33799, 38767, 46935, 18459, 46992, 22361, 13497, 50229, 29562,  8967,
         29575, 21152,  4093, 33993,  1434, 14508, 24156, 12144, 17771,  3027]])
tensor([[0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002,
         0.0002, 0.0002, 0.0002, 0.0002, 0.0002]])
tensor([[26, 43,  0,  5, 14]])
tensor([[48139, 33

In [49]:
torch.tensor( [[2435, 17607,   588,   281, 15452]], dtype=torch.int)

tensor([[ 2435, 17607,   588,   281, 15452]], dtype=torch.int32)