# Introducing the self-attention mechanism

In [1]:
import torch
# Seed the global RNG so the randomly initialised embedding table is reproducible.
torch.manual_seed(123)

# Toy input: a "sentence" given as eight integer token ids.
sentence = torch.tensor([0, 7, 1, 2, 5, 6, 4, 3])

# Embedding table with 10 rows of 16 dimensions each.
embed = torch.nn.Embedding(num_embeddings=10, embedding_dim=16)

# One 16-dimensional vector per token; detach() drops the autograd history
# so the result is plain data for the cells below.
embeded_sentence = embed(sentence).detach()

print(embeded_sentence.shape)
embeded_sentence

torch.Size([8, 16])


tensor([[ 3.3737e-01, -1.7778e-01, -3.0353e-01, -5.8801e-01,  3.4861e-01,
          6.6034e-01, -2.1964e-01, -3.7917e-01,  7.6711e-01, -1.1925e+00,
          6.9835e-01, -1.4097e+00,  1.7938e-01,  1.8951e+00,  4.9545e-01,
          2.6920e-01],
        [-9.4053e-01, -4.6806e-01,  1.0322e+00, -2.8300e-01,  4.9275e-01,
         -1.4078e-02, -2.7466e-01, -7.6409e-01,  1.3966e+00, -9.9491e-01,
         -1.5822e-03,  1.2471e+00, -7.7105e-02,  1.2774e+00, -1.4596e+00,
         -2.1595e+00],
        [-7.7020e-02, -1.0205e+00, -1.6896e-01,  9.1776e-01,  1.5810e+00,
          1.3010e+00,  1.2753e+00, -2.0095e-01,  4.9647e-01, -1.5723e+00,
          9.6657e-01, -1.1481e+00, -1.1589e+00,  3.2547e-01, -6.3151e-01,
         -2.8400e+00],
        [-1.3250e+00,  1.7843e-01, -2.1338e+00,  1.0524e+00, -3.8848e-01,
         -9.3435e-01, -4.9914e-01, -1.0867e+00,  8.8054e-01,  1.5542e+00,
          6.2662e-01, -1.7549e-01,  9.8284e-02, -9.3507e-02,  2.6621e-01,
         -5.8504e-01],
        [ 2.5529e-01

In [2]:
## similarity-based (unnormalized) weights: dot product between every pair of
## token embeddings, giving a square (tokens x tokens) matrix
omega_mat = torch.matmul(embeded_sentence, embeded_sentence.T)
omega_mat

tensor([[ 9.7601,  1.7326,  4.7543, -1.3587,  0.4752, -1.6717,  1.0227, -0.1286],
        [ 1.7326, 16.0787,  9.0642, -0.3370,  1.1368,  1.1972,  1.6485, -1.2789],
        [ 4.7543,  9.0642, 22.6615, -0.8519,  7.7799,  2.7483, -0.6832,  1.6236],
        [-1.3587, -0.3370, -0.8519, 13.9473, -1.4198, 10.9659, -0.5887,  2.3869],
        [ 0.4752,  1.1368,  7.7799, -1.4198, 13.7511, -6.8568, -2.5114, -3.3468],
        [-1.6717,  1.1972,  2.7483, 10.9659, -6.8568, 24.6738, -3.8294,  4.9581],
        [ 1.0227,  1.6485, -0.6832, -0.5887, -2.5114, -3.8294, 15.8691,  2.0269],
        [-0.1286, -1.2789,  1.6236,  2.3869, -3.3468,  4.9581,  2.0269, 18.7382]])

In [3]:
## compute the attention weights using PyTorch’s softmax function
import torch.nn.functional as F 
# Normalise each row of the similarity matrix (last axis of the 2-D matrix)
# so that the weights of every query token sum to one.
attention_weights = F.softmax(input=omega_mat, dim=-1)
attention_weights

tensor([[9.9270e-01, 3.2398e-04, 6.6502e-03, 1.4723e-05, 9.2135e-05, 1.0766e-05,
         1.5929e-04, 5.0374e-05],
        [5.8773e-07, 9.9910e-01, 8.9788e-04, 7.4187e-08, 3.2391e-07, 3.4407e-07,
         5.4033e-07, 2.8926e-08],
        [1.6712e-08, 1.2438e-06, 1.0000e+00, 6.1412e-11, 3.4437e-07, 2.2482e-09,
         7.2703e-11, 7.3008e-10],
        [2.1438e-07, 5.9550e-07, 3.5585e-07, 9.5172e-01, 2.0167e-07, 4.8272e-02,
         4.6299e-07, 9.0760e-06],
        [1.7110e-06, 3.3158e-06, 2.5448e-03, 2.5719e-07, 9.9745e-01, 1.1195e-09,
         8.6338e-08, 3.7443e-08],
        [3.6165e-12, 6.3713e-11, 3.0052e-10, 1.1136e-06, 2.0250e-14, 1.0000e+00,
         4.1804e-13, 2.7390e-09],
        [3.5667e-07, 6.6694e-07, 6.4779e-08, 7.1194e-08, 1.0410e-08, 2.7865e-09,
         1.0000e+00, 9.7366e-07],
        [6.4013e-09, 2.0263e-09, 3.6918e-08, 7.9205e-08, 2.5622e-10, 1.0361e-06,
         5.5258e-08, 1.0000e+00]])

In [4]:
# Sanity check: the first row, and then every row, should sum to one.
print(attention_weights[0, :].sum())
print(torch.sum(attention_weights, dim=1))

tensor(1.0000)
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [5]:
## computing the context vectors, z(i), as the attention-weighted sum of the inputs
# let's assume we are computing the context vector for the second input word, that is, z(2)
x_2 = embeded_sentence[1, :]
context_vector_2 = torch.zeros(x_2.shape)

# Loop over however many tokens the sentence has (rather than a hard-coded 8)
# so this cell keeps working if the input sentence above is changed.
for j in range(embeded_sentence.shape[0]):
    x_j = embeded_sentence[j, :]
    # Row 1 of the attention matrix holds the weights of the second token.
    context_vector_2 += attention_weights[1, j] * x_j
context_vector_2

tensor([-9.3975e-01, -4.6856e-01,  1.0311e+00, -2.8192e-01,  4.9373e-01,
        -1.2896e-02, -2.7327e-01, -7.6358e-01,  1.3958e+00, -9.9543e-01,
        -7.1287e-04,  1.2449e+00, -7.8077e-02,  1.2765e+00, -1.4589e+00,
        -2.1601e+00])

In [6]:
## the same weighted sums for all tokens at once: one matrix multiplication
## of the attention weights with the embedded inputs
context_vectors = attention_weights.matmul(embeded_sentence)
context_vectors

tensor([[ 3.3420e-01, -1.8324e-01, -3.0218e-01, -5.7772e-01,  3.5662e-01,
          6.6452e-01, -2.0998e-01, -3.7798e-01,  7.6537e-01, -1.1946e+00,
          6.9960e-01, -1.4067e+00,  1.7021e-01,  1.8838e+00,  4.8729e-01,
          2.4730e-01],
        [-9.3975e-01, -4.6856e-01,  1.0311e+00, -2.8192e-01,  4.9373e-01,
         -1.2896e-02, -2.7327e-01, -7.6358e-01,  1.3958e+00, -9.9543e-01,
         -7.1287e-04,  1.2449e+00, -7.8077e-02,  1.2765e+00, -1.4589e+00,
         -2.1601e+00],
        [-7.7021e-02, -1.0205e+00, -1.6895e-01,  9.1776e-01,  1.5810e+00,
          1.3010e+00,  1.2753e+00, -2.0095e-01,  4.9647e-01, -1.5723e+00,
          9.6657e-01, -1.1481e+00, -1.1589e+00,  3.2547e-01, -6.3151e-01,
         -2.8400e+00],
        [-1.3679e+00,  1.0614e-01, -2.1317e+00,  1.0480e+00, -3.7127e-01,
         -9.1234e-01, -4.3802e-01, -1.0329e+00,  9.3425e-01,  1.5453e+00,
          5.7218e-01, -1.8049e-01, -6.0454e-03, -8.8691e-02,  2.0559e-01,
         -5.2292e-01],
        [ 2.5444e-01

# Parameterizing the self-attention mechanism: scaled dot-product attention

In [7]:
# Re-seed so the three projection matrices below are reproducible.
torch.manual_seed(123)

# Embedding size, read from the second axis of the embedded sentence.
d = embeded_sentence.size(1)
print(d)

# Square (d x d) projection matrices, drawn in the order query, key, value.
U_query, U_key, U_value = (torch.rand(d, d) for _ in range(3))

16


In [8]:
## Project the second input element with the query matrix to obtain its query vector.
x_2 = embeded_sentence[1]
query_2 = torch.matmul(U_query, x_2)

In [9]:
## key and value vectors of the same (second) input element
key_2 = torch.matmul(U_key, x_2)
value_2 = torch.matmul(U_value, x_2)

In [10]:
## keys and values for every input element: project all embeddings at once and
## transpose back so that row i belongs to token i
keys = torch.matmul(U_key, embeded_sentence.T).T
values = torch.matmul(U_value, embeded_sentence.T).T

In [11]:
## unnormalized attention weight omega(i, j): the dot product between the query
## of element i (here the second) and the key of element j (here the third)
omega_23 = torch.matmul(query_2, keys[2])
omega_23

tensor(14.3667)

In [12]:
## the same dot product against all keys at once
omega_2 = torch.matmul(query_2, keys.T)

In [13]:
## display the unnormalized weights of the second query against every key
omega_2

tensor([-25.1623,   9.3602,  14.3667,  32.1482,  53.8976,  46.6626,  -1.2131,
        -32.9392])

In [14]:
## Scale the raw weights of the second input by sqrt(d), then softmax-normalise
## them over the only axis of the 1-D weight vector
attention_weights_2 = F.softmax(omega_2 / d**0.5, dim=-1)
print(attention_weights_2)
print(torch.sum(attention_weights_2))

tensor([2.2317e-09, 1.2499e-05, 4.3696e-05, 3.7242e-03, 8.5596e-01, 1.4026e-01,
        8.8897e-07, 3.1935e-10])
tensor(1.)


In [15]:
## Context vector of the second input: attention-weighted average of the value vectors
context_vector_2 = torch.matmul(attention_weights_2, values)
context_vector_2

tensor([-1.2226, -3.4387, -4.3928, -5.2125, -1.1249, -3.3041, -1.4316, -3.2765,
        -2.5114, -2.6105, -1.5793, -2.8433, -2.4142, -0.3998, -1.9917, -3.3499])

# Using GPT-2 to generate new text

In [16]:
from transformers import pipeline, set_seed

# Text-generation pipeline backed by the pre-trained 'gpt2' checkpoint.
generator = pipeline(task='text-generation', model='gpt2')



In [17]:
## Prompt the model with a text snippet and let it continue the text.
## The seed is fixed first so the sampled continuations are repeatable.
set_seed(123)
generator(
    "Hey readers, today is",
    max_length=20,
    num_return_sequences=3,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hey readers, today is not the last time we'll be seeing one of our favorite indie rock bands"},
 {'generated_text': 'Hey readers, today is Christmas. This is not Christmas, because Christmas is so long and I hope'},
 {'generated_text': "Hey readers, today is CTA Day!\n\nWe're proud to be hosting a special event"}]

In [18]:
## we can use a transformer model to generate features for training other models.
from transformers import GPT2Tokenizer

# Tokenizer matching the 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode one sentence and request PyTorch tensors ('pt') as the output format.
text = "Let us encode this sentence"
encoded_input = tokenizer(
    text,
    return_tensors='pt',
)
encoded_input

{'input_ids': tensor([[ 5756,   514, 37773,   428,  6827]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [19]:
from transformers import GPT2Model
# Load the pre-trained GPT-2 model for use as a feature extractor.
model = GPT2Model.from_pretrained('gpt2')

# Forward pass on the encoded sentence; the result stores the last hidden state.
output = model(**encoded_input)

# Shape is [batch size, sentence length, size of feature encoding].
output.last_hidden_state.shape

torch.Size([1, 5, 768])

# Fine-tuning a BERT model in PyTorch

## Loading the IMDb movie review dataset
We will begin by loading the required packages and the dataset, and then split
the dataset into training, validation, and test sets.

In [20]:
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [21]:
# Run configuration: seed for PyTorch's RNG and the number of training epochs.
RANDOM_SEED = 123
NUM_EPOCHS = 3
torch.manual_seed(RANDOM_SEED)

# The reviews are read from a CSV file in the working directory.
df = pd.read_csv('movie_data.csv')
df.head(n=3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [22]:
# Positional split of the frame: rows [0, 35000) for training,
# [35000, 40000) for validation and everything from 40000 on for testing.
train_texts = df['review'].iloc[:35000].values
train_labels = df['sentiment'].iloc[:35000].values

valid_texts = df['review'].iloc[35000:40000].values
valid_labels = df['sentiment'].iloc[35000:40000].values

test_texts = df['review'].iloc[40000:].values
test_labels = df['sentiment'].iloc[40000:].values

In [23]:
# Tokenizer that matches the DistilBERT checkpoint fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Each split is converted from a NumPy array to a plain Python list and then
# tokenized with truncation and padding enabled.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)


In [24]:
class IMDBDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping whose ``items()`` yields ``(name, values)`` pairs,
            where ``values[idx]`` is the encoded field of example ``idx``.
        labels: Indexable, sized collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field, followed by the
        example's label under the key ``'labels'``.
        """
        item = {}
        for name, column in self.encodings.items():
            item[name] = torch.tensor(column[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item
        

In [25]:
# Wrap each tokenized split together with its labels.
train_dataset = IMDBDataset(train_encodings, train_labels)
valid_dataset = IMDBDataset(valid_encodings, valid_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)

# Batches of 16 examples; only the training data is shuffled.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=16, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=16, shuffle=False
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=16, shuffle=False
)


## Loading and fine-tuning a pre-trained BERT model

In [26]:
# Pre-trained DistilBERT with a sequence-classification head, put into training mode.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
)
model.train()

# Adam over all model parameters with a learning rate of 5e-5.
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [28]:
def compute_accuracy(model, dataloader, device=None):
    """Return the classification accuracy of ``model`` on ``dataloader`` in percent.

    The forward passes run under ``torch.no_grad()``. The function does not
    switch the model between train and eval mode; that is left to the caller.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under the key ``'logits'``.
        dataloader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Optional device that every batch tensor is moved to before
            the forward pass. With the default ``None`` the tensors are used
            as they come out of the dataloader.

    Returns:
        A 0-dim float tensor holding the percentage of correct predictions.

    Raises:
        ValueError: If ``dataloader`` yields no examples, because accuracy
            is undefined for an empty dataset.
    """
    correct_preds, num_examples = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            # Predicted class = index of the largest logit along axis 1.
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_preds += (predicted_labels == labels).sum()

    if num_examples == 0:
        # Without this guard `correct_preds` is still the int 0 and the
        # `.float()` call below fails with an unhelpful AttributeError.
        raise ValueError('compute_accuracy() received an empty dataloader')

    return correct_preds.float() / num_examples * 100

In [30]:
# Fine-tune for a single epoch in this cell.
NUM_EPOCHS = 1

start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # Back to training mode; the model is switched to eval mode at the end
    # of every epoch for the accuracy computation.
    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare the data
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        ### Forward pass
        # `labels` is passed along so that the loss can be read from the outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']

        ### Backward pass
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        # Report the current loss on every 250th batch (including batch 0).
        if not batch_idx % 250:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}'
                  f' | Batch' f'{batch_idx:04d}/' f'{len(train_loader):04d} | '
                  f'Loss: {loss:.4f}')

    ### Per-epoch evaluation
    model.eval()
    with torch.set_grad_enabled(False):
        # ':.2f' replaces the former ': .f', which is not a valid format
        # spec and raised ValueError once the epoch had finished training.
        print(f'Training accuracy: {compute_accuracy(model, train_loader):.2f}%'
              f'\nValid accuracy: {compute_accuracy(model, valid_loader):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader):.2f}%')


Epoch: 0001/0001 | Batch0000/2188 | Loss: 0.6915


KeyboardInterrupt: 

# Fine-tuning a transformer more conveniently using the Trainer API

In [None]:
# Start again from the pre-trained checkpoint so the Trainer below does not
# continue from the weights updated by the manual training loop.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
)
model.train()

# Fresh Adam optimizer bound to the parameters of the newly loaded model.
optim = torch.optim.Adam(model.parameters(), lr=5e-5)


In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: one epoch, batches of 16 per device for both
# training and evaluation, logging every 10 steps.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# The optimizer created above is handed over explicitly; the second tuple
# entry (the learning-rate scheduler slot) is left as None.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(optim, None),
)