<a href="https://colab.research.google.com/github/sabre-code/machine-learning-notes/blob/master/16_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Transformers

In [None]:
import torch


# input sequence / sentence:
#  "Can you help me to translate this sentence"

sentence = torch.tensor(
    [0, # can
     7, # you
     1, # help
     2, # me
     5, # to
     6, # translate
     4, # this
     3] # sentence
)

sentence

tensor([0, 7, 1, 2, 5, 6, 4, 3])

In [None]:
torch.manual_seed(123)
embed = torch.nn.Embedding(10, 16)
embedded_sentence = embed(sentence).detach()
embedded_sentence.shape

torch.Size([8, 16])

In [None]:
omega = torch.empty(8, 8)

for i, x_i in enumerate(embedded_sentence):
    for j, x_j in enumerate(embedded_sentence):
        omega[i, j] = torch.dot(x_i, x_j)

In [None]:
omega_mat = embedded_sentence.matmul(embedded_sentence.T)

In [None]:
torch.allclose(omega_mat, omega)

True

In [None]:
import torch.nn.functional as F

attention_weights = F.softmax(omega, dim=1)
attention_weights.shape

torch.Size([8, 8])

In [None]:
attention_weights.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [None]:
x_2 = embedded_sentence[1, :]
context_vec_2 = torch.zeros(x_2.shape)
for j in range(8):
    x_j = embedded_sentence[j, :]
    context_vec_2 += attention_weights[1, j] * x_j

context_vec_2

tensor([-9.3975e-01, -4.6856e-01,  1.0311e+00, -2.8192e-01,  4.9373e-01,
        -1.2896e-02, -2.7327e-01, -7.6358e-01,  1.3958e+00, -9.9543e-01,
        -7.1287e-04,  1.2449e+00, -7.8077e-02,  1.2765e+00, -1.4589e+00,
        -2.1601e+00])

In [None]:
context_vectors = torch.matmul(
        attention_weights, embedded_sentence)


torch.allclose(context_vec_2, context_vectors[1])

True

In [None]:
torch.manual_seed(123)

d = embedded_sentence.shape[1]
U_query = torch.rand(d, d)
U_key = torch.rand(d, d)
U_value = torch.rand(d, d)


In [None]:
x_2 = embedded_sentence[1]
query_2 = U_query.matmul(x_2)


In [None]:
key_2 = U_key.matmul(x_2)
value_2 = U_value.matmul(x_2)


In [None]:

keys = U_key.matmul(embedded_sentence.T).T
torch.allclose(key_2, keys[1])

True

In [None]:
print(torch.tensor([[2, 3, 5],[4, 5, 6]]).T)

tensor([[2, 4],
        [3, 5],
        [5, 6]])


In [None]:
values = U_value.matmul(embedded_sentence.T).T
torch.allclose(value_2, values[1])

True

In [None]:
omega_23 = query_2.dot(keys[2])
omega_23

tensor(14.3667)

In [None]:
omega_2 = query_2.matmul(keys.T)
omega_2

tensor([-25.1623,   9.3602,  14.3667,  32.1482,  53.8976,  46.6626,  -1.2131,
        -32.9391])

In [None]:
query_2

tensor([-1.2403, -2.9754, -0.2894, -0.4004, -2.9577, -0.2939, -0.2266, -3.6482,
        -2.6450, -0.9536, -1.1116,  1.1717, -2.2671, -0.7874, -2.0140, -1.6652])

In [None]:
attention_weights_2 = F.softmax(omega_2 / d**0.5, dim=0)
attention_weights_2

tensor([2.2317e-09, 1.2499e-05, 4.3696e-05, 3.7242e-03, 8.5596e-01, 1.4026e-01,
        8.8897e-07, 3.1936e-10])

In [None]:
context_vector_2 = attention_weights_2.matmul(values)
context_vector_2

tensor([-1.2226, -3.4387, -4.3928, -5.2125, -1.1249, -3.3041, -1.4316, -3.2765,
        -2.5114, -2.6105, -1.5793, -2.8433, -2.4142, -0.3998, -1.9917, -3.3499])

In [None]:
torch.manual_seed(123)

d = embedded_sentence.shape[1]
one_U_query = torch.rand(d, d)

In [None]:
h = 8
multihead_U_query = torch.rand(h, d, d)
multihead_U_key = torch.rand(h, d, d)
multihead_U_value = torch.rand(h, d, d)

In [None]:
multihead_query_2 = multihead_U_query.matmul(x_2)
multihead_query_2.shape

torch.Size([8, 16])

In [None]:
multihead_key_2 = multihead_U_key.matmul(x_2)
multihead_value_2 = multihead_U_value.matmul(x_2)

In [None]:
multihead_key_2[2]

tensor([-1.9619, -0.7701, -0.7280, -1.6840, -1.0801, -1.6778,  0.6763,  0.6547,
         1.4445, -2.7016, -1.1364, -1.1204, -2.4430, -0.5982, -0.8292, -1.4401])

In [None]:
stacked_inputs = embedded_sentence.T.repeat(8, 1, 1)
stacked_inputs.shape

torch.Size([8, 16, 8])

In [None]:
multihead_keys = torch.bmm(multihead_U_key, stacked_inputs)
multihead_keys.shape

torch.Size([8, 16, 8])

In [None]:
multihead_keys = multihead_keys.permute(0, 2, 1)
multihead_keys.shape

torch.Size([8, 8, 16])

In [None]:
multihead_keys[2, 1] # index: [2nd attention head, 2nd key]

tensor([-1.9619, -0.7701, -0.7280, -1.6840, -1.0801, -1.6778,  0.6763,  0.6547,
         1.4445, -2.7016, -1.1364, -1.1204, -2.4430, -0.5982, -0.8292, -1.4401])

In [None]:
multihead_values = torch.matmul(multihead_U_value, stacked_inputs)
multihead_values = multihead_values.permute(0, 2, 1)

In [None]:
multihead_z_2 = torch.rand(8, 16)

In [None]:
linear = torch.nn.Linear(8*16, 16)
context_vector_2 = linear(multihead_z_2.flatten())
context_vector_2.shape

torch.Size([16])

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')

In [None]:
set_seed(123)
print(generator("This is going to be the best suspense thiriller story you have read. It was a rainy night. A lady was walking down the street. The sound of thunders scared her. But she kept walking Little did she know what she was about to witness.",max_length=1000, num_return_sequences=1)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This is going to be the best suspense thiriller story you have read. It was a rainy night. A lady was walking down the street. The sound of thunders scared her. But she kept walking Little did she know what she was about to witness.

The next evening, a little girl was walking down the street and there was a dog in a black kennel coat that caught her attention. The dog wasn't so bad that it was actually so happy for its friend. When it noticed, it called to its friend and called it a dog. The dog gave it back to the dog, but the dog hadn't seen what its friend knew.

The dog told the child's friend some nasty lies about what his friend was wearing and so her friend kept walking away. The dog called their friend "little f***ing dog."

So now that she's here in a home, her friend is scared the dog will come out and she'll have to keep walking because what the dog might think is this lady is walking around with her pink pheromone. But it doesn't matter how she acts that dog will not come 

In [None]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
text = "Let us encode this sentence"
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input

{'input_ids': tensor([[ 5756,   514, 37773,   428,  6827]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [None]:
from transformers import GPT2Model
model = GPT2Model.from_pretrained('gpt2')
output = model(**encoded_input)
output['last_hidden_state'].shape

torch.Size([1, 5, 768])

In [1]:
!pip install transformers
!pip install transformers[torch]
!pip install datasets



In [2]:
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [3]:
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_EPOCHS = 3

In [4]:
url = ("https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz")
filename = url.split("/")[-1]

with open(filename, "wb") as f:
  r = requests.get(url)
  f.write(r.content)

with gzip.open('movie_data.csv.gz', 'rb') as f_in:
  with open('movie_data.csv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [5]:
df = pd.read_csv('movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [8]:
df.shape

(50000, 2)

In [9]:
train_texts = df.iloc[:35000]['review'].values
train_labels = df.iloc[:35000]['sentiment'].values

valid_texts = df.iloc[35000:40000]['review'].values
valid_labels = df.iloc[35000:40000]['sentiment'].values

test_texts = df.iloc[40000:]['review'].values
test_labels = df.iloc[40000:]['sentiment'].values

In [10]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
train_encodings = tokenizer(list(train_texts), truncation = True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation = True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [12]:
train_encodings[1]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

Dataset Class and Loaders

In [13]:
class IMDbDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)


train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)


In [14]:
# for i in range(len(train_dataset)):
#     item = train_dataset[i]
#     if item is None:
#         print("Item at index", i, "is None.")

type(train_dataset[0])

dict

In [23]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def compute_accuracy(model, data_loader, device):
  with torch.no_grad():
    correct_pred, num_examples = 0, 0

    for batch_idx, batch in enumerate(data_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask)
      logits = outputs['logits']
      predicted_labels = torch.argmax(logits, 1)
      num_examples += labels.size(0)
      correct_pred += (predicted_labels == labels).sum()
    return correct_pred.float() / num_examples * 100

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']

        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')

    model.eval()

    with torch.set_grad_enabled(False):
        print(f'Training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nValid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0003 | Batch 0000/2188 | Loss: 0.7094
Epoch: 0001/0003 | Batch 0250/2188 | Loss: 0.1164
Epoch: 0001/0003 | Batch 0500/2188 | Loss: 0.2028
Epoch: 0001/0003 | Batch 0750/2188 | Loss: 0.0958
Epoch: 0001/0003 | Batch 1000/2188 | Loss: 0.2632
Epoch: 0001/0003 | Batch 1250/2188 | Loss: 0.4285
Epoch: 0001/0003 | Batch 1500/2188 | Loss: 0.0519
Epoch: 0001/0003 | Batch 1750/2188 | Loss: 0.1087
Epoch: 0001/0003 | Batch 2000/2188 | Loss: 0.0609
Training accuracy: 96.31%
Valid accuracy: 92.64%
Time elapsed: 38.62 min
Epoch: 0002/0003 | Batch 0000/2188 | Loss: 0.0478
Epoch: 0002/0003 | Batch 0250/2188 | Loss: 0.0169
Epoch: 0002/0003 | Batch 0500/2188 | Loss: 0.0925
Epoch: 0002/0003 | Batch 0750/2188 | Loss: 0.1275
Epoch: 0002/0003 | Batch 1000/2188 | Loss: 0.1271
Epoch: 0002/0003 | Batch 1250/2188 | Loss: 0.2720
Epoch: 0002/0003 | Batch 1500/2188 | Loss: 0.4965
Epoch: 0002/0003 | Batch 1750/2188 | Loss: 0.2039
Epoch: 0002/0003 | Batch 2000/2188 | Loss: 0.1471
Training accuracy: 98.97%
V

In [25]:
torch.save(model.state_dict(),"distilbert-base-uncased-imdbtuned.pt")

In [None]:
del model # free memory

In [6]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train();

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import Trainer, TrainingArguments


optim = torch.optim.Adam(model.parameters(), lr=5e-5)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [16]:
# install dataset via pip install datasets
from datasets import load_metric
import numpy as np


metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred # logits are a numpy array, not pytorch tensor
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
               predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [17]:
optim = torch.optim.Adam(model.parameters(), lr=5e-5)


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optim, None) # optimizer and learning rate scheduler
)

# force model to only use 1 GPU (even if multiple are availabe)
# to compare more fairly to previous code

trainer.args._n_gpu = 1


In [18]:
start_time = time.time()
trainer.train()
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')

Step,Training Loss
10,0.6924
20,0.6624
30,0.5704
40,0.3826
50,0.3745
60,0.4078
70,0.3742
80,0.3157
90,0.2539
100,0.3822


Total Training Time: 88.55 min


In [19]:
trainer.evaluate()

{'eval_loss': 0.302387535572052,
 'eval_accuracy': 0.9356,
 'eval_runtime': 183.5354,
 'eval_samples_per_second': 54.485,
 'eval_steps_per_second': 3.405,
 'epoch': 3.0}

In [24]:
model.eval()
model.to(DEVICE)
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Test accuracy: 93.56%


In [66]:

sentence_1 = "I really liked this movie"
sentence_2 = "the acting was terrible in this movie."

test_encodings = tokenizer(sentence_2, truncation=True, padding=True, return_tensors="pt")


In [67]:
test_encodings.to(DEVICE)

{'input_ids': tensor([[ 101, 1996, 3772, 2001, 6659, 1999, 2023, 3185, 1012,  102]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [76]:
def inference(encoding):
  with torch.no_grad():
    #logits = model(**encoding).logits
    input_ids = encoding['input_ids'].to(DEVICE)
    attention_mask = encoding['attention_mask'].to(DEVICE)
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs['logits']
    predicted_label = torch.argmax(logits, 1).item()
    return predicted_label


In [77]:
inference(test_encodings)

tensor([0], device='cuda:0')