In [1]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [2]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_k, d_model, n_heads):
    super().__init__()

    # Assume d_v=d_k
    self.d_k = d_k
    self.n_heads = n_heads

    # ALL the attentions all at once with n_heads
    self.key = nn.Linear(d_model, d_k*n_heads)
    self.query = nn.Linear(d_model, d_k*n_heads)
    self.value = nn.Linear(d_model, d_k*n_heads)

    # final linear layer
    self.fc = nn.Linear(d_k * n_heads, d_model)

  def forward(self, q, k, v, mask=None):
    q = self.query(q) # N x T x (hd_k)
    k = self.key(k) # N x T x (hd_k)
    v = self.value(v) # N x T x (hd_v)

    N = q.shape[0] #sample
    T = q.shape[1] #sequence

    # change the shape to:
    # (N, T, h, d_k) -> (N, h, T, d_k)
    # in order for matrix multiply to work properly
    q = q.view(N, T, self.n_heads, self.d_k).transpose(1,2)
    k = k.view(N, T, self.n_heads, self.d_k).transpose(1,2)
    v = v.view(N, T, self.n_heads, self.d_k).transpose(1,2)

    #compute attention weights
    #(N, h, T, d_k) x (N, h, d_k, T) --> (N, h, T, T)
    # scaling score  = query * key Transpose/ square root of(dimension)
    attn_scores = q @ k.transpose(-2,-1)/ math.sqrt(self.d_k)

    #we mask
    if mask is not None:
      attn_scores = attn_scores.masked_fill(
          #mask:(N,T)-> mask[:, None, None, :] -> mask:(N,1,1,T)
          #this allows us to broadcast correctly
          mask[:, None, None, :] == 0, float('-inf')
      )

    #attention weights
    attn_weights = F.softmax(attn_scores, dim=-1)

    #compute attention weights-weighted values
    # (N, h, T, T) X (N, h, T, d_k) --> (N, h, T, d_k)
    A = attn_weights @ v

    #reshape it back before final linear layer
    A = A.transpose(1,2) # (N, h, T, d_k) --> (N, T, h, d_k)
    #contiguous allows us to set our values correctly in memory
    A = A.contiguous().view(N, T, self.d_k * self.n_heads) #(N, T, h*d_k)

    #projection
    return self.fc(A)

In [3]:
class TransformerBlock(nn.Module):
  def __init__(self, d_k, d_model, n_heads, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_k, d_model, n_heads)
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model *4),
        nn.GELU(),
        nn.Linear(d_model * 4, d_model),
        nn.Dropout(dropout_prob),
    )
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, mask=None):
    x = self.ln1(x + self.mha(x,x,x,mask))
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)
    return x

In [4]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)
    #equations
    #PE(pos,2i) = sin(pos/10000^2i/dmodel)
    #PE(pos, 2i + 1) = cos(pos/10000^2i/dmodel)

    #arange goes from 0 to max lenght
    position = torch.arange(max_len).unsqueeze(1) #Pos
    exp_term = torch.arange(0, d_model, 2) #2i
    div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model)) #10000^-2i/dmodel
    pe = torch.zeros(1, max_len, d_model) #(1, T, D) to brodcast to (N, T, D)
    pe[0, :, 0::2] = torch.sin(position * div_term) #PE(pos,2i) = sin(pos/10000^2i/dmodel)
    pe[0, :, 1::2] = torch.cos(position * div_term) #PE(pos, 2i + 1) = cos(pos/10000^2i/dmodel)
    self.register_buffer("pe", pe) # save and load correctly register and does not required gradient

  def forward(self, x):
    # x.shape: N x T x D
    x = x + self.pe[:, :x.size(1), :] #accessing register buffer
    return self.dropout(x)


In [5]:
class Encoder(nn.Module):
  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               n_classes,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    transformer_blocks = [
        TransformerBlock(
            d_k,
            d_model,
            n_heads,
            dropout_prob
        ) for _ in range(n_layers)]

    self.transformer_blocks = nn.Sequential(*transformer_blocks) #encapsulate in sequential
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, n_classes) #outputs n_classes


  def forward(self, x, mask=None):
    x = self.embedding(x)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, mask)

    #many-to-one (x has the shape N x T x D)
    x = x[:, 0, :] #x: (N x T x D) --> x: (N X D) single output

    x = self.ln(x)
    x = self.fc(x)

    return x

In [6]:
# vocab size= 20_000,
# max_length of T = 1024,
# dimensionality of keys d_k = 16
# dimensionality of model d_model = 64
# number of attention heads n_heads = 4
# layers of transformers n_layers = 2
# number of classes to identify n_classes = 5
# dropout probability = 0.1
model = Encoder(20_000, 1024, 16, 64,4,2,5,0.1)

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda:0


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [8]:
x = np.random.randint(0, 20_000, size=(8, 512))
x_t = torch.tensor(x).to(device)

In [9]:
mask = np.ones((8, 512))
mask[:, 256:] = 0
mask_t = torch.tensor(mask).to(device)

In [10]:
y = model(x_t, mask_t)

In [11]:
y.shape

torch.Size([8, 5])

In [12]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [13]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [14]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [15]:
from datasets import load_dataset

In [16]:
raw_datasets = load_dataset("glue", "sst2")

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [17]:
raw_datasets["train"]["sentence"]

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ',
 'remains utterly satisfied to remain the same throughout ',
 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 "that 's far too tragic to merit such superficial treatment ",
 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
 'of saucy ',
 "a depressed fifteen-year-old 's suicidal poetry ",
 "are more deeply thought through than in most ` right-thinking ' films ",
 'goes to absurd lengths ',
 "for those moviegoers who complain that ` they do n't make movies like they used to anymore ",
 "the part where nothing 's happening , ",
 'saw how bad this movie was ',
 'lend some dignity to a dumb story ',
 'the greatest musicians ',
 'cold movie ',
 'with his usual intelligence and s

In [18]:
def tokenizer_fn(batch):
  return tokenizer(batch["sentence"], truncation=True)

In [19]:
tokenized_datasets = raw_datasets.map(tokenizer_fn, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [20]:
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [22]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [23]:
tokenized_datasets["train"]["input_ids"][0]

[101, 4750, 1207, 3318, 5266, 1121, 1103, 22467, 2338, 102]

In [24]:
tokenized_datasets["train"]["labels"][0:8]

[0, 0, 1, 0, 0, 0, 1, 1]

In [25]:
from torch.utils.data import DataLoader

In [26]:
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size = 32,
    collate_fn = data_collator
)

valid_loader = DataLoader(
    tokenized_datasets["validation"],
    batch_size = 32,
    collate_fn = data_collator
)

In [27]:
#check how it works
for batch in train_loader:
  for k, v in batch.items():
    print("k:", k, 'v.shape', v.shape)
  break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


k: labels v.shape torch.Size([32])
k: input_ids v.shape torch.Size([32, 35])
k: attention_mask v.shape torch.Size([32, 35])


In [28]:
set(tokenized_datasets["train"]["labels"])

{0, 1}

In [29]:
tokenizer.vocab_size

28996

In [30]:
tokenizer.max_model_input_sizes

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [31]:
model = Encoder(
    vocab_size = tokenizer.vocab_size,
    max_len = tokenizer.max_model_input_sizes[checkpoint],
    d_k = 16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    n_classes=2,
    dropout_prob=0.1
)
model.to(device)

Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [32]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [33]:
from datetime import datetime

In [34]:
# A function to encapsulate the training loop
def train(model, critetion, optimizer, train_loader, valid_loader, epochs):
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = 0
    n_train = 0
    for batch in train_loader:
      #move data to GPU
      batch = {k: v.to(device) for k,v in batch.items()}

      #zero the parameter gradients
      optimizer.zero_grad()

      #forward pass
      outputs = model(batch["input_ids"], batch["attention_mask"])
      loss = criterion(outputs, batch["labels"])

      #backward and optimize
      loss.backward()
      optimizer.step()

      train_loss += loss.item()*batch['input_ids'].size(0)
      n_train += batch["input_ids"].size(0)

    # Get average train loss
    train_loss = train_loss / n_train

    model.eval()
    test_loss = 0
    n_test = 0
    for batch in valid_loader:
      batch = {k: v.to(device) for k,v in batch.items()}
      outputs = model(batch["input_ids"], batch["attention_mask"])
      loss = criterion(outputs, batch["labels"])
      test_loss += loss.item()*batch['input_ids'].size(0)
      n_test += batch["input_ids"].size(0)
    test_loss = test_loss / n_test

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now()- t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: { test_loss:.4f}, Duration: {dt}' )

  return train_losses, test_losses


In [37]:
train_losses, test_losses = train(
    model, criterion, optimizer, train_loader, valid_loader, epochs=20
)

Epoch 1/20, Train Loss: 0.2286,       Test Loss: 0.4717, Duration: 0:00:24.988787
Epoch 2/20, Train Loss: 0.2071,       Test Loss: 0.5251, Duration: 0:00:18.634197
Epoch 3/20, Train Loss: 0.1910,       Test Loss: 0.4949, Duration: 0:00:19.860045
Epoch 4/20, Train Loss: 0.1769,       Test Loss: 0.5581, Duration: 0:00:18.812810
Epoch 5/20, Train Loss: 0.1624,       Test Loss: 0.5672, Duration: 0:00:19.744400
Epoch 6/20, Train Loss: 0.1541,       Test Loss: 0.6374, Duration: 0:00:18.960018
Epoch 7/20, Train Loss: 0.1454,       Test Loss: 0.6076, Duration: 0:00:23.499815
Epoch 8/20, Train Loss: 0.1376,       Test Loss: 0.6165, Duration: 0:00:21.510836
Epoch 9/20, Train Loss: 0.1294,       Test Loss: 0.5704, Duration: 0:00:19.482703
Epoch 10/20, Train Loss: 0.1233,       Test Loss: 0.7565, Duration: 0:00:19.259295
Epoch 11/20, Train Loss: 0.1154,       Test Loss: 0.6096, Duration: 0:00:18.791876
Epoch 12/20, Train Loss: 0.1107,       Test Loss: 0.7124, Duration: 0:00:20.022531
Epoch 13/20, 

In [38]:
model.eval()
n_correct = 0.
n_total = 0.
for batch in train_loader:
  #move to GPU
  batch = {k: v.to(device) for k,v in batch.items()}

  # Forward pass
  outputs = model(batch["input_ids"], batch["attention_mask"])

  #Get prediction
  # torch.max returns both max and argmax
  _, predictions = torch.max(outputs, 1)

  #update counts
  n_correct += (predictions ==batch["labels"]).sum().item()
  n_total += batch["labels"].shape[0]

train_acc = n_correct / n_total

n_correct = 0
n_total = 0
for batch in valid_loader:

  #move to GPU
  batch = {k:v.to(device) for k,v in batch.items()}

  #forward pass
  outputs = model(batch["input_ids"], batch["attention_mask"])

  #Get predictions
  _, predictions = torch.max(outputs, 1)

  #update counts
  n_correct += (predictions == batch["labels"]).sum().item()
  n_total += batch["labels"].shape[0]

test_acc = n_correct / n_total

print(f" Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}")

 Train acc: 0.9868, Test acc: 0.7695


In [113]:
import random
for i in range(3):
  count = 0
  batch_idx = random.randint(0,len(valid_loader)-1)
  for batch in valid_loader:
    idx = random.randint(0,len(batch))

    if count == batch_idx:
      #move to GPU
      batch = {k:v.to(device) for k,v in batch.items()}

      idx = random.randint(0,len(batch))

      end_idx =tokenizer.convert_ids_to_tokens(batch["input_ids"][idx]).index("[SEP]")

      sentence = [token[2:] if token.startswith("#")
                  else " " +token
                  for token in tokenizer.convert_ids_to_tokens(batch["input_ids"][idx])[1: end_idx]]


        #forward pass
      outputs = model(batch["input_ids"], batch["attention_mask"])

      #Get predictions
      _, predictions = torch.max(outputs, 1)

      print(''.join(sentence)[1:])
      if predictions[idx].item() == 0:
        print("Negative\n")
      else:
        print("Positive\n")
      break

    else:
      count += 1








the quality of the art combined with the humor and intelligence of the script allow the filmmakers to present the biblical message of forgiveness without it ever becoming preachy or syrupy .
Positive

the magic of the film lies not in the mysterious spring but in the richness of its performances .
Positive

there is no pleasure in watching a child suffer .
Negative

