In [None]:
import math
import torch
import torch.nn as nn

In [None]:
%matplotlib inline
import sys
import copy
import math
import numpy as np
import os
import random
import torch
import torch.nn as nn
import unittest

from collections import Counter
from datetime import datetime
from torch.utils.data import Dataset, DataLoader

def set_seed(seed):
  """Seed Python's `random`, NumPy's global RNG and PyTorch with the same value.

  Args:
    seed: integer handed unchanged to each of the three seeding functions.
  """
  # Same order as before: random, then numpy, then torch.
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)


set_seed(69)

In [None]:
# Record the interpreter and PyTorch versions this notebook was run with.
print(f'python: {sys.version}, torch: {torch.__version__}')

python: 3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0], torch: 2.0.0+cu118


In [None]:
# Fail fast with a readable message when PyTorch cannot see a CUDA device.
assert torch.cuda.is_available(), 'GPU unavailable'

In [None]:
# Load the Colab Drive helper and mount Drive at /content/drive.
# You will have to authorize this operation interactively.
# The data-loading cell below reads its files from underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [None]:
def load_data(filename, data_dir='/content/drive/My Drive/data/quora_data/'):
  """Read a tab-separated file whose rows are `text<TAB>label`.

  The first line is treated as a header and skipped. Blank lines are ignored.
  The text column is already tokenized, so it is split on whitespace only.

  Args:
    filename: name of the file inside `data_dir`.
    data_dir: directory that holds the file. Defaults to the Drive folder this
      notebook was written against.

  Returns:
    A tuple `(data, balance, lengths)` where
      data: list of `(tokens, label)` pairs; `tokens` is a list of str, `label` an int.
      balance: percentage (0-100) of examples whose label is 1; 0.0 when there are no rows.
      lengths: list with the token count of every example, in file order.

  Raises:
    ValueError: if a non-blank row does not have exactly two tab-separated
      fields, or its label is not an integer.
  """
  data = []
  with open(os.path.join(data_dir, filename), encoding='utf-8') as f:
    f.readline()  # Skip the header row.
    for line in f:
      if not line.strip():
        continue  # Tolerate blank lines, e.g. a stray newline at the end of the file.
      text, label = line.split('\t')
      data.append((text.split(), int(label)))
  # Count positives without relying on a global `_` (which only exists inside IPython).
  num_positive = sum(1 for _, y in data if y == 1)
  balance = num_positive / len(data) * 100. if data else 0.
  return data, balance, [len(x) for x, _ in data]

# Load both labelled splits. Each call returns the examples, the percentage of
# positive labels, and the per-sentence token counts.
train_data, balance_train, lengths_train = load_data('train.tsv')
val_data, balance_val, lengths_val = load_data('dev.tsv')

# Split sizes and class balance.
print(f'{len(train_data)} train examples ({balance_train:.1f}% positive)')
print(f'{len(val_data)} val examples ({balance_val:.1f}% positive)')
print('No test labels released\n')

# Token-count statistics for each split.
print('\nSentence lengths')
print(f'  Train: average {sum(lengths_train) / len(lengths_train):5.1f}, max {max(lengths_train)}, min {min(lengths_train)}')
print(f'  Val:   average {sum(lengths_val) / len(lengths_val):5.1f}, max {max(lengths_val)}, min {min(lengths_val)}')

FileNotFoundError: ignored

In [None]:
class BinaryCrossEntropyLossFunction(torch.autograd.Function):
  """Summed binary cross-entropy on raw logits with a hand-written backward pass."""

  @staticmethod
  def forward(ctx, logits, labels):
    """Return the sum over all elements of the binary cross-entropy.

    Args:
      ctx: autograd context used to stash tensors for `backward`.
      logits: tensor of unnormalized scores.
      labels: tensor of targets broadcastable against `logits` (0/1 in this notebook).

    Returns:
      A scalar tensor: the summed loss.
    """
    probs = torch.sigmoid(logits)
    ctx.save_for_backward(probs, labels)  # Backward only needs the probabilities and labels.
    # With p = sigmoid(x), -[y*log(p) + (1-y)*log(1-p)] equals
    #   max(x, 0) - x*y + log(1 + exp(-|x|)).
    # This form never evaluates log(0), so saturated logits (e.g. x = 100, y = 1) give a
    # finite loss instead of the 0 * -inf = NaN produced by masking log(p) / log(1 - p).
    losses = logits.clamp(min=0) - logits * labels + torch.log1p(torch.exp(-logits.abs()))
    return losses.sum()

  @staticmethod
  def backward(ctx, grad_output):
    """Propagate `grad_output` to the logits; labels receive no gradient.

    Returns:
      `(grad_logits, None)`, where `grad_logits = grad_output * (sigmoid(logits) - labels)`.
    """
    probs, labels = ctx.saved_tensors
    # d(loss)/d(logit_i) = sigmoid(logit_i) - label_i for the summed loss.
    grad_logits = grad_output * (probs - labels)
    return grad_logits, None  # No need to calculate gradient wrt labels
    
    
class BinaryCrossEntropyLoss(nn.Module):
  """nn.Module wrapper around BinaryCrossEntropyLossFunction."""

  def forward(self, logits, labels):
    """Apply the custom autograd function to `logits` and `labels` and return its result."""
    loss_fn = BinaryCrossEntropyLossFunction.apply
    return loss_fn(logits, labels)

In [None]:
class TestBinaryCrossEntropyLoss(unittest.TestCase):
  """Compares BinaryCrossEntropyLoss with torch.nn.BCEWithLogitsLoss(reduction='sum')."""

  def setUp(self):
    # Fixture: one batch of random logits with random 0/1 labels.
    self.places = 6
    self.batch_size = 42
    self.logits = np.random.randn(self.batch_size)
    self.labels = np.random.randint(2, size=self.batch_size)
    self.gold = torch.nn.BCEWithLogitsLoss(reduction='sum')
    self.mine = BinaryCrossEntropyLoss()

  def _loss_and_grad(self, layer):
    """Run forward and backward through `layer`; return (loss value, gradient wrt logits as a list)."""
    variables = torch.tensor(self.logits, requires_grad=True)
    targets = torch.tensor(self.labels).float()

    # The final node is a scalar division, so the layer under test receives its
    # incoming gradient from that division node.
    loss_node = layer(variables, targets) / self.batch_size
    loss_node.backward()

    return loss_node.item(), variables.grad.tolist()

  def test_forward_backward(self):
    loss, grad = self._loss_and_grad(self.mine)
    loss_gold, grad_gold = self._loss_and_grad(self.gold)
    self.assertAlmostEqual(loss, loss_gold, places=self.places)
    for grad_mine, grad_ref in zip(grad, grad_gold):
      self.assertAlmostEqual(grad_mine, grad_ref, places=self.places)
          
# unittest.main's first positional parameter is `module`, not a TestCase instance, so
# load this TestCase explicitly and run it with a text runner. Nothing here calls
# sys.exit, which keeps the kernel alive; the trailing `;` hides the result repr.
_bce_suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestBinaryCrossEntropyLoss)
unittest.TextTestRunner(verbosity=2).run(_bce_suite);

test_forward_backward (__main__.TestBinaryCrossEntropyLoss) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.142s

OK


<unittest.main.TestProgram at 0x7ff31ba2ed00>

In [None]:
# nn.Dropout demo. Modules start in train mode, so dropout is active on the first call.
dropper = nn.Dropout(0.75)
x = torch.randn(10)


def _fmt(values):
  """Join the entries of `values` with spaces, each formatted with two decimals."""
  return ' '.join('{:.2f}'.format(v) for v in values)


print('x:          ({})'.format(_fmt(x)))
print('dropper(x): ({})'.format(_fmt(dropper(x))))  # Survivors multiplied by 4 (= 1 / (1 - 0.75))

dropper.eval()  # In eval mode the layer passes its input through unchanged.
print('At eval:    ({})'.format(_fmt(dropper(x))))

x:          (-0.53 -2.60 -0.64 1.44 -0.58 -0.73 1.61 -0.07 0.48 -0.33)
dropper(x): (-0.00 -0.00 -2.58 0.00 -0.00 -2.90 6.44 -0.00 0.00 -0.00)
At eval:    (-0.53 -2.60 -0.64 1.44 -0.58 -0.73 1.61 -0.07 0.48 -0.33)


In [None]:
class preMHA(nn.Module):
  """Linear projection that splits the feature dimension into attention heads.

  Args:
    d_model: size of the last dimension of the input.
    heads: number of attention heads.
    d_k: number of features per head.
    bias: whether the linear projection has a bias term.
  """

  def __init__(self, d_model, heads, d_k, bias):
    super().__init__()
    self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
    self.heads = heads
    self.d_k = d_k

  def forward(self, x):
    """Project `x` and reshape its last dimension into `(heads, d_k)`.

    Args:
      x: tensor of shape `(..., d_model)`.

    Returns:
      Tensor of shape `(..., heads, d_k)`.
    """
    leading_shape = x.shape[:-1]
    projected = self.linear(x)
    return projected.reshape(*leading_shape, self.heads, self.d_k)


class MHA(nn.Module):
  """Multi-head scaled dot-product attention over sequence-first tensors.

  Args:
    heads: number of attention heads.
    d_model: feature size of the inputs; must be divisible by `heads`.
    dropout_prob: dropout probability applied to the attention weights.
    bias: whether the query and key projections use a bias term.

  Raises:
    ValueError: if `d_model` is not divisible by `heads`.
  """

  def __init__(self, heads,  d_model, dropout_prob = 0.1, bias = True):
    super().__init__()
    if d_model % heads != 0:
      # Otherwise heads * d_k != d_model and the output projection could not consume
      # the concatenated heads.
      raise ValueError('d_model ({}) must be divisible by heads ({})'.format(d_model, heads))
    self.d_k = d_model // heads
    self.heads = heads
    self.query = preMHA(d_model, heads, self.d_k, bias=bias)
    self.key = preMHA(d_model, heads, self.d_k, bias=bias)
    # The value projection always has a bias, independent of the `bias` argument.
    self.value = preMHA(d_model, heads, self.d_k, bias=True)
    # Scores are laid out as (query_pos, key_pos, batch, head); normalize over key positions.
    self.softmax = nn.Softmax(dim=1)
    self.output = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout_prob)
    self.scale = 1 / math.sqrt(self.d_k)
    # Placeholder attribute; nothing in this class writes to it.
    self.attn = None

  def get_scores(self, query, key):
    """Dot product of every query with every key, per batch element and head.

    Args:
      query: tensor of shape `(seq_q, batch, heads, d_k)`.
      key: tensor of shape `(seq_k, batch, heads, d_k)`.

    Returns:
      Tensor of shape `(seq_q, seq_k, batch, heads)`.
    """
    return torch.einsum('ibhd,jbhd->ijbh', query, key)

  def prepare_mask(self, mask, query_shape, key_shape):
    """Validate `mask` and append a head dimension so it broadcasts over the scores.

    Args:
      mask: tensor of shape `(seq_q or 1, seq_k, batch or 1)`.
      query_shape: shape of the unprojected query, `(seq_q, batch, d_model)`.
      key_shape: shape of the unprojected key, `(seq_k, batch, d_model)`.

    Returns:
      `mask` with a trailing singleton dimension: `(seq_q or 1, seq_k, batch or 1, 1)`.

    Raises:
      ValueError: if `mask` does not have one of the shapes described above.
    """
    if mask.dim() != 3:
      raise ValueError('mask must be 3-dimensional, got shape {}'.format(tuple(mask.shape)))
    if mask.shape[0] not in (1, query_shape[0]):
      raise ValueError('mask.shape[0] must be 1 or the query length {}'.format(query_shape[0]))
    if mask.shape[1] != key_shape[0]:
      raise ValueError('mask.shape[1] must equal the key length {}'.format(key_shape[0]))
    if mask.shape[2] not in (1, query_shape[1]):
      raise ValueError('mask.shape[2] must be 1 or the batch size {}'.format(query_shape[1]))
    return mask.unsqueeze(-1)

  def forward(self, *, query, key, value, mask = None):
    """Compute multi-head attention.

    Args:
      query: tensor of shape `(seq_q, batch, d_model)`.
      key: tensor of shape `(seq_k, batch, d_model)`.
      value: tensor of shape `(seq_k, batch, d_model)`.
      mask: optional tensor accepted by `prepare_mask`; positions where it equals 0
        are excluded from attention.

    Returns:
      Tensor of shape `(seq_q, batch, d_model)`.
    """
    seq_len, batch_size, _ = query.shape
    if mask is not None:
      mask = self.prepare_mask(mask, query.shape, key.shape)
    query = self.query(query)
    key = self.key(key)
    value = self.value(value)
    scores = self.get_scores(query, key) * self.scale
    if mask is not None:
      scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = self.dropout(self.softmax(scores))
    # Weighted sum of the values, then merge the (heads, d_k) dimensions back together.
    x = torch.einsum('ijbh,jbhd->ibhd', attn, value)
    x = x.reshape(seq_len, batch_size, -1)
    return self.output(x)

In [None]:
def get_slopes(n_heads):
  """Return one slope per attention head as a 1-D tensor of length `n_heads`.

  With `n` the largest power of two not exceeding `n_heads`, the first `n` slopes
  are `(2 ** (-8 / n)) ** k` for `k = 1..n`. Any remaining heads get
  `(2 ** (-4 / n)) ** k` for the odd `k = 1, 3, 5, ...`.
  """
  pow2 = 2 ** math.floor(math.log2(n_heads))
  base = 2.0 ** (-8.0 / pow2)
  slopes = torch.pow(base, torch.arange(1, pow2 + 1))
  if pow2 >= n_heads:
    return slopes
  # n_heads is not a power of two: add slopes for the leftover heads.
  extra_base = 2.0 ** (-4.0 / pow2)
  odd_steps = torch.arange(1, 1 + 2 * (n_heads - pow2), 2)
  return torch.cat([slopes, torch.pow(extra_base, odd_steps)])

In [None]:
@torch.no_grad()
def get_alibi_biases(n_heads: int, mask: torch.Tensor):
  """Scale the running sum of `mask` along its last dimension by each head's slope.

  Args:
    n_heads: number of heads, i.e. the size of the last dimension of the result.
    mask: tensor with at least two dimensions; the caller in this notebook passes a
      2-D (query position, key position) slice.

  Returns:
    For a 2-D `mask`, a tensor of shape `(mask.shape[0], mask.shape[1], n_heads)`.
  """
  slopes = get_slopes(n_heads).to(mask.device)
  running_sum = mask.cumsum(dim=-1)
  # Insert a head axis at position 2 and broadcast against the per-head slopes.
  return running_sum.unsqueeze(2) * slopes.unsqueeze(0).unsqueeze(0)

In [None]:
class AlibiMHA(MHA):
  """Multi-head attention that adds per-head linear biases to the attention scores.

  Args:
    heads: number of attention heads.
    d_model: feature size of the inputs.
    dropout_prob: dropout probability applied to the attention weights.
  """

  def __init__(self, heads, d_model, dropout_prob = 0.1):
    super().__init__(heads, d_model, dropout_prob)
    # Cache of biases, computed lazily from the first mask seen.
    self.alibi_biases = None

  def forward(self, *, query, key, value, mask = None):
    """Compute attention with the cached biases added to the scaled scores.

    Args:
      query: tensor of shape `(seq_len, batch, d_model)`.
      key: tensor passed through the key projection.
      value: tensor passed through the value projection.
      mask: required despite the `None` default, because the biases are derived
        from it. Positions where it equals 0 are excluded from attention.

    Returns:
      Tensor of shape `(seq_len, batch, d_model)`.

    Raises:
      ValueError: if `mask` is None.

    NOTE(review): the cache is only refreshed when it is too short or on another
    device. A later call with a different mask of the same size reuses the old
    biases - confirm this is intended.
    """
    if mask is None:
      raise ValueError('AlibiMHA.forward requires a mask: the biases are computed from it')
    seq_len, batch_size, _ = query.shape
    mask = self.prepare_mask(mask, query.shape, key.shape)
    query = self.query(query)
    key = self.key(key)
    value = self.value(value)
    scores = self.get_scores(query, key)
    scores *= self.scale
    needs_refresh = (
        self.alibi_biases is None
        or self.alibi_biases.shape[1] < seq_len
        or self.alibi_biases.device != mask.device  # Module or inputs moved since caching.
    )
    if needs_refresh:
      # scores.shape[-1] is the number of heads; the biases come from the mask of
      # batch element 0.
      self.alibi_biases = get_alibi_biases(scores.shape[-1], mask[:, :, 0, 0])
    # Add a singleton batch axis so the biases broadcast over the batch.
    scores += self.alibi_biases[:seq_len, :seq_len, None, :]
    scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = self.softmax(scores)
    attn = self.dropout(attn)
    x = torch.einsum("ijbh,jbhd->ibhd", attn, value)
    x = x.reshape(seq_len, batch_size, -1)
    return self.output(x)

In [None]:
def count_params(model):
  """Return the total number of elements across everything in `model.parameters()`."""
  total = 0
  for param in model.parameters():
    total += param.numel()
  return total

# Build a small attention layer (16 heads, d_model 32, dropout 0.1) and inspect it.
model = AlibiMHA(16, 32, 0.1)
print('Model has {} parameters\n'.format(count_params(model)))
print(model)
print()

# AlibiMHA has no `score` submodule (its children are query, key, value, softmax,
# output and dropout), so show the output projection's weights instead.
print('First few values of the output layer\'s weight vector')
print(model.output.weight.data[0][:10])

Model has 4224 parameters

AlibiMHA(
  (query): preMHA(
    (linear): Linear(in_features=32, out_features=32, bias=True)
  )
  (key): preMHA(
    (linear): Linear(in_features=32, out_features=32, bias=True)
  )
  (value): preMHA(
    (linear): Linear(in_features=32, out_features=32, bias=True)
  )
  (softmax): Softmax(dim=1)
  (output): Linear(in_features=32, out_features=32, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

First few values of the score layer's weight vector


AttributeError: ignored

In [None]:
PAD = '<pad>'
UNK = '<unk>'

# Token frequencies over the training split only.
vocab = Counter(tok for toks, _ in train_data for tok in toks)
assert PAD not in vocab
assert UNK not in vocab
# Very large artificial counts put the two special tokens first in most_common();
# the assert after the truncation checks that PAD really landed at index 0.
vocab[PAD] = 9999999  # PAD will get index 0
vocab[UNK] = 9999998  # UNK will get index 1
vocab_size = 10000
vocab = [word for word, _ in vocab.most_common(vocab_size)]
assert vocab[0] == PAD
print(f'Vocab size: {len(vocab)} (with PAD and UNK added)')
print('vocab[0]:', vocab[0])
print('vocab[1]:', vocab[1])

# Word -> index lookup; out-of-vocabulary tokens map to the UNK index.
w2i = {word: i for i, word in enumerate(vocab)}

# Note that we're preserving word ordering.
train_sents = [[w2i.get(tok, w2i[UNK]) for tok in x] for x, _ in train_data]
val_sents = [[w2i.get(tok, w2i[UNK]) for tok in x] for x, _ in val_data]

NameError: ignored

In [None]:
class Quora(Dataset):  # A child class of torch.utils.data.Dataset
  """Dataset of index-encoded sentences, each right-padded with zeros to a fixed length.

  Args:
    sents: list of sentences, each a list of integer token ids.
    labels: list of labels aligned with `sents`.
    max_length: length every sentence is padded to.
  """

  def __init__(self, sents, labels, max_length):
    self.sents = sents
    self.labels = labels
    self.max_length = max_length

  def __len__(self):  # This defines the "size" of the dataset.
    return len(self.sents)

  def __getitem__(self, index):  # This returns a single indexed example.
    """Return `(padded_ids, label, original_length)` for example `index`.

    Returns:
      padded_ids: int64 tensor of shape `(max_length,)`; positions past the sentence are 0.
      label: tensor built from `labels[index]`.
      original_length: number of tokens before padding.

    Raises:
      ValueError: if the sentence is longer than `max_length`.
    """
    # Build ids and padding as int64 directly. Concatenating int64 ids with float32
    # zeros promotes the ids to float32, which corrupts values above 2**24.
    sent = torch.tensor(self.sents[index], dtype=torch.long)
    num_pad = self.max_length - len(sent)
    if num_pad < 0:
      raise ValueError('sentence {} has {} tokens, more than max_length={}'.format(
          index, len(sent), self.max_length))
    sent_padded = torch.cat([sent, torch.zeros(num_pad, dtype=torch.long)])
    label = torch.tensor(self.labels[index])
    return sent_padded, label, len(sent)  # Since we've padded, we need to inform the original length.

# Wrap each split in a Dataset; every split is padded to its own longest sentence.
train_labels = [label for _, label in train_data]
dataset_train = Quora(train_sents, train_labels, max(lengths_train))
val_labels = [label for _, label in val_data]
dataset_val = Quora(val_sents, val_labels, max(lengths_val))

# Peek at the first training example: padded ids, their size, the label, the true length.
x1, y1, length1 = dataset_train[0]
print(x1, x1.size(), y1, length1)

NameError: ignored